diff --git a/.gitignore b/.gitignore index f8e1a654..fac9db78 100644 --- a/.gitignore +++ b/.gitignore @@ -38,6 +38,7 @@ docs/_build # Environments env/ venv/ +.env # Pyenv files .python-version diff --git a/docs/conf.py b/docs/conf.py index 479cdee8..77990294 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,30 +32,30 @@ extensions = [] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'pycaption' -copyright = '2012-2026, PBS.org ' \ - '(available under the Apache License, Version 2.0)' +project = "pycaption" +copyright = "2012-2026, PBS.org " \ + "(available under the Apache License, Version 2.0)" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '2.2.20' +version = "2.2.20" # The full version, including alpha/beta/rc tags. -release = '2.2.20' +release = "2.2.20" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -69,7 +69,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -87,7 +87,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -131,7 +131,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -180,7 +180,7 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'pycaptiondoc' +htmlhelp_basename = "pycaptiondoc" # -- Options for LaTeX output --------------------------------------------- @@ -188,10 +188,8 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -200,8 +198,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', 'pycaption.tex', 'pycaption Documentation', - 'PBS', 'manual'), + ("index", "pycaption.tex", "pycaption Documentation", "PBS", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -229,10 +226,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'pycaption', 'pycaption Documentation', - ['PBS'], 1) -] +man_pages = [("index", "pycaption", "pycaption Documentation", ["PBS"], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -244,9 +238,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'pycaption', 'pycaption Documentation', - 'PBS', 'pycaption', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "pycaption", + "pycaption Documentation", + "PBS", + "pycaption", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. diff --git a/pycaption/__init__.py b/pycaption/__init__.py index adc9b501..3e60702f 100644 --- a/pycaption/__init__.py +++ b/pycaption/__init__.py @@ -1,30 +1,52 @@ -from .base import ( - CaptionConverter, CaptionNode, Caption, CaptionList, CaptionSet, +from .base import Caption, CaptionConverter, CaptionList, CaptionNode, CaptionSet +from .dfxp import DFXPReader, DFXPWriter +from .exceptions import ( + CaptionLineLengthError, + CaptionReadError, + CaptionReadNoCaptions, + CaptionReadSyntaxError, ) -from .dfxp import DFXPWriter, DFXPReader from .microdvd import MicroDVDReader, MicroDVDWriter from .sami import SAMIReader, SAMIWriter -from .srt import SRTReader, SRTWriter from .scc import SCCReader, SCCWriter from .scc.translator import translate_scc +from .srt import SRTReader, SRTWriter from .transcript import TranscriptWriter from .webvtt import WebVTTReader, WebVTTWriter -from .exceptions import ( - CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionLineLengthError -) - __all__ = [ - 'CaptionConverter', 'DFXPReader', 'DFXPWriter', 'MicroDVDReader', - 'MicroDVDWriter', 'SAMIReader', 'SAMIWriter', 'SRTReader', 'SRTWriter', - 'SCCReader', 'SCCWriter', 'translate_scc', 'WebVTTReader', 'WebVTTWriter', - 'CaptionReadError', 'CaptionReadNoCaptions', 'CaptionReadSyntaxError', - 'detect_format', 'CaptionNode', 'Caption', 'CaptionList', 'CaptionSet', - 'TranscriptWriter' + "CaptionConverter", + "DFXPReader", + "DFXPWriter", + "MicroDVDReader", + "MicroDVDWriter", + "SAMIReader", + "SAMIWriter", + "SRTReader", + "SRTWriter", + "SCCReader", + "SCCWriter", + "translate_scc", + "WebVTTReader", + "WebVTTWriter", + "CaptionReadError", + "CaptionReadNoCaptions", + "CaptionReadSyntaxError", + "detect_format", + "CaptionNode", + "Caption", + "CaptionList", + "CaptionSet", + "TranscriptWriter", ] SUPPORTED_READERS = ( - DFXPReader, MicroDVDReader, WebVTTReader, SAMIReader, SRTReader, SCCReader, + DFXPReader, + MicroDVDReader, + WebVTTReader, + SAMIReader, + SRTReader, + SCCReader, ) @@ -36,7 +58,7 @@ def detect_format(caps): """ if not len(caps): raise CaptionReadNoCaptions("Empty caption file") - + for reader in SUPPORTED_READERS: if reader().detect(caps): return reader diff --git a/pycaption/dfxp/__init__.py b/pycaption/dfxp/__init__.py index 75d48474..aa19ca9c 100644 --- a/pycaption/dfxp/__init__.py +++ b/pycaption/dfxp/__init__.py @@ -1,2 +1,2 @@ from .base import * # noqa: F401, F403 -from .extras import SinglePositioningDFXPWriter, LegacyDFXPWriter # noqa: F401 +from .extras import LegacyDFXPWriter, SinglePositioningDFXPWriter # noqa: F401 diff --git a/pycaption/dfxp/base.py b/pycaption/dfxp/base.py index bef05864..b6facc97 100644 --- a/pycaption/dfxp/base.py +++ b/pycaption/dfxp/base.py @@ -5,25 +5,43 @@ from bs4 import BeautifulSoup, NavigableString from ..base import ( - BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, DEFAULT_LANGUAGE_CODE, + BaseReader, + BaseWriter, + Caption, + CaptionList, + CaptionNode, + CaptionSet, ) from ..exceptions import ( - CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError, + CaptionReadNoCaptions, + CaptionReadSyntaxError, CaptionReadTimingError, + InvalidInputError, ) from ..geometry import ( - Point, Stretch, UnitEnum, Padding, VerticalAlignmentEnum, - HorizontalAlignmentEnum, Alignment, Layout, + Alignment, + HorizontalAlignmentEnum, + Layout, + Padding, + Point, + Stretch, + UnitEnum, + VerticalAlignmentEnum, ) from ..utils import is_leaf __all__ = [ - 'DFXP_BASE_MARKUP', 'DFXP_DEFAULT_STYLE', 'DFXP_DEFAULT_STYLE_ID', - 'DFXP_DEFAULT_REGION_ID', 'DFXPReader', 'DFXPWriter', 'DFXP_DEFAULT_REGION' + "DFXP_BASE_MARKUP", + "DFXP_DEFAULT_STYLE", + "DFXP_DEFAULT_STYLE_ID", + "DFXP_DEFAULT_REGION_ID", + "DFXPReader", + "DFXPWriter", + "DFXP_DEFAULT_REGION", ] -DFXP_BASE_MARKUP = ''' +DFXP_BASE_MARKUP = """ @@ -32,36 +50,35 @@ -''' +""" DFXP_DEFAULT_STYLE = { - 'color': 'white', - 'font-family': 'monospace', - 'font-size': '1c', + "color": "white", + "font-family": "monospace", + "font-size": "1c", } DFXP_DEFAULT_REGION = Layout( - alignment=Alignment( - HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM) + alignment=Alignment(HorizontalAlignmentEnum.START, VerticalAlignmentEnum.BOTTOM) ) -DFXP_DEFAULT_STYLE_ID = 'default' -DFXP_DEFAULT_REGION_ID = 'bottom' +DFXP_DEFAULT_STYLE_ID = "default" +DFXP_DEFAULT_REGION_ID = "bottom" CLOCK_TIME_PATTERN = ( - r'(?P(?P\d+):(?P\d{2}):(?P\d{2})' - r'(:(?P\d{2})|\.(?P\d+))?)' + r"(?P(?P\d+):(?P\d{2}):(?P\d{2})" + r"(:(?P\d{2})|\.(?P\d+))?)" ) -OFFSET_TIME_PATTERN = (r'(?P(?P\d+(\.\d+)?)' - r'(?Ph|m|s|ms|f|t))') -TIME_EXPRESSION_PATTERN = re.compile( - fr'^({CLOCK_TIME_PATTERN}|{OFFSET_TIME_PATTERN})$') +OFFSET_TIME_PATTERN = ( + r"(?P(?P\d+(\.\d+)?)" r"(?Ph|m|s|ms|f|t))" +) +TIME_EXPRESSION_PATTERN = re.compile(rf"^({CLOCK_TIME_PATTERN}|{OFFSET_TIME_PATTERN})$") MICROSECONDS_PER_UNIT = { "hours": 3600000000, "minutes": 60000000, "seconds": 1000000, - "milliseconds": 1000 + "milliseconds": 1000, } DFXP_DEFAULT_LANGUAGE_CODE = "en" @@ -78,43 +95,41 @@ def __init__(self, *args, **kw): that permits use of attributes in the TT Style Namespace; however, this attribute applies as a style property only to those element types indicated in the following table.""" - self.read_invalid_positioning = ( - kw.get('read_invalid_positioning', False)) + self.read_invalid_positioning = kw.get("read_invalid_positioning", False) self.nodes = [] def detect(self, content): - if '' in content.lower(): + if "" in content.lower(): return True else: return False def read(self, content): if not isinstance(content, str): - raise InvalidInputError('The content is not a unicode string.') + raise InvalidInputError("The content is not a unicode string.") dfxp_document = self._get_dfxp_parser_class()( - content, read_invalid_positioning= - self.read_invalid_positioning) + content, read_invalid_positioning=self.read_invalid_positioning + ) caption_dict = {} style_dict = {} - default_language = dfxp_document.tt.attrs.get('xml:lang', - DEFAULT_LANGUAGE_CODE) + default_language = dfxp_document.tt.attrs.get("xml:lang", DEFAULT_LANGUAGE_CODE) # Each div represents all the captions for a single language. - for div in dfxp_document.find_all('div'): - lang = div.attrs.get('xml:lang', default_language) + for div in dfxp_document.find_all("div"): + lang = div.attrs.get("xml:lang", default_language) caption_dict[lang] = self._convert_div_to_caption_list(div) - for style in dfxp_document.find_all('style'): - id_ = style.attrs.get('xml:id') or style.attrs.get('id') + for style in dfxp_document.find_all("style"): + id_ = style.attrs.get("xml:id") or style.attrs.get("id") if id_: # Don't create document styles for those styles that are # descendants of tags. See link: # http://www.w3.org/TR/ttaf1-dfxp/#styling-vocabulary-style - if 'region' not in [parent_.name for parent_ in style.parents]: + if "region" not in [parent_.name for parent_ in style.parents]: style_dict[id_] = self._convert_style(style) caption_set = CaptionSet(caption_dict, styles=style_dict) @@ -131,9 +146,12 @@ def _get_dfxp_parser_class(): def _convert_div_to_caption_list(self, div): return CaptionList( - [self._convert_p_tag_to_caption(p_tag) - for p_tag in div.find_all('p') if p_tag.get_text().strip()], - div.layout_info + [ + self._convert_p_tag_to_caption(p_tag) + for p_tag in div.find_all("p") + if p_tag.get_text().strip() + ], + div.layout_info, ) def _convert_p_tag_to_caption(self, p_tag): @@ -144,27 +162,27 @@ def _convert_p_tag_to_caption(self, p_tag): if len(self.nodes) > 0: return Caption( - start, end, self.nodes, style=styles, - layout_info=p_tag.layout_info) + start, end, self.nodes, style=styles, layout_info=p_tag.layout_info + ) return None def _find_and_convert_times(self, p_tag): - begin = p_tag.get('begin') + begin = p_tag.get("begin") if not begin: - raise CaptionReadTimingError( - f'Missing begin time on line {p_tag}.') + raise CaptionReadTimingError(f"Missing begin time on line {p_tag}.") - end = p_tag.get('end') - dur = p_tag.get('dur') + end = p_tag.get("end") + dur = p_tag.get("dur") if not end and not dur: raise CaptionReadTimingError( - f'Missing end time or duration on line {p_tag}.') + f"Missing end time or duration on line {p_tag}." + ) start = self._convert_timestamp_to_microseconds(begin) if end: - end = self._convert_timestamp_to_microseconds(p_tag['end']) + end = self._convert_timestamp_to_microseconds(p_tag["end"]) else: - dur = self._convert_timestamp_to_microseconds(p_tag['dur']) + dur = self._convert_timestamp_to_microseconds(p_tag["dur"]) end = start + dur return start, end @@ -173,32 +191,41 @@ def _convert_timestamp_to_microseconds(self, stamp): match = TIME_EXPRESSION_PATTERN.search(stamp) if not match: raise CaptionReadTimingError( - f'Invalid timestamp: {stamp}. Accepted formats: hh:mm:ss / ' - 'hh:mm:ss:ff / hh:mm:ss.sub-frames / time_count h|m|s|ms|f.') - if match.group('clock_time'): + f"Invalid timestamp: {stamp}. Accepted formats: hh:mm:ss / " + "hh:mm:ss:ff / hh:mm:ss.sub-frames / time_count h|m|s|ms|f." + ) + if match.group("clock_time"): return self._convert_clock_time_to_microseconds(match) else: return self._convert_time_count_to_microseconds(match) @staticmethod def _convert_clock_time_to_microseconds(clock_time_match): - microseconds = int(clock_time_match.group('hours')) * \ - MICROSECONDS_PER_UNIT["hours"] - microseconds += int(clock_time_match.group('minutes')) * \ - MICROSECONDS_PER_UNIT["minutes"] - microseconds += int(clock_time_match.group('seconds')) * \ - MICROSECONDS_PER_UNIT["seconds"] - if clock_time_match.group('sub_frames'): - microseconds += int(clock_time_match.group('sub_frames').ljust( - 3, '0')) * MICROSECONDS_PER_UNIT["milliseconds"] - elif clock_time_match.group('frames'): - microseconds += int(clock_time_match.group('frames')) / 30 * \ - MICROSECONDS_PER_UNIT["seconds"] + microseconds = ( + int(clock_time_match.group("hours")) * MICROSECONDS_PER_UNIT["hours"] + ) + microseconds += ( + int(clock_time_match.group("minutes")) * MICROSECONDS_PER_UNIT["minutes"] + ) + microseconds += ( + int(clock_time_match.group("seconds")) * MICROSECONDS_PER_UNIT["seconds"] + ) + if clock_time_match.group("sub_frames"): + microseconds += ( + int(clock_time_match.group("sub_frames").ljust(3, "0")) + * MICROSECONDS_PER_UNIT["milliseconds"] + ) + elif clock_time_match.group("frames"): + microseconds += ( + int(clock_time_match.group("frames")) + / 30 + * MICROSECONDS_PER_UNIT["seconds"] + ) return int(microseconds) @staticmethod def _convert_time_count_to_microseconds(time_count_match): - value = float(time_count_match.group('time_count')) + value = float(time_count_match.group("time_count")) metric = time_count_match.group("metric") if metric == "h": microseconds = value * MICROSECONDS_PER_UNIT["hours"] @@ -211,8 +238,9 @@ def _convert_time_count_to_microseconds(time_count_match): elif metric == "f": microseconds = value / 30 * MICROSECONDS_PER_UNIT["seconds"] elif metric == "t": - raise NotImplementedError("The tick metric for time count is " - "not currently implemented.") + raise NotImplementedError( + "The tick metric for time count is " "not currently implemented." + ) return int(microseconds) def _convert_tag_to_node(self, tag): @@ -228,15 +256,13 @@ def _convert_tag_to_node(self, tag): # unicode string with xml entities already converted to unicode # characters. tag_text = result.groups()[0] - node = CaptionNode.create_text( - tag_text, layout_info=tag.layout_info) + node = CaptionNode.create_text(tag_text, layout_info=tag.layout_info) self.nodes.append(node) # convert line breaks - elif tag.name == 'br': - self.nodes.append( - CaptionNode.create_break(layout_info=tag.layout_info)) + elif tag.name == "br": + self.nodes.append(CaptionNode.create_break(layout_info=tag.layout_info)) # convert italics - elif tag.name == 'span': + elif tag.name == "span": # convert span self._convert_span_to_nodes(tag) else: @@ -251,9 +277,8 @@ def _convert_span_to_nodes(self, tag): # TODO - this is an obvious very old bug. args will be a dictionary. # but since nobody complained, I'll leave it like that. # Happy investigating! - if args != '': - node = CaptionNode.create_style( - True, args, layout_info=tag.layout_info) + if args != "": + node = CaptionNode.create_style(True, args, layout_info=tag.layout_info) node.start = True node.content = args self.nodes.append(node) @@ -261,8 +286,7 @@ def _convert_span_to_nodes(self, tag): # recursively call function for any children elements for a in tag.contents: self._convert_tag_to_node(a) - node = CaptionNode.create_style( - False, args, layout_info=tag.layout_info) + node = CaptionNode.create_style(False, args, layout_info=tag.layout_info) node.start = False node.content = args self.nodes.append(node) @@ -288,37 +312,36 @@ def _convert_style(self, tag): for arg in dfxp_attrs: if arg.lower() == "style": # Support multiple classes per tag - attrs['classes'] = dfxp_attrs[arg].strip().split(' ') + attrs["classes"] = dfxp_attrs[arg].strip().split(" ") # Save old class attribute for compatibility - attrs['class'] = dfxp_attrs[arg] + attrs["class"] = dfxp_attrs[arg] elif arg.lower() == "tts:fontstyle" and dfxp_attrs[arg] == "italic": - attrs['italics'] = True + attrs["italics"] = True elif arg.lower() == "tts:fontweight" and dfxp_attrs[arg] == "bold": - attrs['bold'] = True + attrs["bold"] = True elif (arg.lower() == "tts:textdecoration" and "underline" in dfxp_attrs[arg].strip().split(" ")): - attrs['underline'] = True + attrs["underline"] = True elif arg.lower() == "tts:textalign": - attrs['text-align'] = dfxp_attrs[arg] + attrs["text-align"] = dfxp_attrs[arg] elif arg.lower() == "tts:fontfamily": - attrs['font-family'] = dfxp_attrs[arg] + attrs["font-family"] = dfxp_attrs[arg] elif arg.lower() == "tts:fontsize": - attrs['font-size'] = dfxp_attrs[arg] + attrs["font-size"] = dfxp_attrs[arg] elif arg.lower() == "tts:color": - attrs['color'] = dfxp_attrs[arg] + attrs["color"] = dfxp_attrs[arg] return attrs class DFXPWriter(BaseWriter): def __init__(self, *args, **kwargs): - self.write_inline_positioning = kwargs.pop( - 'write_inline_positioning', False) + self.write_inline_positioning = kwargs.pop("write_inline_positioning", False) self.p_style = False self.open_span = False self.region_creator = None super().__init__(*args, **kwargs) - def write(self, caption_set, force=''): + def write(self, caption_set, force=""): """Converts a CaptionSet into an equivalent corresponding DFXP file :type caption_set: pycaption.base.CaptionSet @@ -327,14 +350,14 @@ def write(self, caption_set, force=''): :rtype: str """ - dfxp = BeautifulSoup(DFXP_BASE_MARKUP, 'lxml-xml') + dfxp = BeautifulSoup(DFXP_BASE_MARKUP, "lxml-xml") langs = caption_set.get_languages() if force in langs: langs = [force] - dfxp.find('tt')['xml:lang'] = force + dfxp.find("tt")["xml:lang"] = force else: - dfxp.find('tt')['xml:lang'] = DFXP_DEFAULT_LANGUAGE_CODE + dfxp.find("tt")["xml:lang"] = DFXP_DEFAULT_LANGUAGE_CODE caption_set = deepcopy(caption_set) @@ -343,10 +366,12 @@ def write(self, caption_set, force=''): for lang in langs: for caption in caption_set.get_captions(lang): caption.layout_info = self._relativize_and_fit_to_screen( - caption.layout_info) + caption.layout_info + ) for node in caption.nodes: node.layout_info = self._relativize_and_fit_to_screen( - node.layout_info) + node.layout_info + ) # Create the styles in the section, or a default style. for style_id, style in caption_set.get_styles(): @@ -354,27 +379,28 @@ def write(self, caption_set, force=''): dfxp = self._recreate_styling_tag(style_id, style, dfxp) if not caption_set.get_styles(): dfxp = self._recreate_styling_tag( - DFXP_DEFAULT_STYLE_ID, DFXP_DEFAULT_STYLE, dfxp) + DFXP_DEFAULT_STYLE_ID, DFXP_DEFAULT_STYLE, dfxp + ) - self.region_creator = self._get_region_creator_class()( - dfxp, caption_set) + self.region_creator = self._get_region_creator_class()(dfxp, caption_set) self.region_creator.create_document_regions() - body = dfxp.find('body') + body = dfxp.find("body") for lang in langs: - div = dfxp.new_tag('div') - div['xml:lang'] = lang + div = dfxp.new_tag("div") + div["xml:lang"] = lang self._assign_positioning_data(div, lang, caption_set) for caption in caption_set.get_captions(lang): if caption.style: caption_style = caption.style else: - caption_style = {'class': DFXP_DEFAULT_STYLE_ID} + caption_style = {"class": DFXP_DEFAULT_STYLE_ID} p = self._recreate_p_tag( - caption, caption_style, dfxp, caption_set, lang) + caption, caption_style, dfxp, caption_set, lang + ) self._assign_positioning_data(p, lang, caption_set, caption) div.append(p) @@ -388,8 +414,9 @@ def _get_region_creator_class(): """Hook method for providing a custom RegionCreator""" return RegionCreator - def _assign_positioning_data(self, tag, lang, caption_set=None, - caption=None, caption_node=None): + def _assign_positioning_data( + self, tag, lang, caption_set=None, caption=None, caption_node=None + ): """Modifies the current tag, assigning it the 'region' attribute. :param tag: the BeautifulSoup tag to be modified @@ -401,10 +428,11 @@ def _assign_positioning_data(self, tag, lang, caption_set=None, :type caption_node: CaptionNode """ assigned_id, attribs = self.region_creator.get_positioning_info( - lang, caption_set, caption, caption_node) + lang, caption_set, caption, caption_node + ) if assigned_id: - tag['region'] = assigned_id + tag["region"] = assigned_id # Write non-standard positioning information if self.write_inline_positioning: @@ -412,84 +440,81 @@ def _assign_positioning_data(self, tag, lang, caption_set=None, def _recreate_styling_tag(self, style, content, dfxp): # TODO - should be drastically simplified: if attributes : append - dfxp_style = dfxp.new_tag('style') - dfxp_style.attrs.update({'xml:id': style}) + dfxp_style = dfxp.new_tag("style") + dfxp_style.attrs.update({"xml:id": style}) attributes = _recreate_style(content, dfxp) dfxp_style.attrs.update(attributes) - new_tag = dfxp.new_tag('style') - new_tag.attrs.update({'xml:id': style}) + new_tag = dfxp.new_tag("style") + new_tag.attrs.update({"xml:id": style}) if dfxp_style != new_tag: - dfxp.find('styling').append(dfxp_style) + dfxp.find("styling").append(dfxp_style) return dfxp - def _recreate_p_tag(self, caption, caption_style, dfxp, caption_set=None, - lang=None): + def _recreate_p_tag( + self, caption, caption_style, dfxp, caption_set=None, lang=None + ): start = caption.format_start() end = caption.format_end() p = dfxp.new_tag("p", begin=start, end=end) p.string = self._recreate_text(caption, dfxp, caption_set, lang) if dfxp.find("style", {"xml:id": "p"}): - p['style'] = 'p' + p["style"] = "p" p.attrs.update(_recreate_style(caption_style, dfxp)) return p def _recreate_text(self, caption, dfxp, caption_set=None, lang=None): - line = '' + line = "" for node in caption.nodes: if node.type_ == CaptionNode.TEXT: line += self._encode(node.content) elif node.type_ == CaptionNode.BREAK: - line = line.rstrip() + '
\n ' + line = line.rstrip() + "
\n " elif node.type_ == CaptionNode.STYLE: - line = self._recreate_span( - line, node, dfxp, caption_set, caption, lang) + line = self._recreate_span(line, node, dfxp, caption_set, caption, lang) return line.rstrip() - def _recreate_span(self, line, node, dfxp, caption_set=None, caption=None, - lang=None): + def _recreate_span( + self, line, node, dfxp, caption_set=None, caption=None, lang=None + ): # TODO - This method seriously has to go away! # Because of the CaptionNode.STYLE nodes, tree-like structures are # are really hard to build, and proper xml elements can't be added. # We are left with creating tags manually, which is hard to understand # and harder to maintain if node.start: - styles = '' + styles = "" content_with_style = _recreate_style(node.content, dfxp) for style, value in list(content_with_style.items()): styles += f' {style}="{value}"' if node.layout_info: - region_id, region_attribs = ( - self.region_creator.get_positioning_info( - lang, caption_set, caption, node - )) + region_id, region_attribs = self.region_creator.get_positioning_info( + lang, caption_set, caption, node + ) styles += f' region="{region_id}"' if self.write_inline_positioning: - styles += ' ' + ' '.join( - [ - f'{k_}="{v_}"' - for k_, v_ in list(region_attribs.items()) - ] + styles += " " + " ".join( + [f'{k_}="{v_}"' for k_, v_ in list(region_attribs.items())] ) if styles: if self.open_span: - line = line.rstrip() + ' ' - line += f'' + line = line.rstrip() + " " + line += f"" self.open_span = True elif self.open_span: - line = line.rstrip() + ' ' + line = line.rstrip() + " " self.open_span = False return line @@ -518,13 +543,21 @@ class LayoutAwareDFXPParser(BeautifulSoup): features we support, it was easier to use pre-order and it seems to have been enough. It should be clarified whether this is ok or not. """ + # A lot of elements will have no positioning info. Use this flyweight # to save memory NO_POSITIONING_INFO = None - def __init__(self, markup="", features="html.parser", builder=None, - parse_only=None, from_encoding=None, - read_invalid_positioning=False, **kwargs): + def __init__( + self, + markup="", + features="html.parser", + builder=None, + parse_only=None, + from_encoding=None, + read_invalid_positioning=False, + **kwargs, + ): """The `features` param determines the parser to be used. The parsers are usually html parsers, some more forgiving than others, and as such they do stuff very differently especially for xml files. We chose this @@ -558,12 +591,11 @@ def __init__(self, markup="", features="html.parser", builder=None, # Work around for lack of ''' support in html.parser markup = markup.replace("'", "'") - super().__init__( - markup, features, builder, parse_only, from_encoding, **kwargs) + super().__init__(markup, features, builder, parse_only, from_encoding, **kwargs) self.read_invalid_positioning = read_invalid_positioning - for div in self.find_all('div'): + for div in self.find_all("div"): self._pre_order_visit(div) def _pre_order_visit(self, element, inherit_from=None): @@ -585,8 +617,7 @@ def _pre_order_visit(self, element, inherit_from=None): # TODO - this looks highly cachable. If it turns out too much # memory is being taken up by the caption set, cache this with a # WeakValueDict - layout_info = ( - self._extract_positioning_information(region_id, element)) + layout_info = self._extract_positioning_information(region_id, element) element.layout_info = layout_info for child in element.contents: self._pre_order_visit(child, inherit_from=layout_info) @@ -597,7 +628,7 @@ def _get_region_from_ancestors(element): region_id = None parent = element.parent while parent: - region_id = parent.get('region') + region_id = parent.get("region") if region_id: break parent = parent.parent @@ -616,9 +647,7 @@ def _get_region_from_descendants(element): return None region_id = None - child_region_ids = { - child.get('region') for child in element.findChildren() - } + child_region_ids = {child.get("region") for child in element.findChildren()} if len(child_region_ids) > 1: raise LookupError if len(child_region_ids) == 1: @@ -641,8 +670,8 @@ def _determine_region_id(cls, element): # element could be a NavigableString. Those are dumb. region_id = None - if hasattr(element, 'get'): - region_id = element.get('region') + if hasattr(element, "get"): + region_id = element.get("region") if not region_id: region_id = cls._get_region_from_ancestors(element) @@ -669,7 +698,7 @@ def _extract_positioning_information(self, region_id, element): region_tag = None if region_id is not None: - region_tag = self.find('region', {'xml:id': region_id}) + region_tag = self.find("region", {"xml:id": region_id}) region_scraper = self._get_layout_info_scraper_class()(self, region_tag) @@ -707,13 +736,12 @@ def __init__(self, document, region=None): :param region: the region tag """ self.region = region - self._styling_section = document.findChild('styling') + self._styling_section = document.findChild("styling") if region: - self.region_styles = self._get_style_sources( - self._styling_section, region) + self.region_styles = self._get_style_sources(self._styling_section, region) else: self.region_styles = [] - self.root_element = document.find('tt') + self.root_element = document.find("tt") @classmethod def _get_style_sources(cls, styling_section, element): @@ -736,7 +764,7 @@ def _get_style_sources(cls, styling_section, element): styling """ # If we're analyzing a NavigableString, just quit - if not hasattr(element, 'findAll'): + if not hasattr(element, "findAll"): return () nested_styles = [] @@ -747,19 +775,19 @@ def _get_style_sources(cls, styling_section, element): # if the parent is a
tag. Technically, this step shouldn't be # skipped, but it would make the reader read in O(n^2) (half an hour # for 1500 timed captions) - if element.name not in ('div', 'body', 'tt'): + if element.name not in ("div", "body", "tt"): for style in element.contents: - if getattr(style, 'name', None) == 'style': + if getattr(style, "name", None) == "style": nested_styles.extend( cls._get_style_reference_chain(style, styling_section) ) - referenced_style_id = element.get('style') + referenced_style_id = element.get("style") referenced_styles = [] if referenced_style_id and styling_section: referenced_style = styling_section.findChild( - 'style', {'xml:id': referenced_style_id} + "style", {"xml:id": referenced_style_id} ) referenced_styles = cls._get_style_reference_chain( @@ -786,12 +814,10 @@ def _get_style_reference_chain(cls, style, styling_tag): if not styling_tag: return result - reference = style.get('style') + reference = style.get("style") if reference: - referenced_styles = styling_tag.findChildren( - 'style', {'xml:id': reference} - ) + referenced_styles = styling_tag.findChildren("style", {"xml:id": reference}) if len(referenced_styles) == 1: return result + cls._get_style_reference_chain( @@ -827,47 +853,49 @@ def scrape_positioning_info(self, element=None, even_invalid=False): """ usable_elem = element if even_invalid else None - origin = self._find_attribute( - usable_elem, 'tts:origin', Point.from_xml_attribute, ['auto'] - ) or DFXP_DEFAULT_REGION.origin + origin = ( + self._find_attribute( + usable_elem, "tts:origin", Point.from_xml_attribute, ["auto"] + ) + or DFXP_DEFAULT_REGION.origin + ) extent = self._find_attribute( - usable_elem, 'tts:extent', Stretch.from_xml_attribute, ['auto']) + usable_elem, "tts:extent", Stretch.from_xml_attribute, ["auto"] + ) if not extent: extent = self._find_root_extent() or DFXP_DEFAULT_REGION.extent - padding = self._find_attribute( - usable_elem, 'tts:padding', Padding.from_xml_attribute - ) or DFXP_DEFAULT_REGION.padding + padding = ( + self._find_attribute(usable_elem, "tts:padding", Padding.from_xml_attribute) + or DFXP_DEFAULT_REGION.padding + ) # tts:textAlign is a special attribute, which can not be ignored when # specified on the element itself (only

nodes matter) # On elements like it is also read, because this was legacy # behavior. - if getattr(element, 'name', None) in ('span', 'p'): + if getattr(element, "name", None) in ("span", "p"): text_align_source = element else: text_align_source = None - text_align = ( - self._find_attribute(text_align_source, 'tts:textAlign') - or _create_external_horizontal_alignment( + text_align = self._find_attribute( + text_align_source, "tts:textAlign" + ) or _create_external_horizontal_alignment( DFXP_DEFAULT_REGION.alignment.horizontal ) - ) - display_align = ( - self._find_attribute(usable_elem, 'tts:displayAlign') - or _create_external_vertical_alignment( - DFXP_DEFAULT_REGION.alignment.vertical - ) - ) + display_align = self._find_attribute( + usable_elem, "tts:displayAlign" + ) or _create_external_vertical_alignment(DFXP_DEFAULT_REGION.alignment.vertical) alignment = _create_internal_alignment(text_align, display_align) return origin, extent, padding, alignment - def _find_attribute_on_element_or_styles(self, attribute_name, element, - factory, ignore, ignorecase): + def _find_attribute_on_element_or_styles( + self, attribute_name, element, factory, ignore, ignorecase + ): """Look up the given attribute on the element, and all the styles referenced by it. @@ -886,8 +914,7 @@ def _find_attribute_on_element_or_styles(self, attribute_name, element, ) if value is None: # Does a referenced style of the element have it? - for style in self._get_style_sources( - self._styling_section, element): + for style in self._get_style_sources(self._styling_section, element): value = _get_object_from_attribute( style, attribute_name, factory, ignore, ignorecase ) @@ -895,8 +922,9 @@ def _find_attribute_on_element_or_styles(self, attribute_name, element, break return value - def _find_attribute(self, element, attribute_name, factory=lambda x: x, - ignore=(), ignorecase=True): + def _find_attribute( + self, element, attribute_name, factory=lambda x: x, ignore=(), ignorecase=True + ): """Try to find the `attribute_name` specified on the element, all its parents and all their styles (and referenced styles). @@ -918,7 +946,8 @@ def _find_attribute(self, element, attribute_name, factory=lambda x: x, # Does the element itself have it inline, or any of its styles? if element: value = self._find_attribute_on_element_or_styles( - attribute_name, element, factory, ignore, ignorecase) + attribute_name, element, factory, ignore, ignorecase + ) if value is None: # Do any of the element's parents have the attribute? @@ -952,7 +981,7 @@ def _find_root_extent(self): if extent is None: root = self.root_element extent = _get_object_from_attribute( - root, 'tts:extent', Stretch.from_xml_attribute + root, "tts:extent", Stretch.from_xml_attribute ) if extent is not None: @@ -1047,15 +1076,18 @@ def _create_unique_regions(unique_layouts, dfxp, id_factory): :rtype: dict """ region_map = {} - layout_section = dfxp.find('layout') + layout_section = dfxp.find("layout") for region_spec in unique_layouts: if ( - region_spec.origin or region_spec.extent - or region_spec.padding or region_spec.alignment): - new_region = dfxp.new_tag('region') + region_spec.origin + or region_spec.extent + or region_spec.padding + or region_spec.alignment + ): + new_region = dfxp.new_tag("region") new_id = id_factory() - new_region['xml:id'] = new_id + new_region["xml:id"] = new_id region_map[region_spec] = new_id region_attribs = _convert_layout_to_attributes(region_spec) @@ -1071,29 +1103,31 @@ def create_document_regions(self): """ # Creates the default region default_region_map = self._create_unique_regions( - [DFXP_DEFAULT_REGION], - self._dfxp, lambda: DFXP_DEFAULT_REGION_ID + [DFXP_DEFAULT_REGION], self._dfxp, lambda: DFXP_DEFAULT_REGION_ID ) unique_regions = self._collect_unique_regions( - self._caption_set, DFXP_DEFAULT_REGION) + self._caption_set, DFXP_DEFAULT_REGION + ) # Create the document specified regions self._region_map = self._create_unique_regions( - unique_regions, self._dfxp, self._get_new_id) + unique_regions, self._dfxp, self._get_new_id + ) self._region_map.update(default_region_map) - def _get_new_id(self, prefix='r'): + def _get_new_id(self, prefix="r"): """Return new, unique ids (use an internal counter). :type prefix: str """ - new_id = f'{prefix}{self._id_seed}' + new_id = f"{prefix}{self._id_seed}" self._id_seed += 1 return new_id def get_positioning_info( - self, lang, caption_set=None, caption=None, caption_node=None): + self, lang, caption_set=None, caption=None, caption_node=None + ): """For the given element will return a valid region ID, used for assigning to the element, and a dict containing the positioning attributes of that region (useful for inline non-standard positioning) @@ -1147,37 +1181,37 @@ def get_positioning_info( def cleanup_regions(self): """Remove the unused regions from the output file""" - layout_tag = self._dfxp.find('layout') + layout_tag = self._dfxp.find("layout") if not layout_tag: return - regions = layout_tag.findChildren('region') + regions = layout_tag.findChildren("region") if not regions: return for region in regions: - if region.attrs.get('xml:id') not in self._assigned_region_ids: + if region.attrs.get("xml:id") not in self._assigned_region_ids: region.extract() def _recreate_style(content, dfxp): dfxp_style = {} - if 'class' in content: - if dfxp.find("style", {"xml:id": content['class']}): - dfxp_style['style'] = content['class'] - if 'text-align' in content: - dfxp_style['tts:textAlign'] = content['text-align'] - if 'italics' in content: - dfxp_style['tts:fontStyle'] = 'italic' - if 'font-family' in content: - dfxp_style['tts:fontFamily'] = content['font-family'] - if 'font-size' in content: - dfxp_style['tts:fontSize'] = content['font-size'] - if 'color' in content: - dfxp_style['tts:color'] = content['color'] - if 'display-align' in content: - dfxp_style['tts:displayAlign'] = content['display-align'] + if "class" in content: + if dfxp.find("style", {"xml:id": content["class"]}): + dfxp_style["style"] = content["class"] + if "text-align" in content: + dfxp_style["tts:textAlign"] = content["text-align"] + if "italics" in content: + dfxp_style["tts:fontStyle"] = "italic" + if "font-family" in content: + dfxp_style["tts:fontFamily"] = content["font-family"] + if "font-size" in content: + dfxp_style["tts:fontSize"] = content["font-size"] + if "color" in content: + dfxp_style["tts:color"] = content["color"] + if "display-align" in content: + dfxp_style["tts:displayAlign"] = content["display-align"] return dfxp_style @@ -1204,8 +1238,7 @@ def _create_internal_alignment(text_align, display_align): if not (text_align or display_align): return None - return Alignment.from_horizontal_and_vertical_align( - text_align, display_align) + return Alignment.from_horizontal_and_vertical_align(text_align, display_align) def _create_external_horizontal_alignment(horizontal_component): @@ -1218,15 +1251,15 @@ def _create_external_horizontal_alignment(horizontal_component): result = None if horizontal_component == HorizontalAlignmentEnum.LEFT: - result = 'left' + result = "left" if horizontal_component == HorizontalAlignmentEnum.CENTER: - result = 'center' + result = "center" if horizontal_component == HorizontalAlignmentEnum.RIGHT: - result = 'right' + result = "right" if horizontal_component == HorizontalAlignmentEnum.START: - result = 'start' + result = "start" if horizontal_component == HorizontalAlignmentEnum.END: - result = 'end' + result = "end" return result @@ -1241,11 +1274,11 @@ def _create_external_vertical_alignment(vertical_component): result = None if vertical_component == VerticalAlignmentEnum.TOP: - result = 'before' + result = "before" if vertical_component == VerticalAlignmentEnum.CENTER: - result = 'center' + result = "center" if vertical_component == VerticalAlignmentEnum.BOTTOM: - result = 'after' + result = "after" return result @@ -1265,21 +1298,20 @@ def _create_external_alignment(alignment): if not (alignment.horizontal or alignment.vertical): return result - horizontal_alignment = _create_external_horizontal_alignment( - alignment.horizontal) + horizontal_alignment = _create_external_horizontal_alignment(alignment.horizontal) if horizontal_alignment: - result['tts:textAlign'] = horizontal_alignment + result["tts:textAlign"] = horizontal_alignment - vertical_alignment = _create_external_vertical_alignment( - alignment.vertical) + vertical_alignment = _create_external_vertical_alignment(alignment.vertical) if vertical_alignment: - result['tts:displayAlign'] = vertical_alignment + result["tts:displayAlign"] = vertical_alignment return result -def _get_object_from_attribute(tag, attr_name, factory, - ignore_vals=(), ignorecase=True): +def _get_object_from_attribute( + tag, attr_name, factory, ignore_vals=(), ignorecase=True +): """For the xml `tag`, tries to retrieve the attribute `attr_name` and pass that to the factory in order to get a result. If the value of the attribute is in the `ignore_vals` iterable, returns None. @@ -1291,7 +1323,7 @@ def _get_object_from_attribute(tag, attr_name, factory, :param ignore_vals: iterable of attribute values to ignore :raise CaptionReadSyntaxError: if the attribute has some crazy value """ - if not hasattr(tag, 'has_attr'): + if not hasattr(tag, "has_attr"): return attr_value = None @@ -1329,13 +1361,13 @@ def _convert_layout_to_attributes(layout): return _create_external_alignment(DFXP_DEFAULT_REGION.alignment) if layout.origin: - result['tts:origin'] = layout.origin.to_xml_attribute() + result["tts:origin"] = layout.origin.to_xml_attribute() if layout.extent: - result['tts:extent'] = layout.extent.to_xml_attribute() + result["tts:extent"] = layout.extent.to_xml_attribute() if layout.padding: - result['tts:padding'] = layout.padding.to_xml_attribute() + result["tts:padding"] = layout.padding.to_xml_attribute() if layout.alignment: result.update(_create_external_alignment(layout.alignment)) diff --git a/pycaption/dfxp/extras.py b/pycaption/dfxp/extras.py index 70a60c82..dfc4fa7f 100644 --- a/pycaption/dfxp/extras.py +++ b/pycaption/dfxp/extras.py @@ -6,10 +6,10 @@ from bs4 import BeautifulSoup -from .base import DFXPWriter, DFXP_DEFAULT_REGION from ..base import BaseWriter, CaptionNode, merge_concurrent_captions +from .base import DFXP_DEFAULT_REGION, DFXPWriter -LEGACY_DFXP_BASE_MARKUP = ''' +LEGACY_DFXP_BASE_MARKUP = """ @@ -18,33 +18,30 @@ -''' +""" LEGACY_DFXP_DEFAULT_STYLE = { - 'color': 'white', - 'font-family': 'monospace', - 'font-size': '1c', + "color": "white", + "font-family": "monospace", + "font-size": "1c", } -LEGACY_DFXP_DEFAULT_STYLE_ID = 'default' -LEGACY_DFXP_DEFAULT_REGION_ID = 'bottom' +LEGACY_DFXP_DEFAULT_STYLE_ID = "default" +LEGACY_DFXP_DEFAULT_REGION_ID = "bottom" -LEGACY_DFXP_DEFAULT_REGION = { - 'text-align': 'center', - 'display-align': 'after' -} +LEGACY_DFXP_DEFAULT_REGION = {"text-align": "center", "display-align": "after"} class SinglePositioningDFXPWriter(DFXPWriter): """ A dfxp writer, that ignores all positioning, using a single provided value """ - def __init__(self, default_positioning=DFXP_DEFAULT_REGION, - *args, **kwargs): + + def __init__(self, default_positioning=DFXP_DEFAULT_REGION, *args, **kwargs): super().__init__(*args, **kwargs) self.default_positioning = default_positioning - def write(self, captions_set, force=''): + def write(self, captions_set, force=""): """Writes a DFXP file using the positioning provided in the initializer :type captions_set: pycaption.base.CaptionSet @@ -52,7 +49,8 @@ def write(self, captions_set, force=''): :rtype: str """ captions_set = self._create_single_positioning_caption_set( - captions_set, self.default_positioning) + captions_set, self.default_positioning + ) return super().write(captions_set, force) # noqa @@ -80,42 +78,45 @@ def _create_single_positioning_caption_set(caption_set, positioning): caption.layout_info = positioning for node in caption.nodes: - if hasattr(node, 'layout_info'): + if hasattr(node, "layout_info"): node.layout_info = positioning for _, style in caption_set.get_styles(): - if 'text-align' in style: - style.pop('text-align') + if "text-align" in style: + style.pop("text-align") return caption_set class LegacyDFXPWriter(BaseWriter): """Ported the legacy DFXPWriter from 0.4.5""" + def __init__(self, *args, **kw): self.p_style = False self.open_span = False - def write(self, caption_set, force=''): + def write(self, caption_set, force=""): caption_set = deepcopy(caption_set) caption_set = merge_concurrent_captions(caption_set) - dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, 'lxml-xml') - dfxp.find('tt')['xml:lang'] = "en" + dfxp = BeautifulSoup(LEGACY_DFXP_BASE_MARKUP, "lxml-xml") + dfxp.find("tt")["xml:lang"] = "en" for style_id, style in caption_set.get_styles(): if style != {}: dfxp = self._recreate_styling_tag(style_id, style, dfxp) if not caption_set.get_styles(): dfxp = self._recreate_styling_tag( - LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp) + LEGACY_DFXP_DEFAULT_STYLE_ID, LEGACY_DFXP_DEFAULT_STYLE, dfxp + ) # XXX For now we will always use this default region. In the future if # regions are provided, they will be kept dfxp = self._recreate_region_tag( - LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp) + LEGACY_DFXP_DEFAULT_REGION_ID, LEGACY_DFXP_DEFAULT_REGION, dfxp + ) - body = dfxp.find('body') + body = dfxp.find("body") if force: langs = [self._force_language(force, caption_set.get_languages())] @@ -123,17 +124,18 @@ def write(self, caption_set, force=''): langs = caption_set.get_languages() for lang in langs: - div = dfxp.new_tag('div') - div['xml:lang'] = lang + div = dfxp.new_tag("div") + div["xml:lang"] = lang for caption in caption_set.get_captions(lang): if caption.style: caption_style = caption.style - caption_style.update( - {'region': LEGACY_DFXP_DEFAULT_REGION_ID}) + caption_style.update({"region": LEGACY_DFXP_DEFAULT_REGION_ID}) else: - caption_style = {'class': LEGACY_DFXP_DEFAULT_STYLE_ID, - 'region': LEGACY_DFXP_DEFAULT_REGION_ID} + caption_style = { + "class": LEGACY_DFXP_DEFAULT_STYLE_ID, + "region": LEGACY_DFXP_DEFAULT_REGION_ID, + } p = self._recreate_p_tag(caption, caption_style, dfxp) div.append(p) @@ -151,29 +153,29 @@ def _force_language(self, force, langs): return langs[-1] def _recreate_region_tag(self, region_id, styling, dfxp): - dfxp_region = dfxp.new_tag('region') - dfxp_region.attrs.update({'xml:id': region_id}) + dfxp_region = dfxp.new_tag("region") + dfxp_region.attrs.update({"xml:id": region_id}) attributes = self._recreate_style(styling, dfxp) dfxp_region.attrs.update(attributes) - new_tag = dfxp.new_tag('region') - new_tag.attrs.update({'xml:id': region_id}) + new_tag = dfxp.new_tag("region") + new_tag.attrs.update({"xml:id": region_id}) if dfxp_region != new_tag: - dfxp.find('layout').append(dfxp_region) + dfxp.find("layout").append(dfxp_region) return dfxp def _recreate_styling_tag(self, style, content, dfxp): - dfxp_style = dfxp.new_tag('style') - dfxp_style.attrs.update({'xml:id': style}) + dfxp_style = dfxp.new_tag("style") + dfxp_style.attrs.update({"xml:id": style}) attributes = self._recreate_style(content, dfxp) dfxp_style.attrs.update(attributes) - new_tag = dfxp.new_tag('style') - new_tag.attrs.update({'xml:id': style}) + new_tag = dfxp.new_tag("style") + new_tag.attrs.update({"xml:id": style}) if dfxp_style != new_tag: - dfxp.find('styling').append(dfxp_style) + dfxp.find("styling").append(dfxp_style) return dfxp @@ -184,21 +186,21 @@ def _recreate_p_tag(self, caption, caption_style, dfxp): p.string = self._recreate_text(caption, dfxp) if dfxp.find("style", {"xml:id": "p"}): - p['style'] = 'p' + p["style"] = "p" p.attrs.update(self._recreate_style(caption_style, dfxp)) return p def _recreate_text(self, caption, dfxp): - line = '' + line = "" for node in caption.nodes: if node.type_ == CaptionNode.TEXT: - line += escape(node.content) + ' ' + line += escape(node.content) + " " elif node.type_ == CaptionNode.BREAK: - line = line.rstrip() + '
\n ' + line = line.rstrip() + "
\n " elif node.type_ == CaptionNode.STYLE: line = self._recreate_span(line, node, dfxp) @@ -207,7 +209,7 @@ def _recreate_text(self, caption, dfxp): def _recreate_span(self, line, node, dfxp): if node.start: - styles = '' + styles = "" content_with_style = self._recreate_style(node.content, dfxp) for style, value in list(content_with_style.items()): @@ -215,12 +217,12 @@ def _recreate_span(self, line, node, dfxp): if styles: if self.open_span: - line = line.rstrip() + '
' - line += f'' + line = line.rstrip() + " " + line += f"" self.open_span = True elif self.open_span: - line = line.rstrip() + ' ' + line = line.rstrip() + " " self.open_span = False return line @@ -228,23 +230,23 @@ def _recreate_span(self, line, node, dfxp): def _recreate_style(self, content, dfxp): dfxp_style = {} - if 'region' in content: - if dfxp.find('region', {'xml:id': content['region']}): - dfxp_style['region'] = content['region'] - if 'class' in content: - if dfxp.find("style", {"xml:id": content['class']}): - dfxp_style['style'] = content['class'] - if 'text-align' in content: - dfxp_style['tts:textAlign'] = content['text-align'] - if 'italics' in content: - dfxp_style['tts:fontStyle'] = 'italic' - if 'font-family' in content: - dfxp_style['tts:fontFamily'] = content['font-family'] - if 'font-size' in content: - dfxp_style['tts:fontSize'] = content['font-size'] - if 'color' in content: - dfxp_style['tts:color'] = content['color'] - if 'display-align' in content: - dfxp_style['tts:displayAlign'] = content['display-align'] + if "region" in content: + if dfxp.find("region", {"xml:id": content["region"]}): + dfxp_style["region"] = content["region"] + if "class" in content: + if dfxp.find("style", {"xml:id": content["class"]}): + dfxp_style["style"] = content["class"] + if "text-align" in content: + dfxp_style["tts:textAlign"] = content["text-align"] + if "italics" in content: + dfxp_style["tts:fontStyle"] = "italic" + if "font-family" in content: + dfxp_style["tts:fontFamily"] = content["font-family"] + if "font-size" in content: + dfxp_style["tts:fontSize"] = content["font-size"] + if "color" in content: + dfxp_style["tts:color"] = content["color"] + if "display-align" in content: + dfxp_style["tts:displayAlign"] = content["display-align"] return dfxp_style diff --git a/pycaption/exceptions.py b/pycaption/exceptions.py index 0474c05d..7afcf2f3 100644 --- a/pycaption/exceptions.py +++ b/pycaption/exceptions.py @@ -2,8 +2,9 @@ class CaptionReadError(Exception): """ Generic error raised when the reading of the caption file failed. """ + def __str__(self): - return f'{self.__class__.__name__}({self.args[0]})' + return f"{self.__class__.__name__}({self.args[0]})" class CaptionReadNoCaptions(CaptionReadError): diff --git a/pycaption/geometry.py b/pycaption/geometry.py index f5cc8b07..b6375ac4 100644 --- a/pycaption/geometry.py +++ b/pycaption/geometry.py @@ -10,7 +10,7 @@ import re from enum import Enum -from .exceptions import RelativizationError, CaptionReadSyntaxError +from .exceptions import CaptionReadSyntaxError, RelativizationError class UnitEnum(Enum): @@ -22,11 +22,12 @@ class UnitEnum(Enum): if unit == UnitEnum.CELL : ... """ - PIXEL = 'px' - EM = 'em' - PERCENT = '%' - CELL = 'c' - PT = 'pt' + + PIXEL = "px" + EM = "em" + PERCENT = "%" + CELL = "c" + PT = "pt" class VerticalAlignmentEnum(Enum): @@ -37,18 +38,20 @@ class VerticalAlignmentEnum(Enum): if alignment == VerticalAlignmentEnum.BOTTOM: ... """ - TOP = 'top' - CENTER = 'center' - BOTTOM = 'bottom' + + TOP = "top" + CENTER = "center" + BOTTOM = "bottom" class HorizontalAlignmentEnum(Enum): """Enumeration object specifying the horizontal alignment preferences""" - LEFT = 'left' - CENTER = 'center' - RIGHT = 'right' - START = 'start' - END = 'end' + + LEFT = "left" + CENTER = "center" + RIGHT = "right" + START = "start" + END = "end" class Alignment: @@ -63,11 +66,7 @@ def __init__(self, horizontal, vertical): self.vertical = vertical def __hash__(self): - return hash( - hash(self.horizontal) * 83 - + hash(self.vertical) * 89 - + 97 - ) + return hash(hash(self.horizontal) * 83 + hash(self.vertical) * 89 + 97) def __eq__(self, other): return ( @@ -85,27 +84,26 @@ def serialized(self): return self.horizontal, self.vertical @classmethod - def from_horizontal_and_vertical_align(cls, text_align=None, - display_align=None): + def from_horizontal_and_vertical_align(cls, text_align=None, display_align=None): horizontal_obj = None vertical_obj = None - if text_align == 'left': + if text_align == "left": horizontal_obj = HorizontalAlignmentEnum.LEFT - if text_align == 'start': + if text_align == "start": horizontal_obj = HorizontalAlignmentEnum.START - if text_align == 'center': + if text_align == "center": horizontal_obj = HorizontalAlignmentEnum.CENTER - if text_align == 'right': + if text_align == "right": horizontal_obj = HorizontalAlignmentEnum.RIGHT - if text_align == 'end': + if text_align == "end": horizontal_obj = HorizontalAlignmentEnum.END - if display_align == 'before': + if display_align == "before": vertical_obj = VerticalAlignmentEnum.TOP - if display_align == 'center': + if display_align == "center": vertical_obj = VerticalAlignmentEnum.CENTER - if display_align == 'after': + if display_align == "after": vertical_obj = VerticalAlignmentEnum.BOTTOM if not any([horizontal_obj, vertical_obj]): @@ -125,7 +123,7 @@ def from_xml_attribute(cls, attribute): :type attribute: str """ - horizontal, vertical = attribute.split(' ') + horizontal, vertical = attribute.split(" ") horizontal = Size.from_string(horizontal) vertical = Size.from_string(vertical) @@ -146,8 +144,8 @@ def __init__(self, horizontal, vertical): """ for parameter in [horizontal, vertical]: if not isinstance(parameter, Size): - raise ValueError("Stretch must be initialized with two valid " - "Size objects.") + raise ValueError("Stretch must be initialized with two valid " + "Size objects.") self.horizontal = horizontal self.vertical = vertical @@ -158,18 +156,17 @@ def is_measured_in(self, measure_unit): :return: True/False """ return ( - self.horizontal.unit == measure_unit - and self.vertical.unit == measure_unit + self.horizontal.unit == measure_unit and self.vertical.unit == measure_unit ) def __repr__(self): - return f'' + return f"" def serialized(self): """Returns a tuple of the useful attributes of this object""" return ( None if not self.horizontal else self.horizontal.serialized(), - None if not self.vertical else self.vertical.serialized() + None if not self.vertical else self.vertical.serialized(), ) def __eq__(self, other): @@ -181,20 +178,16 @@ def __eq__(self, other): ) def __hash__(self): - return hash( - hash(self.horizontal) * 59 - + hash(self.vertical) * 61 - + 67 - ) + return hash(hash(self.horizontal) * 59 + hash(self.vertical) * 61 + 67) def __bool__(self): return True if self.horizontal or self.vertical else False def to_xml_attribute(self, **kwargs): """Returns a string representation of this object as an xml attribute""" - return '{horizontal} {vertical}'.format( + return "{horizontal} {vertical}".format( horizontal=self.horizontal.to_xml_attribute(), - vertical=self.vertical.to_xml_attribute() + vertical=self.vertical.to_xml_attribute(), ) def is_relative(self): @@ -215,7 +208,7 @@ def as_percentage_of(self, video_width, video_height): """ return Stretch( self.horizontal.as_percentage_of(video_width=video_width), - self.vertical.as_percentage_of(video_height=video_height) + self.vertical.as_percentage_of(video_height=video_height), ) @@ -256,7 +249,7 @@ def from_extent(cls, extent, origin): @property def extent(self): """How wide this rectangle stretches (horizontally and vertically)""" - if hasattr(self, '_extent'): + if hasattr(self, "_extent"): return self._extent else: return self._p1 - self._p2 @@ -264,7 +257,7 @@ def extent(self): @property def origin(self): """Out of its 4 points, returns the one closest to the origin""" - if hasattr(self, '_origin'): + if hasattr(self, "_origin"): return self._origin else: return Point.align_from_origin(self._p1, self._p2)[0] @@ -274,7 +267,7 @@ def origin(self): @property def lower_right_point(self): """The point furthest from the origin from the rectangle's 4 points""" - if hasattr(self, '_p2'): + if hasattr(self, "_p2"): return Point.align_from_origin(self._p1, self._p2)[1] else: return self.origin.add_extent(self.extent) @@ -288,11 +281,7 @@ def __eq__(self, other): ) def __hash__(self): - return hash( - hash(self.origin) * 71 - + hash(self.extent) * 73 - + 79 - ) + return hash(hash(self.origin) * 71 + hash(self.extent) * 73 + 79) class Point(TwoDimensionalObject): @@ -305,14 +294,13 @@ def __init__(self, x, y): """ for parameter in [x, y]: if not isinstance(parameter, Size): - raise ValueError("Point must be initialized with two valid " + raise ValueError("Point must be initialized with two valid " "Size objects.") self.x = x self.y = y def __sub__(self, other): - """Returns an Stretch object, if the other point's units are compatible - """ + """Returns an Stretch object, if the other point's units are compatible""" return Stretch(abs(self.x - other.x), abs(self.y - other.y)) def add_stretch(self, stretch): @@ -339,7 +327,7 @@ def as_percentage_of(self, video_width, video_height): """ return Point( self.x.as_percentage_of(video_width=video_width), - self.y.as_percentage_of(video_height=video_height) + self.y.as_percentage_of(video_height=video_height), ) @classmethod @@ -355,17 +343,19 @@ def align_from_origin(cls, p1, p2): if p1.x >= p2.x and p1.y >= p2.y: return p2 else: - return (Point(min(p1.x, p2.x), min(p1.y, p2.y)), - Point(max(p1.x, p2.x), max(p1.y, p2.y))) + return ( + Point(min(p1.x, p2.x), min(p1.y, p2.y)), + Point(max(p1.x, p2.x), max(p1.y, p2.y)), + ) def __repr__(self): - return f'' + return f"" def serialized(self): """Returns the "useful" values of this object.""" return ( None if not self.x else self.x.serialized(), - None if not self.y else self.y.serialized() + None if not self.y else self.y.serialized(), ) def __eq__(self, other): @@ -377,18 +367,14 @@ def __eq__(self, other): ) def __hash__(self): - return hash( - hash(self.x) * 51 - + hash(self.y) * 53 - + 57 - ) + return hash(hash(self.x) * 51 + hash(self.y) * 53 + 57) def __bool__(self): return True if self.x or self.y else False def to_xml_attribute(self, **kwargs): """Returns a string representation of this object as an xml attribute""" - return f'{self.x.to_xml_attribute()} {self.y.to_xml_attribute()}' + return f"{self.x.to_xml_attribute()} {self.y.to_xml_attribute()}" class Size: @@ -455,12 +441,14 @@ def as_percentage_of(self, video_width=None, video_height=None): # The input must be valid so that any conversion can be done if not (video_width or video_height): raise RelativizationError( - "At least one of video width or height" - " must be given as a reference") + "At least one of video width or height" + " must be given as a reference" + ) elif video_width and video_height: raise RelativizationError( "Only one of video width or height can be given as reference" - " per value being converted") + " per value being converted" + ) if unit == UnitEnum.EM: # TODO: Implement proper conversion of em in function of font-size @@ -504,13 +492,15 @@ def from_string(cls, string): """ size_pattern = re.compile( r"^(((?P\d+(\.\d+)?)(?P" - fr"{'|'.join([unit.value for unit in UnitEnum])}))|0)$") + rf"{'|'.join([unit.value for unit in UnitEnum])}))|0)$" + ) match = size_pattern.search(string) if not match: raise CaptionReadSyntaxError( f"Invalid size: {string}. Please make sure the provided value " "is a number followed by one of the supported units: " - f"{', '.join([unit.value for unit in UnitEnum])}.") + f"{', '.join([unit.value for unit in UnitEnum])}." + ) unit = match.group("unit") if unit: value = match.group("value") @@ -520,19 +510,18 @@ def from_string(cls, string): return cls(match.group(0), UnitEnum.PIXEL) def __repr__(self): - return f'' + return f"" def __str__(self): value = round(self.value, 2) if value.is_integer(): s = f"{int(value)}" else: - s = f"{value:.2f}".rstrip('0').rstrip('.') + s = f"{value:.2f}".rstrip("0").rstrip(".") return f"{s}{self.unit.value}" def to_xml_attribute(self, **kwargs): - """Returns a string representation of this object, as an xml attribute - """ + """Returns a string representation of this object, as an xml attribute""" return str(self) def serialized(self): @@ -548,11 +537,7 @@ def __eq__(self, other): ) def __hash__(self): - return hash( - hash(self.value) * 41 - + hash(self.unit) * 43 - + 47 - ) + return hash(hash(self.value) * 41 + hash(self.unit) * 43 + 47) def __bool__(self): return self.unit in UnitEnum and self.value is not None @@ -579,7 +564,7 @@ def __init__(self, before=None, after=None, start=None, end=None): self.start = start # left self.end = end # right - for attr in ['before', 'after', 'start', 'end']: + for attr in ["before", "after", "start", "end"]: # Ensure that a Padding object always explicitly defines all # four possible paddings if not isinstance(getattr(self, attr), Size): @@ -600,7 +585,7 @@ def from_xml_attribute(cls, attribute): :param attribute: a string like object, representing a dfxp attr. value :return: a Padding object """ - values_list = attribute.split(' ') + values_list = attribute.split(" ") sizes = [] for value in values_list: @@ -615,11 +600,13 @@ def from_xml_attribute(cls, attribute): elif len(sizes) == 4: return cls(sizes[0], sizes[2], sizes[3], sizes[1]) else: - raise ValueError(f'The provided value "{attribute}" could not be ' - "parsed into the a padding. Check out " - "http://www.w3.org/TR/ttaf1-dfxp/" - "#style-attribute-padding for the definition " - "and examples") + raise ValueError( + f'The provided value "{attribute}" could not be ' + "parsed into the a padding. Check out " + "http://www.w3.org/TR/ttaf1-dfxp/" + "#style-attribute-padding for the definition " + "and examples" + ) def __repr__(self): return ( @@ -633,7 +620,7 @@ def serialized(self): None if not self.before else self.before.serialized(), None if not self.after else self.after.serialized(), None if not self.start else self.start.serialized(), - None if not self.end else self.end.serialized() + None if not self.end else self.end.serialized(), ) def __eq__(self, other): @@ -656,8 +643,8 @@ def __hash__(self): ) def to_xml_attribute( - self, attribute_order=('before', 'end', 'after', 'start'), - **kwargs): + self, attribute_order=("before", "end", "after", "start"), **kwargs + ): """Returns a string representation of this object as an xml attribute TODO - should extend the attribute_order tuple to contain 4 tuples, @@ -671,22 +658,21 @@ def to_xml_attribute( string_list = [] for attrib in attribute_order: if hasattr(self, attrib): - string_list.append( - getattr(self, attrib).to_xml_attribute()) + string_list.append(getattr(self, attrib).to_xml_attribute()) except AttributeError: # A Padding object with attributes set to None is considered # invalid. All four possible paddings must be set. If one of them # is not, this error is raised. raise ValueError("The attribute order specified is invalid.") - return ' '.join(string_list) + return " ".join(string_list) def as_percentage_of(self, video_width, video_height): return Padding( self.before.as_percentage_of(video_height=video_height), self.after.as_percentage_of(video_height=video_height), self.start.as_percentage_of(video_width=video_width), - self.end.as_percentage_of(video_width=video_width) + self.end.as_percentage_of(video_width=video_width), ) def is_relative(self): @@ -710,8 +696,15 @@ class Layout: specific for each caption type. """ - def __init__(self, origin=None, extent=None, padding=None, alignment=None, - webvtt_positioning=None, inherit_from=None): + def __init__( + self, + origin=None, + extent=None, + padding=None, + alignment=None, + webvtt_positioning=None, + inherit_from=None, + ): """ :type origin: Point :param origin: The point on the screen which is the top left vertex @@ -745,16 +738,21 @@ def __init__(self, origin=None, extent=None, padding=None, alignment=None, self.webvtt_positioning = webvtt_positioning if inherit_from: - for attr_name in ['origin', 'extent', 'padding', 'alignment']: + for attr_name in ["origin", "extent", "padding", "alignment"]: attr = getattr(self, attr_name) if not attr: setattr(self, attr_name, getattr(inherit_from, attr_name)) def __bool__(self): - return any([ - self.origin, self.extent, self.padding, self.alignment, - self.webvtt_positioning - ]) + return any( + [ + self.origin, + self.extent, + self.padding, + self.alignment, + self.webvtt_positioning, + ] + ) def __repr__(self): return ( @@ -768,7 +766,7 @@ def serialized(self): None if not self.origin else self.origin.serialized(), None if not self.extent else self.extent.serialized(), None if not self.padding else self.padding.serialized(), - None if not self.alignment else self.alignment.serialized() + None if not self.alignment else self.alignment.serialized(), ) def __eq__(self, other): @@ -807,16 +805,15 @@ def is_relative(self): return is_relative def as_percentage_of(self, video_width, video_height): - params = {'alignment': self.alignment} + params = {"alignment": self.alignment} # We don't need to preserve webvtt_positioning on Layout # transformations because, if it is set, the WebVTT writer # returns as soon as it's found and the transformations are # never triggered. - for attr_name in ['origin', 'extent', 'padding']: + for attr_name in ["origin", "extent", "padding"]: attr = getattr(self, attr_name) if attr: - params[attr_name] = attr.as_percentage_of(video_width, - video_height) + params[attr_name] = attr.as_percentage_of(video_width, video_height) return Layout(**params) def fit_to_screen(self): @@ -853,8 +850,10 @@ def fit_to_screen(self): found_absolute_unit = True if found_absolute_unit: - raise ValueError("Units must be relativized before extent " - "can be calculated based on origin.") + raise ValueError( + "Units must be relativized before extent " + "can be calculated based on origin." + ) new_horizontal = self.extent.horizontal new_vertical = self.extent.vertical diff --git a/pycaption/microdvd.py b/pycaption/microdvd.py index 6c31ca15..e0564225 100644 --- a/pycaption/microdvd.py +++ b/pycaption/microdvd.py @@ -2,11 +2,18 @@ from copy import deepcopy from .base import ( - BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, DEFAULT_LANGUAGE_CODE, + BaseReader, + BaseWriter, + Caption, + CaptionList, + CaptionNode, + CaptionSet, ) from .exceptions import ( - CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionReadTimingError, + CaptionReadNoCaptions, + CaptionReadSyntaxError, + CaptionReadTimingError, InvalidInputError, ) @@ -17,7 +24,7 @@ def detect(self, content): def read(self, content, lang=DEFAULT_LANGUAGE_CODE): if not isinstance(content, str): - raise InvalidInputError('The content is not a unicode string.') + raise InvalidInputError("The content is not a unicode string.") lines = content.splitlines() captions = CaptionList() @@ -28,26 +35,24 @@ def read(self, content, lang=DEFAULT_LANGUAGE_CODE): m = re.match(r"{(\d+)}{(\d+)}(.*)", line) if not m: - raise CaptionReadSyntaxError( - "Line does not match expected format") + raise CaptionReadSyntaxError("Line does not match expected format") start, end, txt = m.groups() - if start == '0' and end == '0': + if start == "0" and end == "0": try: fps = float(txt) continue except ValueError: - raise CaptionReadTimingError( - 'FPS information is not provided') + raise CaptionReadTimingError("FPS information is not provided") caption_start = self._framestomicro(int(start), fps) caption_end = self._framestomicro(int(end), fps) nodes = [] - for line in txt.split('|'): + for line in txt.split("|"): # skip extra blank lines - if line != '': + if line != "": nodes.append(CaptionNode.create_text(line)) nodes.append(CaptionNode.create_break()) @@ -67,7 +72,7 @@ def read(self, content, lang=DEFAULT_LANGUAGE_CODE): return caption_set def _framestomicro(self, framenum, fps=25.0): - return int(framenum / fps * (10 ** 6)) + return int(framenum / fps * (10**6)) class MicroDVDWriter(BaseWriter): @@ -77,34 +82,32 @@ def write(self, caption_set): captions = [] for lang in caption_set.get_languages(): - captions.append( - self._recreate_lang(caption_set.get_captions(lang)) - ) + captions.append(self._recreate_lang(caption_set.get_captions(lang))) - return ''.join(captions) + return "".join(captions) def _microtoframes(self, micro, fps=25.0): - return int(micro * fps / (10 ** 6)) + return int(micro * fps / (10**6)) def _recreate_lang(self, captions): - sub = '' + sub = "" for caption in captions: start = self._microtoframes(caption.start) end = self._microtoframes(caption.end) - sub += f'{{{start}}}{{{end}}}' + sub += f"{{{start}}}{{{end}}}" - new_content = '' + new_content = "" for node in caption.nodes: new_content = self._recreate_line(new_content, node) # Eliminate excessive line breaks - new_content = new_content.strip() + '\n' - while '\n\n' in new_content: - new_content = new_content.replace('\n\n', '\n') + new_content = new_content.strip() + "\n" + while "\n\n" in new_content: + new_content = new_content.replace("\n\n", "\n") # Break unnecessary on last line - while '|\n' in new_content: - new_content = new_content.replace('|\n', '\n') + while "|\n" in new_content: + new_content = new_content.replace("|\n", "\n") sub += new_content @@ -114,6 +117,6 @@ def _recreate_line(self, sub, line): if line.type_ == CaptionNode.TEXT: return sub + line.content elif line.type_ == CaptionNode.BREAK: - return sub + '|' + return sub + "|" else: return sub diff --git a/pycaption/sami.py b/pycaption/sami.py index b310bbcc..69fcf8fe 100644 --- a/pycaption/sami.py +++ b/pycaption/sami.py @@ -36,37 +36,45 @@ """ import re -from xml.dom import SyntaxErr from collections import deque from copy import deepcopy from html.entities import name2codepoint from html.parser import HTMLParser from logging import FATAL +from xml.dom import SyntaxErr from xml.sax.saxutils import escape from bs4 import BeautifulSoup, NavigableString -from cssutils import parseString, log, css as cssutils_css +from cssutils import css as cssutils_css +from cssutils import log, parseString from .base import ( - BaseReader, BaseWriter, CaptionSet, CaptionList, Caption, CaptionNode, DEFAULT_LANGUAGE_CODE, + BaseReader, + BaseWriter, + Caption, + CaptionList, + CaptionNode, + CaptionSet, ) from .exceptions import ( - CaptionReadNoCaptions, CaptionReadSyntaxError, InvalidInputError, - CaptionReadTimingError + CaptionReadNoCaptions, + CaptionReadSyntaxError, + CaptionReadTimingError, + InvalidInputError, ) -from .geometry import Layout, Alignment, Padding, Size +from .geometry import Alignment, Layout, Padding, Size # change cssutils default logging log.setLevel(FATAL) -SAMI_BASE_MARKUP = ''' +SAMI_BASE_MARKUP = """