import sys

from html2text import config
from html2text.compat import htmlentitydefs


def name2cp(k):
    """Return the Unicode code point for the HTML entity name ``k``.

    ``apos`` is handled explicitly before the lookup in
    ``htmlentitydefs.name2codepoint``; any other unknown name raises
    ``KeyError``.
    """
    if k == 'apos':
        return ord("'")
    return htmlentitydefs.name2codepoint[k]


# Same mapping as config.UNIFIABLE, but keyed by code point instead of by
# entity name.
unifiable_n = dict(
    (name2cp(name), replacement)
    for name, replacement in config.UNIFIABLE.items()
)


def hn(tag):
    """Return the heading level of ``tag``, or 0 if it is not a heading.

    A heading tag is exactly two characters: ``h`` followed by a digit from
    1 to 9.  Every other input (including the empty string) yields 0, so the
    result is always an int and is falsy for non-headings.
    """
    if len(tag) == 2 and tag[0] == 'h':
        try:
            n = int(tag[1])
        except ValueError:
            return 0
        if 1 <= n <= 9:
            return n
    return 0


def dumb_property_dict(style):
    """
    :returns: A hash of css attributes
    """
    # Only declarations containing a ':' are kept; names and values are
    # stripped and lower-cased.  Later duplicates override earlier ones.
    declarations = (
        decl.split(':', 1) for decl in style.split(';') if ':' in decl
    )
    return dict(
        (name.strip().lower(), value.strip().lower())
        for name, value in declarations
    )


def dumb_css_parser(data):
    """
    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    :rtype: dict
    """
    # remove @import sentences; the appended ';' guarantees that every
    # '@import' has a terminating ';' to cut at
    data += ';'
    import_index = data.find('@import')
    while import_index != -1:
        data = data[:import_index] + data[data.find(';', import_index) + 1:]
        import_index = data.find('@import')

    # parse the css. reverted from dictionary comprehension in order to
    # support older pythons
    rules = [x.split('{') for x in data.split('}') if '{' in x.strip()]
    try:
        return dict(
            [(selector.strip(), dumb_property_dict(body))
             for selector, body in rules]
        )
    except ValueError:  # pragma: no cover
        # A rule with more than one '{' cannot be unpacked into
        # (selector, body); not that important, so give up on the sheet.
        return {}


def element_style(attrs, style_def, parent_style):
    """
    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    # Precedence, lowest to highest: inherited style, class rules (in the
    # order the classes are listed), inline 'style' attribute.
    style = parent_style.copy()
    if 'class' in attrs:
        for css_class in attrs['class'].split():
            style.update(style_def.get('.' + css_class, {}))
    if 'style' in attrs:
        style.update(dumb_property_dict(attrs['style']))
    return style


def google_list_style(style):
    """
    Finds out whether this is an ordered or unordered list

    :type style: dict

    :rtype: str
    """
    if 'list-style-type' in style:
        list_style = style['list-style-type']
        if list_style in ['disc', 'circle', 'square', 'none']:
            return 'ul'
    return 'ol'


def google_has_height(style):
    """
    Check if the style of the element has the 'height' attribute
    explicitly defined

    :type style: dict

    :rtype: bool
    """
    return 'height' in style


def google_text_emphasis(style):
    """
    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    # The order of the properties determines the order of the result.
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[prop] for prop in emphasis_properties if prop in style]


def google_fixed_width_font(style):
    """
    Check if the css of the current element defines a fixed width font

    :type style: dict

    :rtype: bool
    """
    font_family = ''
    if 'font-family' in style:
        font_family = style['font-family']
    return font_family == 'courier new' or font_family == 'consolas'


def list_numbering_start(attrs):
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :returns: The value of the 'start' attribute minus one, or 0 when the
        attribute is missing or is not an integer
    :rtype: int
    """
    if 'start' in attrs:
        try:
            return int(attrs['start']) - 1
        except ValueError:
            pass
    return 0


def skipwrap(para, wrap_links):
    """
    Decide whether a paragraph must be left unwrapped

    :type para: str
    :type wrap_links: bool

    :returns: True if the paragraph should not be wrapped
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True

    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap.  startswith() is also safe for an empty paragraph.
    if para.startswith(('    ', '\t')):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a -inside- case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ('-', '*') and stripped[0:2] != '**':
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
            config.RE_UNORDERED_LIST_MATCHER.match(stripped):
        return True

    return False


def wrapwrite(text):
    """
    Write ``text`` to standard output encoded as UTF-8

    :type text: str
    """
    text = text.encode('utf-8')
    try:  # Python3
        sys.stdout.buffer.write(text)
    except AttributeError:
        # No binary buffer on this stdout object; write to it directly.
        sys.stdout.write(text)


def wrap_read():  # pragma: no cover
    """
    Read all of standard input

    :rtype: str
    """
    try:
        return sys.stdin.read()
    except AttributeError:
        return sys.stdin.buffer.read()


def escape_md(text):
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.
    """
    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)


def escape_md_section(text, snob=False):
    """
    Escapes markdown-sensitive characters across whole document sections.

    When ``snob`` is true, everything matched by
    ``config.RE_MD_CHARS_MATCHER_ALL`` is escaped as well.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

    return text


def reformat_table(lines, right_margin):
    """
    Given the lines of a table
    pads the cells and returns the new lines

    :type lines: list
    :type right_margin: int

    :returns: The padded lines; an empty list for an empty table
    :rtype: list
    """
    if not lines:
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split('|')]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [''] * (max_cols - num_cols)
        elif max_cols < num_cols:
            max_width += [
                len(x) + right_margin
                for x in cols[-(num_cols - max_cols):]
            ]
            max_cols = num_cols

        max_width = [max(len(x) + right_margin, old_len)
                     for x, old_len in zip(cols, max_width)]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split('|')]
        # A line made up of only '-' and '|' is a separator row and is
        # padded with dashes; every other row is padded with spaces.
        filler = '-' if set(line.strip()) == set('-|') else ' '
        new_cols = [x + filler * (width - len(x))
                    for x, width in zip(cols, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines


def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle table mode and
    are themselves removed; the lines collected between an opening and a
    closing marker are padded with ``reformat_table``.

    :type text: str
    :type right_margin: int

    :rtype: str
    """
    table_buffer, table_started = [], False
    new_lines = []
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # An opening marker without a closing one must not swallow the rows
    # collected so far; emit them padded instead of dropping them.
    if table_buffer:
        new_lines.extend(reformat_table(table_buffer, right_margin))

    return '\n'.join(new_lines)