From 7f4bc5c36eb34ac642c740c81dd44dc7c02919b6 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sat, 29 Apr 2017 12:19:07 -0500 Subject: [PATCH] Update html2text to (2016, 9, 19). --- included_dependencies/html2text/__init__.py | 71 +++-- included_dependencies/html2text/cli.py | 299 ++++++++++++++++++++ included_dependencies/html2text/compat.py | 4 + included_dependencies/html2text/config.py | 8 + included_dependencies/html2text/utils.py | 56 +++- 5 files changed, 412 insertions(+), 26 deletions(-) create mode 100644 included_dependencies/html2text/cli.py diff --git a/included_dependencies/html2text/__init__.py b/included_dependencies/html2text/__init__.py index e7b88be3..c6ed1e10 100644 --- a/included_dependencies/html2text/__init__.py +++ b/included_dependencies/html2text/__init__.py @@ -4,14 +4,13 @@ from __future__ import division import re import sys -import cgi try: from textwrap import wrap except ImportError: # pragma: no cover pass -from html2text.compat import urlparse, HTMLParser +from html2text.compat import urlparse, HTMLParser, html_escape from html2text import config from html2text.utils import ( @@ -27,10 +26,11 @@ from html2text.utils import ( list_numbering_start, dumb_css_parser, escape_md_section, - skipwrap + skipwrap, + pad_tables_in_text ) -__version__ = (2016, 4, 2) +__version__ = (2016, 9, 19) # TODO: @@ -68,6 +68,7 @@ class HTML2Text(HTMLParser.HTMLParser): self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.ignore_tables = config.IGNORE_TABLES # covered in cli self.google_doc = False # covered in cli self.ul_item_mark = '*' # covered in cli self.emphasis_mark = '_' # covered in cli @@ -77,6 +78,8 @@ class HTML2Text(HTMLParser.HTMLParser): self.hide_strikethrough = False # covered in cli self.mark_code = config.MARK_CODE self.wrap_links = config.WRAP_LINKS # covered in cli + self.pad_tables = config.PAD_TABLES # covered in cli + self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli self.tag_callback = None if out is None: # pragma: no cover @@ -130,7 +133,11 @@ class HTML2Text(HTMLParser.HTMLParser): def handle(self, data): self.feed(data) self.feed("") - return self.optwrap(self.close()) + markdown = self.optwrap(self.close()) + if self.pad_tables: + return pad_tables_in_text(markdown) + else: + return markdown def outtextf(self, s): self.outtextlist.append(s) @@ -142,23 +149,20 @@ class HTML2Text(HTMLParser.HTMLParser): try: nochr = unicode('') + unicode_character = unichr except NameError: nochr = str('') + unicode_character = chr self.pbr() self.o('', 0, 'end') outtext = nochr.join(self.outtextlist) + if self.unicode_snob: - try: - nbsp = unichr(name2cp('nbsp')) - except NameError: - nbsp = chr(name2cp('nbsp')) + nbsp = unicode_character(name2cp('nbsp')) else: - try: - nbsp = unichr(32) - except NameError: - nbsp = chr(32) + nbsp = unicode_character(32) try: outtext = outtext.replace(unicode(' _place_holder;'), nbsp) except NameError: @@ -173,14 +177,14 @@ class HTML2Text(HTMLParser.HTMLParser): def handle_charref(self, c): charref = self.charref(c) if not self.code and not self.pre: - charref = cgi.escape(charref) + charref = html_escape(charref) self.handle_data(charref, True) def handle_entityref(self, c): entityref = self.entityref(c) if (not self.code and not self.pre and entityref != ' _place_holder;'): - entityref = cgi.escape(entityref) + entityref = html_escape(entityref) self.handle_data(entityref, True) def handle_starttag(self, tag, attrs): @@ -331,7 +335,10 @@ class HTML2Text(HTMLParser.HTMLParser): self.p() if tag == "br" and start: - self.o(" \n") + if self.blockquote > 0: + self.o(" \n> ") + else: + self.o(" \n") if tag == "hr" and start: self.p() @@ -439,7 +446,7 @@ class HTML2Text(HTMLParser.HTMLParser): if 'src' in attrs: if not self.images_to_alt: attrs['href'] = attrs['src'] - alt = attrs.get('alt') or '' + alt = attrs.get('alt') or self.default_image_alt # If we have images_with_size, write raw html including width, # height, and alt attributes @@ -541,7 +548,16 @@ class HTML2Text(HTMLParser.HTMLParser): self.start = 1 if tag in ["table", "tr", "td", "th"]: - if self.bypass_tables: + if self.ignore_tables: + if tag == 'tr': + if start: + pass + else: + self.soft_br() + else: + pass + + elif self.bypass_tables: if start: self.soft_br() if tag in ["td", "th"]: @@ -556,8 +572,16 @@ class HTML2Text(HTMLParser.HTMLParser): self.o(''.format(tag)) else: - if tag == "table" and start: - self.table_start = True + if tag == "table": + if start: + self.table_start = True + if self.pad_tables: + self.o("<"+config.TABLE_MARKER_FOR_PAD+">") + self.o(" \n") + else: + if self.pad_tables: + self.o("") + self.o(" \n") if tag in ["td", "th"] and start: if self.split_next_td: self.o("| ") @@ -707,9 +731,6 @@ class HTML2Text(HTMLParser.HTMLParser): self.outcount += 1 def handle_data(self, data, entity_char=False): - if r'\/script>' in data: - self.quiet -= 1 - if self.style: self.style_def.update(dumb_css_parser(data)) @@ -814,7 +835,9 @@ class HTML2Text(HTMLParser.HTMLParser): for para in text.split("\n"): if len(para) > 0: if not skipwrap(para, self.wrap_links): - result += "\n".join(wrap(para, self.body_width)) + result += "\n".join( + wrap(para, self.body_width, break_long_words=False) + ) if para.endswith(' '): result += " \n" newlines = 1 diff --git a/included_dependencies/html2text/cli.py b/included_dependencies/html2text/cli.py new file mode 100644 index 00000000..c9357879 --- /dev/null +++ b/included_dependencies/html2text/cli.py @@ -0,0 +1,299 @@ +import optparse +import warnings + +from html2text.compat import urllib +from html2text import HTML2Text, config, __version__ +from html2text.utils import wrapwrite, wrap_read + + +def main(): + baseurl = '' + + class bcolors: # pragma: no cover + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + p = optparse.OptionParser( + '%prog [(filename|url) [encoding]]', + version='%prog ' + ".".join(map(str, __version__)) + ) + p.add_option( + "--default-image-alt", + dest="default_image_alt", + action="store", + type="str", + default=config.DEFAULT_IMAGE_ALT, + help="The default alt string for images with missing ones") + p.add_option( + "--pad-tables", + dest="pad_tables", + action="store_true", + default=config.PAD_TABLES, + help="pad the cells to equal column width in tables" + ) + p.add_option( + "--no-wrap-links", + dest="wrap_links", + action="store_false", + default=config.WRAP_LINKS, + help="wrap links during conversion" + ) + p.add_option( + "--ignore-emphasis", + dest="ignore_emphasis", + action="store_true", + default=config.IGNORE_EMPHASIS, + help="don't include any formatting for emphasis" + ) + p.add_option( + "--reference-links", + dest="inline_links", + action="store_false", + default=config.INLINE_LINKS, + help="use reference style links instead of inline links" + ) + p.add_option( + "--ignore-links", + dest="ignore_links", + action="store_true", + default=config.IGNORE_ANCHORS, + help="don't include any formatting for links") + p.add_option( + "--protect-links", + dest="protect_links", + action="store_true", + default=config.PROTECT_LINKS, + help=("protect links from line breaks surrounding them " + + "with angle brackets")) + p.add_option( + "--ignore-images", + dest="ignore_images", + action="store_true", + default=config.IGNORE_IMAGES, + help="don't include any formatting for images" + ) + p.add_option( + "--images-to-alt", + dest="images_to_alt", + action="store_true", + default=config.IMAGES_TO_ALT, + help="Discard image data, only keep alt text" + ) + p.add_option( + "--images-with-size", + dest="images_with_size", + action="store_true", + default=config.IMAGES_WITH_SIZE, + help="Write image tags with height and width attrs as raw html to " + "retain dimensions" + ) + p.add_option( + "-g", "--google-doc", + action="store_true", + dest="google_doc", + default=False, + help="convert an html-exported Google Document" + ) + p.add_option( + "-d", "--dash-unordered-list", + action="store_true", + dest="ul_style_dash", + default=False, + help="use a dash rather than a star for unordered list items" + ) + p.add_option( + "-e", "--asterisk-emphasis", + action="store_true", + dest="em_style_asterisk", + default=False, + help="use an asterisk rather than an underscore for emphasized text" + ) + p.add_option( + "-b", "--body-width", + dest="body_width", + action="store", + type="int", + default=config.BODY_WIDTH, + help="number of characters per output line, 0 for no wrap" + ) + p.add_option( + "-i", "--google-list-indent", + dest="list_indent", + action="store", + type="int", + default=config.GOOGLE_LIST_INDENT, + help="number of pixels Google indents nested lists" + ) + p.add_option( + "-s", "--hide-strikethrough", + action="store_true", + dest="hide_strikethrough", + default=False, + help="hide strike-through text. only relevant when -g is " + "specified as well" + ) + p.add_option( + "--escape-all", + action="store_true", + dest="escape_snob", + default=False, + help="Escape all special characters. Output is less readable, but " + "avoids corner case formatting issues." + ) + p.add_option( + "--bypass-tables", + action="store_true", + dest="bypass_tables", + default=config.BYPASS_TABLES, + help="Format tables in HTML rather than Markdown syntax." + ) + p.add_option( + "--ignore-tables", + action="store_true", + dest="ignore_tables", + default=config.IGNORE_TABLES, + help="Ignore table-related tags (table, th, td, tr) while keeping rows." + ) + p.add_option( + "--single-line-break", + action="store_true", + dest="single_line_break", + default=config.SINGLE_LINE_BREAK, + help=( + "Use a single line break after a block element rather than two " + "line breaks. NOTE: Requires --body-width=0" + ) + ) + p.add_option( + "--unicode-snob", + action="store_true", + dest="unicode_snob", + default=config.UNICODE_SNOB, + help="Use unicode throughout document" + ) + p.add_option( + "--no-automatic-links", + action="store_false", + dest="use_automatic_links", + default=config.USE_AUTOMATIC_LINKS, + help="Do not use automatic links wherever applicable" + ) + p.add_option( + "--no-skip-internal-links", + action="store_false", + dest="skip_internal_links", + default=config.SKIP_INTERNAL_LINKS, + help="Do not skip internal links" + ) + p.add_option( + "--links-after-para", + action="store_true", + dest="links_each_paragraph", + default=config.LINKS_EACH_PARAGRAPH, + help="Put links after each paragraph instead of document" + ) + p.add_option( + "--mark-code", + action="store_true", + dest="mark_code", + default=config.MARK_CODE, + help="Mark program code blocks with [code]...[/code]" + ) + p.add_option( + "--decode-errors", + dest="decode_errors", + action="store", + type="string", + default=config.DECODE_ERRORS, + help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values" + ) + (options, args) = p.parse_args() + + # process input + encoding = "utf-8" + if len(args) == 2: + encoding = args[1] + elif len(args) > 2: + p.error('Too many arguments') + + if len(args) > 0 and args[0] != '-': # pragma: no cover + file_ = args[0] + + if file_.startswith('http://') or file_.startswith('https://'): + warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)", + DeprecationWarning) + baseurl = file_ + j = urllib.urlopen(baseurl) + data = j.read() + if encoding is None: + try: + from feedparser import _getCharacterEncoding as enc + except ImportError: + enc = lambda x, y: ('utf-8', 1) + encoding = enc(j.headers, data)[0] + if encoding == 'us-ascii': + encoding = 'utf-8' + else: + data = open(file_, 'rb').read() + if encoding is None: + try: + from chardet import detect + except ImportError: + detect = lambda x: {'encoding': 'utf-8'} + encoding = detect(data)['encoding'] + else: + data = wrap_read() + + if hasattr(data, 'decode'): + try: + try: + data = data.decode(encoding, errors=options.decode_errors) + except TypeError: + # python 2.6.x does not have the errors option + data = data.decode(encoding) + except UnicodeDecodeError as err: + warning = bcolors.WARNING + "Warning:" + bcolors.ENDC + warning += ' Use the ' + bcolors.OKGREEN + warning += '--decode-errors=ignore' + bcolors.ENDC + 'flag.' + print(warning) + raise err + + h = HTML2Text(baseurl=baseurl) + # handle options + if options.ul_style_dash: + h.ul_item_mark = '-' + if options.em_style_asterisk: + h.emphasis_mark = '*' + h.strong_mark = '__' + + h.body_width = options.body_width + h.google_list_indent = options.list_indent + h.ignore_emphasis = options.ignore_emphasis + h.ignore_links = options.ignore_links + h.protect_links = options.protect_links + h.ignore_images = options.ignore_images + h.images_to_alt = options.images_to_alt + h.images_with_size = options.images_with_size + h.google_doc = options.google_doc + h.hide_strikethrough = options.hide_strikethrough + h.escape_snob = options.escape_snob + h.bypass_tables = options.bypass_tables + h.ignore_tables = options.ignore_tables + h.single_line_break = options.single_line_break + h.inline_links = options.inline_links + h.unicode_snob = options.unicode_snob + h.use_automatic_links = options.use_automatic_links + h.skip_internal_links = options.skip_internal_links + h.links_each_paragraph = options.links_each_paragraph + h.mark_code = options.mark_code + h.wrap_links = options.wrap_links + h.pad_tables = options.pad_tables + h.default_image_alt = options.default_image_alt + + wrapwrite(h.handle(data)) diff --git a/included_dependencies/html2text/compat.py b/included_dependencies/html2text/compat.py index 2120a41b..60907abf 100644 --- a/included_dependencies/html2text/compat.py +++ b/included_dependencies/html2text/compat.py @@ -6,8 +6,12 @@ if sys.version_info[0] == 2: import urlparse import HTMLParser import urllib + from cgi import escape as html_escape else: import urllib.parse as urlparse import html.entities as htmlentitydefs import html.parser as HTMLParser import urllib.request as urllib + from html import escape + def html_escape(s): + return escape(s, quote=False) diff --git a/included_dependencies/html2text/config.py b/included_dependencies/html2text/config.py index 85bf47dc..48e17845 100644 --- a/included_dependencies/html2text/config.py +++ b/included_dependencies/html2text/config.py @@ -3,6 +3,8 @@ import re # Use Unicode characters instead of their ascii psuedo-replacements UNICODE_SNOB = 0 +# Marker to use for marking tables for padding post processing +TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding" # Escape all special characters. Output is less readable, but avoids # corner case formatting issues. ESCAPE_SNOB = 0 @@ -36,6 +38,8 @@ IMAGES_WITH_SIZE = False IGNORE_EMPHASIS = False MARK_CODE = False DECODE_ERRORS = 'strict' +DEFAULT_IMAGE_ALT = '' +PAD_TABLES = False # Convert links with same href and text to format if they are absolute links USE_AUTOMATIC_LINKS = True @@ -116,7 +120,11 @@ UNIFIABLE = { 'rlm': '' } +# Format tables in HTML rather than Markdown syntax BYPASS_TABLES = False +# Ignore table-related tags (table, th, td, tr) while keeping rows +IGNORE_TABLES = False + # Use a single line break after a block element rather an two line breaks. # NOTE: Requires body width setting to be 0. diff --git a/included_dependencies/html2text/utils.py b/included_dependencies/html2text/utils.py index 418c89cb..bd6fc634 100644 --- a/included_dependencies/html2text/utils.py +++ b/included_dependencies/html2text/utils.py @@ -31,7 +31,7 @@ def dumb_property_dict(style): """ :returns: A hash of css attributes """ - out = dict([(x.strip(), y.strip()) for x, y in + out = dict([(x.strip().lower(), y.strip().lower()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z ] @@ -149,7 +149,7 @@ def google_fixed_width_font(style): font_family = '' if 'font-family' in style: font_family = style['font-family'] - if 'Courier New' == font_family or 'Consolas' == font_family: + if 'courier new' == font_family or 'consolas' == font_family: return True return False @@ -244,3 +244,55 @@ def escape_md_section(text, snob=False): text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) return text + +def reformat_table(lines, right_margin): + """ + Given the lines of a table + padds the cells and returns the new lines + """ + # find the maximum width of the columns + max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + max_width = [max(len(x) + right_margin, old_len) + for x, old_len in zip(cols, max_width)] + + # reformat + new_lines = [] + for line in lines: + cols = [x.rstrip() for x in line.split('|')] + if set(line.strip()) == set('-|'): + filler = '-' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + else: + filler = ' ' + new_cols = [x.rstrip() + (filler * (M - len(x.rstrip()))) + for x, M in zip(cols, max_width)] + new_lines.append('|'.join(new_cols)) + return new_lines + +def pad_tables_in_text(text, right_margin=1): + """ + Provide padding for tables in the text + """ + lines = text.split('\n') + table_buffer, altered_lines, table_widths, table_started = [], [], [], False + new_lines = [] + for line in lines: + # Toogle table started + if (config.TABLE_MARKER_FOR_PAD in line): + table_started = not table_started + if not table_started: + table = reformat_table(table_buffer, right_margin) + new_lines.extend(table) + table_buffer = [] + new_lines.append('') + continue + # Process lines + if table_started: + table_buffer.append(line) + else: + new_lines.append(line) + new_text = '\n'.join(new_lines) + return new_text