Update html2text to (2016, 9, 19).

2025-12-06 08:52:55 +01:00 · 2017-04-29 12:19:07 -05:00 · 2017-04-29 12:19:07 -05:00 · 7f4bc5c36e
commit 7f4bc5c36e
parent db7777b161
5 changed files with 412 additions and 26 deletions
--- a/included_dependencies/html2text/init.py
+++ b/included_dependencies/html2text/init.py
@ -4,14 +4,13 @@
 from __future__ import division
 import re
 import sys
-import cgi

 try:
    from textwrap import wrap
 except ImportError:  # pragma: no cover
    pass

-from html2text.compat import urlparse, HTMLParser
+from html2text.compat import urlparse, HTMLParser, html_escape
 from html2text import config

 from html2text.utils import (
@ -27,10 +26,11 @@ from html2text.utils import (
    list_numbering_start,
    dumb_css_parser,
    escape_md_section,
-    skipwrap
+    skipwrap,
+    pad_tables_in_text
 )

-__version__ = (2016, 4, 2)
+__version__ = (2016, 9, 19)


 # TODO:
@ -68,6 +68,7 @@ class HTML2Text(HTMLParser.HTMLParser):
        self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
        self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
        self.bypass_tables = config.BYPASS_TABLES  # covered in cli
+        self.ignore_tables = config.IGNORE_TABLES  # covered in cli
        self.google_doc = False  # covered in cli
        self.ul_item_mark = '*'  # covered in cli
        self.emphasis_mark = '_'  # covered in cli
@ -77,6 +78,8 @@ class HTML2Text(HTMLParser.HTMLParser):
        self.hide_strikethrough = False  # covered in cli
        self.mark_code = config.MARK_CODE
        self.wrap_links = config.WRAP_LINKS  # covered in cli
+        self.pad_tables = config.PAD_TABLES  # covered in cli
+        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
        self.tag_callback = None

        if out is None:  # pragma: no cover
@ -130,7 +133,11 @@ class HTML2Text(HTMLParser.HTMLParser):
    def handle(self, data):
        self.feed(data)
        self.feed("")
-        return self.optwrap(self.close())
+        markdown = self.optwrap(self.close())
+        if self.pad_tables:
+            return pad_tables_in_text(markdown)
+        else:
+            return markdown

    def outtextf(self, s):
        self.outtextlist.append(s)
@ -142,23 +149,20 @@ class HTML2Text(HTMLParser.HTMLParser):

        try:
            nochr = unicode('')
+            unicode_character = unichr
        except NameError:
            nochr = str('')
+            unicode_character = chr

        self.pbr()
        self.o('', 0, 'end')

        outtext = nochr.join(self.outtextlist)
+
        if self.unicode_snob:
-            try:
-                nbsp = unichr(name2cp('nbsp'))
-            except NameError:
-                nbsp = chr(name2cp('nbsp'))
+            nbsp = unicode_character(name2cp('nbsp'))
        else:
-            try:
-                nbsp = unichr(32)
-            except NameError:
-                nbsp = chr(32)
+            nbsp = unicode_character(32)
        try:
            outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
        except NameError:
@ -173,14 +177,14 @@ class HTML2Text(HTMLParser.HTMLParser):
    def handle_charref(self, c):
        charref = self.charref(c)
        if not self.code and not self.pre:
-            charref = cgi.escape(charref)
+            charref = html_escape(charref)
        self.handle_data(charref, True)

    def handle_entityref(self, c):
        entityref = self.entityref(c)
        if (not self.code and not self.pre
                and entityref != '&nbsp_place_holder;'):
-            entityref = cgi.escape(entityref)
+            entityref = html_escape(entityref)
        self.handle_data(entityref, True)

    def handle_starttag(self, tag, attrs):
@ -331,6 +335,9 @@ class HTML2Text(HTMLParser.HTMLParser):
                self.p()

        if tag == "br" and start:
+            if self.blockquote > 0:
+                self.o("  \n> ")
+            else:
                self.o("  \n")

        if tag == "hr" and start:
@ -439,7 +446,7 @@ class HTML2Text(HTMLParser.HTMLParser):
            if 'src' in attrs:
                if not self.images_to_alt:
                    attrs['href'] = attrs['src']
-                alt = attrs.get('alt') or ''
+                alt = attrs.get('alt') or self.default_image_alt

                # If we have images_with_size, write raw html including width,
                # height, and alt attributes
@ -541,7 +548,16 @@ class HTML2Text(HTMLParser.HTMLParser):
                self.start = 1

        if tag in ["table", "tr", "td", "th"]:
-            if self.bypass_tables:
+            if self.ignore_tables:
+                if tag == 'tr':
+                    if start:
+                        pass
+                    else:
+                        self.soft_br()
+                else:
+                    pass
+
+            elif self.bypass_tables:
                if start:
                    self.soft_br()
                if tag in ["td", "th"]:
@ -556,8 +572,16 @@ class HTML2Text(HTMLParser.HTMLParser):
                        self.o('</{0}>'.format(tag))

            else:
-                if tag == "table" and start:
+                if tag == "table":
+                    if start:
                        self.table_start = True
+                        if self.pad_tables:
+                            self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
+                            self.o("  \n")
+                    else:
+                        if self.pad_tables:
+                            self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
+                            self.o("  \n")
                if tag in ["td", "th"] and start:
                    if self.split_next_td:
                        self.o("| ")
@ -707,9 +731,6 @@ class HTML2Text(HTMLParser.HTMLParser):
            self.outcount += 1

    def handle_data(self, data, entity_char=False):
-        if r'\/script>' in data:
-            self.quiet -= 1
-
        if self.style:
            self.style_def.update(dumb_css_parser(data))

@ -814,7 +835,9 @@ class HTML2Text(HTMLParser.HTMLParser):
        for para in text.split("\n"):
            if len(para) > 0:
                if not skipwrap(para, self.wrap_links):
-                    result += "\n".join(wrap(para, self.body_width))
+                    result += "\n".join(
+                        wrap(para, self.body_width, break_long_words=False)
+                    )
                    if para.endswith('  '):
                        result += "  \n"
                        newlines = 1
--- a/included_dependencies/html2text/cli.py
+++ b/included_dependencies/html2text/cli.py
@ -0,0 +1,299 @@
+import optparse
+import warnings
+
+from html2text.compat import urllib
+from html2text import HTML2Text, config, __version__
+from html2text.utils import wrapwrite, wrap_read
+
+
+def main():
+    """
+    Command-line entry point: convert an HTML document to text.
+
+    Parses the command line with optparse, obtains the HTML from a file, an
+    http(s) URL, or wrap_read() when no filename is given (or it is '-'),
+    decodes it, copies every parsed option onto an HTML2Text instance and
+    hands the converted text to wrapwrite().
+
+    Positional arguments are ``[(filename|url) [encoding]]``; more than two
+    is reported through the option parser's error().
+
+    :raises UnicodeDecodeError: re-raised after printing a hint when the
+        input cannot be decoded with the selected encoding.
+    """
+    baseurl = ''
+
+    class bcolors:  # pragma: no cover
+        # ANSI escape sequences used to colour the decode-error hint.
+        HEADER = '\033[95m'
+        OKBLUE = '\033[94m'
+        OKGREEN = '\033[92m'
+        WARNING = '\033[93m'
+        FAIL = '\033[91m'
+        ENDC = '\033[0m'
+        BOLD = '\033[1m'
+        UNDERLINE = '\033[4m'
+
+    p = optparse.OptionParser(
+        '%prog [(filename|url) [encoding]]',
+        version='%prog ' + ".".join(map(str, __version__))
+    )
+    p.add_option(
+        "--default-image-alt",
+        dest="default_image_alt",
+        action="store",
+        type="str",
+        default=config.DEFAULT_IMAGE_ALT,
+        help="The default alt string for images with missing ones")
+    p.add_option(
+        "--pad-tables",
+        dest="pad_tables",
+        action="store_true",
+        default=config.PAD_TABLES,
+        help="pad the cells to equal column width in tables"
+    )
+    p.add_option(
+        "--no-wrap-links",
+        dest="wrap_links",
+        action="store_false",
+        default=config.WRAP_LINKS,
+        # The flag stores False into wrap_links, so the help must say so.
+        help="don't wrap links during conversion"
+    )
+    p.add_option(
+        "--ignore-emphasis",
+        dest="ignore_emphasis",
+        action="store_true",
+        default=config.IGNORE_EMPHASIS,
+        help="don't include any formatting for emphasis"
+    )
+    p.add_option(
+        "--reference-links",
+        dest="inline_links",
+        action="store_false",
+        default=config.INLINE_LINKS,
+        help="use reference style links instead of inline links"
+    )
+    p.add_option(
+        "--ignore-links",
+        dest="ignore_links",
+        action="store_true",
+        default=config.IGNORE_ANCHORS,
+        help="don't include any formatting for links")
+    p.add_option(
+        "--protect-links",
+        dest="protect_links",
+        action="store_true",
+        default=config.PROTECT_LINKS,
+        help=("protect links from line breaks surrounding them " +
+              "with angle brackets"))
+    p.add_option(
+        "--ignore-images",
+        dest="ignore_images",
+        action="store_true",
+        default=config.IGNORE_IMAGES,
+        help="don't include any formatting for images"
+    )
+    p.add_option(
+        "--images-to-alt",
+        dest="images_to_alt",
+        action="store_true",
+        default=config.IMAGES_TO_ALT,
+        help="Discard image data, only keep alt text"
+    )
+    p.add_option(
+        "--images-with-size",
+        dest="images_with_size",
+        action="store_true",
+        default=config.IMAGES_WITH_SIZE,
+        help="Write image tags with height and width attrs as raw html to "
+             "retain dimensions"
+    )
+    p.add_option(
+        "-g", "--google-doc",
+        action="store_true",
+        dest="google_doc",
+        default=False,
+        help="convert an html-exported Google Document"
+    )
+    p.add_option(
+        "-d", "--dash-unordered-list",
+        action="store_true",
+        dest="ul_style_dash",
+        default=False,
+        help="use a dash rather than a star for unordered list items"
+    )
+    p.add_option(
+        "-e", "--asterisk-emphasis",
+        action="store_true",
+        dest="em_style_asterisk",
+        default=False,
+        help="use an asterisk rather than an underscore for emphasized text"
+    )
+    p.add_option(
+        "-b", "--body-width",
+        dest="body_width",
+        action="store",
+        type="int",
+        default=config.BODY_WIDTH,
+        help="number of characters per output line, 0 for no wrap"
+    )
+    p.add_option(
+        "-i", "--google-list-indent",
+        dest="list_indent",
+        action="store",
+        type="int",
+        default=config.GOOGLE_LIST_INDENT,
+        help="number of pixels Google indents nested lists"
+    )
+    p.add_option(
+        "-s", "--hide-strikethrough",
+        action="store_true",
+        dest="hide_strikethrough",
+        default=False,
+        help="hide strike-through text. only relevant when -g is "
+             "specified as well"
+    )
+    p.add_option(
+        "--escape-all",
+        action="store_true",
+        dest="escape_snob",
+        default=False,
+        help="Escape all special characters.  Output is less readable, but "
+             "avoids corner case formatting issues."
+    )
+    p.add_option(
+        "--bypass-tables",
+        action="store_true",
+        dest="bypass_tables",
+        default=config.BYPASS_TABLES,
+        help="Format tables in HTML rather than Markdown syntax."
+    )
+    p.add_option(
+        "--ignore-tables",
+        action="store_true",
+        dest="ignore_tables",
+        default=config.IGNORE_TABLES,
+        help="Ignore table-related tags (table, th, td, tr) while keeping rows."
+    )
+    p.add_option(
+        "--single-line-break",
+        action="store_true",
+        dest="single_line_break",
+        default=config.SINGLE_LINE_BREAK,
+        help=(
+            "Use a single line break after a block element rather than two "
+            "line breaks. NOTE: Requires --body-width=0"
+        )
+    )
+    p.add_option(
+        "--unicode-snob",
+        action="store_true",
+        dest="unicode_snob",
+        default=config.UNICODE_SNOB,
+        help="Use unicode throughout document"
+    )
+    p.add_option(
+        "--no-automatic-links",
+        action="store_false",
+        dest="use_automatic_links",
+        default=config.USE_AUTOMATIC_LINKS,
+        help="Do not use automatic links wherever applicable"
+    )
+    p.add_option(
+        "--no-skip-internal-links",
+        action="store_false",
+        dest="skip_internal_links",
+        default=config.SKIP_INTERNAL_LINKS,
+        help="Do not skip internal links"
+    )
+    p.add_option(
+        "--links-after-para",
+        action="store_true",
+        dest="links_each_paragraph",
+        default=config.LINKS_EACH_PARAGRAPH,
+        help="Put links after each paragraph instead of document"
+    )
+    p.add_option(
+        "--mark-code",
+        action="store_true",
+        dest="mark_code",
+        default=config.MARK_CODE,
+        help="Mark program code blocks with [code]...[/code]"
+    )
+    p.add_option(
+        "--decode-errors",
+        dest="decode_errors",
+        action="store",
+        type="string",
+        default=config.DECODE_ERRORS,
+        help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
+    )
+    (options, args) = p.parse_args()
+
+    # process input
+    # NOTE(review): encoding starts as "utf-8" and is only ever replaced by
+    # args[1], so the ``encoding is None`` detection branches below cannot
+    # run as written. Left untouched to keep the current behaviour.
+    encoding = "utf-8"
+    if len(args) == 2:
+        encoding = args[1]
+    elif len(args) > 2:
+        p.error('Too many arguments')
+
+    if len(args) > 0 and args[0] != '-':  # pragma: no cover
+        file_ = args[0]
+
+        if file_.startswith('http://') or file_.startswith('https://'):
+            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
+                    DeprecationWarning)
+            # The URL doubles as the base URL handed to HTML2Text.
+            baseurl = file_
+            j = urllib.urlopen(baseurl)
+            data = j.read()
+            if encoding is None:
+                try:
+                    from feedparser import _getCharacterEncoding as enc
+                except ImportError:
+                    enc = lambda x, y: ('utf-8', 1)
+                encoding = enc(j.headers, data)[0]
+                if encoding == 'us-ascii':
+                    encoding = 'utf-8'
+        else:
+            # Read through a context manager so the handle is closed
+            # promptly instead of being left to the garbage collector.
+            with open(file_, 'rb') as fp:
+                data = fp.read()
+            if encoding is None:
+                try:
+                    from chardet import detect
+                except ImportError:
+                    detect = lambda x: {'encoding': 'utf-8'}
+                encoding = detect(data)['encoding']
+    else:
+        data = wrap_read()
+
+    # Only decode objects that offer decode(); anything else is passed to
+    # the converter unchanged.
+    if hasattr(data, 'decode'):
+        try:
+            try:
+                data = data.decode(encoding, errors=options.decode_errors)
+            except TypeError:
+                # python 2.6.x does not have the errors option
+                data = data.decode(encoding)
+        except UnicodeDecodeError as err:
+            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
+            warning += ' Use the ' + bcolors.OKGREEN
+            # Leading space keeps "flag." from running into the option name.
+            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
+            print(warning)
+            raise err
+
+    h = HTML2Text(baseurl=baseurl)
+    # handle options
+    if options.ul_style_dash:
+        h.ul_item_mark = '-'
+    if options.em_style_asterisk:
+        # '*' is taken by emphasis, so strong text switches to '__'.
+        h.emphasis_mark = '*'
+        h.strong_mark = '__'
+
+    h.body_width = options.body_width
+    h.google_list_indent = options.list_indent
+    h.ignore_emphasis = options.ignore_emphasis
+    h.ignore_links = options.ignore_links
+    h.protect_links = options.protect_links
+    h.ignore_images = options.ignore_images
+    h.images_to_alt = options.images_to_alt
+    h.images_with_size = options.images_with_size
+    h.google_doc = options.google_doc
+    h.hide_strikethrough = options.hide_strikethrough
+    h.escape_snob = options.escape_snob
+    h.bypass_tables = options.bypass_tables
+    h.ignore_tables = options.ignore_tables
+    h.single_line_break = options.single_line_break
+    h.inline_links = options.inline_links
+    h.unicode_snob = options.unicode_snob
+    h.use_automatic_links = options.use_automatic_links
+    h.skip_internal_links = options.skip_internal_links
+    h.links_each_paragraph = options.links_each_paragraph
+    h.mark_code = options.mark_code
+    h.wrap_links = options.wrap_links
+    h.pad_tables = options.pad_tables
+    h.default_image_alt = options.default_image_alt
+
+    wrapwrite(h.handle(data))
--- a/included_dependencies/html2text/compat.py
+++ b/included_dependencies/html2text/compat.py
@ -6,8 +6,12 @@ if sys.version_info[0] == 2:
    import urlparse
    import HTMLParser
    import urllib
+    from cgi import escape as html_escape
 else:
    import urllib.parse as urlparse
    import html.entities as htmlentitydefs
    import html.parser as HTMLParser
    import urllib.request as urllib
+    from html import escape
+    def html_escape(s):
+        """Return *s* escaped with html.escape, leaving quote characters as-is."""
+        # NOTE(review): quote=False presumably mirrors the default of the
+        # cgi.escape used on the Python 2 branch -- confirm.
+        return escape(s, quote=False)
--- a/included_dependencies/html2text/config.py
+++ b/included_dependencies/html2text/config.py
@ -3,6 +3,8 @@ import re
 # Use Unicode characters instead of their ascii psuedo-replacements
 UNICODE_SNOB = 0

+# Marker to use for marking tables for padding post processing
+TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
 # Escape all special characters.  Output is less readable, but avoids
 # corner case formatting issues.
 ESCAPE_SNOB = 0
@ -36,6 +38,8 @@ IMAGES_WITH_SIZE = False
 IGNORE_EMPHASIS = False
 MARK_CODE = False
 DECODE_ERRORS = 'strict'
+DEFAULT_IMAGE_ALT = ''
+PAD_TABLES = False

 # Convert links with same href and text to <href> format if they are absolute links
 USE_AUTOMATIC_LINKS = True
@ -116,7 +120,11 @@ UNIFIABLE = {
    'rlm': ''
 }

+# Format tables in HTML rather than Markdown syntax
 BYPASS_TABLES = False
+# Ignore table-related tags (table, th, td, tr) while keeping rows
+IGNORE_TABLES = False
+

 # Use a single line break after a block element rather an two line breaks.
 # NOTE: Requires body width setting to be 0.
--- a/included_dependencies/html2text/utils.py
+++ b/included_dependencies/html2text/utils.py
@ -31,7 +31,7 @@ def dumb_property_dict(style):
    """
    :returns: A hash of css attributes
    """
-    out = dict([(x.strip(), y.strip()) for x, y in
+    out = dict([(x.strip().lower(), y.strip().lower()) for x, y in
                [z.split(':', 1) for z in
                 style.split(';') if ':' in z
                 ]
@ -149,7 +149,7 @@ def google_fixed_width_font(style):
    font_family = ''
    if 'font-family' in style:
        font_family = style['font-family']
-    if 'Courier New' == font_family or 'Consolas' == font_family:
+    if 'courier new' == font_family or 'consolas' == font_family:
        return True

    return False
@ -244,3 +244,55 @@ def escape_md_section(text, snob=False):
    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)

    return text
+
+def reformat_table(lines, right_margin):
+    """
+    Given the lines of a table,
+    pad the cells and return the new lines.
+
+    Every cell is right-stripped and then filled up to the width of the
+    widest cell in its column plus ``right_margin``. Rows made up solely of
+    '-' and '|' are filled with '-', all other rows with spaces.
+
+    :param lines: Rows of one table, each a string of '|'-separated cells.
+    :param right_margin: Number of filler characters kept after the widest
+        cell of each column.
+    :returns: A new list with one padded row per input row, in input order.
+        An empty input yields an empty list.
+    """
+    if not lines:
+        # Nothing to measure; indexing the first row would raise IndexError.
+        return []
+
+    rows = [[cell.rstrip() for cell in line.split('|')] for line in lines]
+
+    # find the maximum width of the columns
+    # The width list grows with the longest row instead of being zipped
+    # against each row, so a short (or blank) row cannot truncate it and
+    # make the longer rows lose their trailing cells.
+    max_width = []
+    for cols in rows:
+        for index, cell in enumerate(cols):
+            width = len(cell) + right_margin
+            if index == len(max_width):
+                max_width.append(width)
+            elif width > max_width[index]:
+                max_width[index] = width
+
+    # reformat
+    new_lines = []
+    for line, cols in zip(lines, rows):
+        # A separator row consists of nothing but '-' and '|'.
+        filler = '-' if set(line.strip()) == set('-|') else ' '
+        new_cols = [cell + filler * (max_width[index] - len(cell))
+                    for index, cell in enumerate(cols)]
+        new_lines.append('|'.join(new_cols))
+    return new_lines
+
+def pad_tables_in_text(text, right_margin=1):
+    """
+    Provide padding for tables in the text.
+
+    Any line containing ``config.TABLE_MARKER_FOR_PAD`` toggles "inside a
+    table" and is itself removed from the output. The lines collected
+    between an opening and a closing marker are passed to reformat_table()
+    and followed by one empty line; all other lines are copied unchanged.
+
+    :param text: The text to post-process.
+    :param right_margin: Forwarded to reformat_table().
+    :returns: The text with the marked tables padded and the markers removed.
+    """
+    new_lines = []
+    table_buffer = []
+    table_started = False
+    for line in text.split('\n'):
+        # Toggle table started
+        if config.TABLE_MARKER_FOR_PAD in line:
+            table_started = not table_started
+            if not table_started:
+                # An empty table has nothing to reformat.
+                if table_buffer:
+                    new_lines.extend(
+                        reformat_table(table_buffer, right_margin)
+                    )
+                table_buffer = []
+                new_lines.append('')
+            continue
+        # Process lines
+        if table_started:
+            table_buffer.append(line)
+        else:
+            new_lines.append(line)
+    if table_buffer:
+        # The closing marker never showed up; flush what was collected
+        # instead of silently dropping the rest of the document.
+        new_lines.extend(reformat_table(table_buffer, right_margin))
+    return '\n'.join(new_lines)