Update the html2text package bundled with the plugin and web service to fix the output of <, > and & characters in text.

2025-12-06 08:52:55 +01:00 · 2018-08-01 20:19:46 -05:00 · 2018-08-01 20:19:46 -05:00 · 8fb7f048b5
commit 8fb7f048b5
parent 9186c2fae9
5 changed files with 211 additions and 84 deletions
--- a/included_dependencies/html2text/init.py
+++ b/included_dependencies/html2text/init.py
@ -2,6 +2,7 @@
 # coding: utf-8
 """html2text: Turn HTML into equivalent Markdown-structured text."""
 from __future__ import division
+from __future__ import unicode_literals
 import re
 import sys

@ -10,7 +11,7 @@ try:
 except ImportError:  # pragma: no cover
    pass

-from html2text.compat import urlparse, HTMLParser, html_escape
+from html2text.compat import urlparse, HTMLParser
 from html2text import config

 from html2text.utils import (
@ -30,7 +31,14 @@ from html2text.utils import (
    pad_tables_in_text
 )

-__version__ = (2016, 9, 19)
+try:
+    chr = unichr
+    nochr = unicode('')
+except NameError:
+    # python3 uses chr
+    nochr = str('')
+
+__version__ = (2018, 1, 9)


 # TODO:
@ -81,6 +89,8 @@ class HTML2Text(HTMLParser.HTMLParser):
        self.pad_tables = config.PAD_TABLES  # covered in cli
        self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
        self.tag_callback = None
+        self.open_quote = config.OPEN_QUOTE  # covered in cli
+        self.close_quote = config.CLOSE_QUOTE  # covered in cli

        if out is None:  # pragma: no cover
            self.out = self.outtextf
@ -106,6 +116,7 @@ class HTML2Text(HTMLParser.HTMLParser):
        self.pre = 0
        self.startpre = 0
        self.code = False
+        self.quote = False
        self.br_toggle = ''
        self.lastWasNL = 0
        self.lastWasList = False
@ -119,6 +130,10 @@ class HTML2Text(HTMLParser.HTMLParser):
        self.abbr_data = None  # last inner HTML (for abbr being defined)
        self.abbr_list = {}  # stack of abbreviations to write later
        self.baseurl = baseurl
+        self.stressed = False
+        self.preceding_stressed = False
+        self.preceding_data = None
+        self.current_tag = None

        try:
            del unifiable_n[name2cp('nbsp')]
@ -147,22 +162,15 @@ class HTML2Text(HTMLParser.HTMLParser):
    def close(self):
        HTMLParser.HTMLParser.close(self)

-        try:
-            nochr = unicode('')
-            unicode_character = unichr
-        except NameError:
-            nochr = str('')
-            unicode_character = chr
-
        self.pbr()
        self.o('', 0, 'end')

        outtext = nochr.join(self.outtextlist)

        if self.unicode_snob:
-            nbsp = unicode_character(name2cp('nbsp'))
+            nbsp = chr(name2cp('nbsp'))
        else:
-            nbsp = unicode_character(32)
+            nbsp = chr(32)
        try:
            outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
        except NameError:
@ -175,17 +183,10 @@ class HTML2Text(HTMLParser.HTMLParser):
        return outtext

    def handle_charref(self, c):
-        charref = self.charref(c)
-        if not self.code and not self.pre:
-            charref = html_escape(charref)
-        self.handle_data(charref, True)
+        self.handle_data(self.charref(c), True)

    def handle_entityref(self, c):
-        entityref = self.entityref(c)
-        if (not self.code and not self.pre
-                and entityref != '&nbsp_place_holder;'):
-            entityref = html_escape(entityref)
-        self.handle_data(entityref, True)
+        self.handle_data(self.entityref(c), True)

    def handle_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)
@ -208,10 +209,11 @@ class HTML2Text(HTMLParser.HTMLParser):
            i += 1
            match = 0

-            if ('href' in a) and a['href'] == attrs['href']:
-                if ('title' in a) or ('title' in attrs):
-                    if (('title' in a) and ('title' in attrs) and
-                                a['title'] == attrs['title']):
+            if 'href' in a and a['href'] == attrs['href']:
+                if 'title' in a or 'title' in attrs:
+                    if 'title' in a and \
+                        'title' in attrs and \
+                            a['title'] == attrs['title']:
                        match = True
                else:
                    match = True
@ -229,8 +231,16 @@ class HTML2Text(HTMLParser.HTMLParser):
        # handle Google's text emphasis
        strikethrough = 'line-through' in \
                        tag_emphasis and self.hide_strikethrough
-        bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
-        italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
+
+        # google and others may mark a font's weight as `bold` or `700`
+        bold = False
+        for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
+            bold = (bold_marker in tag_emphasis
+                    and bold_marker not in parent_emphasis)
+            if bold:
+                break
+
+        italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis
        fixed = google_fixed_width_font(tag_style) and not \
            google_fixed_width_font(parent_style) and not self.pre

@ -282,6 +292,7 @@ class HTML2Text(HTMLParser.HTMLParser):
                self.quiet -= 1

    def handle_tag(self, tag, attrs, start):
+        self.current_tag = tag
        # attrs is None for endtags
        if attrs is None:
            attrs = {}
@ -292,10 +303,11 @@ class HTML2Text(HTMLParser.HTMLParser):
            if self.tag_callback(self, tag, attrs, start) is True:
                return

-        # first thing inside the anchor tag is another tag that produces some output
-        if (start and not self.maybe_automatic_link is None
-                and tag not in ['p', 'div', 'style', 'dl', 'dt']
-                and (tag != "img" or self.ignore_images)):
+        # first thing inside the anchor tag is another tag
+        # that produces some output
+        if (start and self.maybe_automatic_link is not None and
+                tag not in ['p', 'div', 'style', 'dl', 'dt'] and
+                (tag != "img" or self.ignore_images)):
            self.o("[")
            self.maybe_automatic_link = None
            self.empty_link = False
@ -312,7 +324,8 @@ class HTML2Text(HTMLParser.HTMLParser):
                tag_style = element_style(attrs, self.style_def, parent_style)
                self.tag_stack.append((tag, attrs, tag_style))
            else:
-                dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {})
+                dummy, attrs, tag_style = self.tag_stack.pop() \
+                    if self.tag_stack else (None, {}, {})
                if self.tag_stack:
                    parent_style = self.tag_stack[-1][2]

@ -331,6 +344,8 @@ class HTML2Text(HTMLParser.HTMLParser):
                    self.p()
                else:
                    self.soft_br()
+            elif self.astack and tag == 'div':
+                pass
            else:
                self.p()

@ -370,24 +385,49 @@ class HTML2Text(HTMLParser.HTMLParser):
                self.blockquote -= 1
                self.p()

+        def no_preceding_space(self):
+            return (self.preceding_data
+                    and re.match(r'[^\s]', self.preceding_data[-1]))
+
        if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
-            self.o(self.emphasis_mark)
-        if tag in ['strong', 'b'] and not self.ignore_emphasis:
-            self.o(self.strong_mark)
-        if tag in ['del', 'strike', 's']:
-            if start:
-                self.o('~~')
+            if start and no_preceding_space(self):
+                emphasis = ' ' + self.emphasis_mark
            else:
-                self.o('~~')
+                emphasis = self.emphasis_mark
+
+            self.o(emphasis)
+            if start:
+                self.stressed = True
+
+        if tag in ['strong', 'b'] and not self.ignore_emphasis:
+            if start and no_preceding_space(self):
+                strong = ' ' + self.strong_mark
+            else:
+                strong = self.strong_mark
+
+            self.o(strong)
+            if start:
+                self.stressed = True
+
+        if tag in ['del', 'strike', 's']:
+            if start and no_preceding_space(self):
+                strike = ' ~~'
+            else:
+                strike = '~~'
+
+            self.o(strike)
+            if start:
+                self.stressed = True

        if self.google_doc:
            if not self.inheader:
                # handle some font attributes, but leave headers clean
                self.handle_emphasis(start, tag_style, parent_style)

-        if tag in ["code", "tt"] and not self.pre:
+        if tag in ["kbd", "code", "tt"] and not self.pre:
            self.o('`')  # TODO: `` `this` ``
            self.code = not self.code
+
        if tag == "abbr":
            if start:
                self.abbr_title = None
@ -400,17 +440,30 @@ class HTML2Text(HTMLParser.HTMLParser):
                    self.abbr_title = None
                self.abbr_data = ''

+        if tag == "q":
+            if not self.quote:
+                self.o(self.open_quote)
+            else:
+                self.o(self.close_quote)
+            self.quote = not self.quote
+
+        def link_url(self, link, title=""):
+            url = urlparse.urljoin(self.baseurl, link)
+            title = ' "{0}"'.format(title) if title.strip() else ''
+            self.o(']({url}{title})'.format(url=escape_md(url),
+                                            title=title))
+
        if tag == "a" and not self.ignore_links:
            if start:
-                if ('href' in attrs) and \
-                        (attrs['href'] is not None) and \
-                        not (self.skip_internal_links and
-                                 attrs['href'].startswith('#')):
+                if 'href' in attrs and \
+                    attrs['href'] is not None and not \
+                        (self.skip_internal_links and
+                            attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.maybe_automatic_link = attrs['href']
                    self.empty_link = True
                    if self.protect_links:
-                        attrs['href'] = '<'+attrs['href']+'>'
+                        attrs['href'] = '<' + attrs['href'] + '>'
                else:
                    self.astack.append(None)
            else:
@ -425,12 +478,12 @@ class HTML2Text(HTMLParser.HTMLParser):
                            self.maybe_automatic_link = None
                        if self.inline_links:
                            try:
-                                title = escape_md(a['title'])
+                                title = a['title'] if a['title'] else ''
+                                title = escape_md(title)
                            except KeyError:
-                                self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")")
+                                link_url(self, a['href'], '')
                            else:
-                                self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href']))
-                                       + ' "' + title + '" )')
+                                link_url(self, a['href'], title)
                        else:
                            i = self.previousIndex(a)
                            if i is not None:
@ -463,7 +516,7 @@ class HTML2Text(HTMLParser.HTMLParser):
                    return

                # If we have a link to create, output the start
-                if not self.maybe_automatic_link is None:
+                if self.maybe_automatic_link is not None:
                    href = self.maybe_automatic_link
                    if self.images_to_alt and escape_md(alt) == href and \
                            self.absolute_url_matcher.match(href):
@ -483,7 +536,16 @@ class HTML2Text(HTMLParser.HTMLParser):
                    self.o("![" + escape_md(alt) + "]")
                    if self.inline_links:
                        href = attrs.get('href') or ''
-                        self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")")
+                        self.o(
+                            "(" +
+                            escape_md(
+                                urlparse.urljoin(
+                                    self.baseurl,
+                                    href
+                                )
+                            ) +
+                            ")"
+                        )
                    else:
                        i = self.previousIndex(attrs)
                        if i is not None:
@ -576,11 +638,11 @@ class HTML2Text(HTMLParser.HTMLParser):
                    if start:
                        self.table_start = True
                        if self.pad_tables:
-                            self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
+                            self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                    else:
                        if self.pad_tables:
-                            self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
+                            self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                            self.o("  \n")
                if tag in ["td", "th"] and start:
                    if self.split_next_td:
@ -654,8 +716,9 @@ class HTML2Text(HTMLParser.HTMLParser):
                return

            if self.startpre:
-                #self.out(" :") #TODO: not output when already one there
-                if not data.startswith("\n"):  # <pre>stuff...
+                # self.out(" :") #TODO: not output when already one there
+                if not data.startswith("\n") and not data.startswith("\r\n"):
+                    # <pre>stuff...
                    data = "\n" + data
                if self.mark_code:
                    self.out("\n[code]")
@ -668,7 +731,7 @@ class HTML2Text(HTMLParser.HTMLParser):
            if self.pre:
                if not self.list:
                    bq += "    "
-                #else: list content is already partially indented
+                # else: list content is already partially indented
                for i in range(len(self.list)):
                    bq += "    "
                data = data.replace("\n", "\n" + bq)
@ -700,8 +763,8 @@ class HTML2Text(HTMLParser.HTMLParser):
                    self.out(' ')
                self.space = 0

-            if self.a and ((self.p_p == 2 and self.links_each_paragraph)
-                           or force == "end"):
+            if self.a and ((self.p_p == 2 and self.links_each_paragraph) or
+                           force == "end"):
                if force == "end":
                    self.out("\n")

@ -731,13 +794,25 @@ class HTML2Text(HTMLParser.HTMLParser):
            self.outcount += 1

    def handle_data(self, data, entity_char=False):
+        if self.stressed:
+            data = data.strip()
+            self.stressed = False
+            self.preceding_stressed = True
+        elif (self.preceding_stressed
+              and re.match(r'[^\s.!?]', data[0])
+              and not hn(self.current_tag)
+              and self.current_tag not in ['a', 'code', 'pre']):
+            # should match a letter or common punctuation
+            data = ' ' + data
+            self.preceding_stressed = False
+
        if self.style:
            self.style_def.update(dumb_css_parser(data))

-        if not self.maybe_automatic_link is None:
+        if self.maybe_automatic_link is not None:
            href = self.maybe_automatic_link
-            if (href == data and self.absolute_url_matcher.match(href)
-                    and self.use_automatic_links):
+            if (href == data and self.absolute_url_matcher.match(href) and
+                    self.use_automatic_links):
                self.o("<" + data + ">")
                self.empty_link = False
                return
@ -748,6 +823,7 @@ class HTML2Text(HTMLParser.HTMLParser):

        if not self.code and not self.pre and not entity_char:
            data = escape_md_section(data, snob=self.escape_snob)
+        self.preceding_data = data
        self.o(data, 1)

    def unknown_decl(self, data):  # pragma: no cover
@ -764,10 +840,7 @@ class HTML2Text(HTMLParser.HTMLParser):
            return unifiable_n[c]
        else:
            try:
-                try:
-                    return unichr(c)
-                except NameError:  # Python3
-                    return chr(c)
+                return chr(c)
            except ValueError:  # invalid unicode
                return ''

@ -783,10 +856,7 @@ class HTML2Text(HTMLParser.HTMLParser):
                if c == 'nbsp':
                    return config.UNIFIABLE[c]
                else:
-                    try:
-                        return unichr(name2cp(c))
-                    except NameError:  # Python3
-                        return chr(name2cp(c))
+                    return chr(name2cp(c))

    def replaceEntities(self, s):
        s = s.group(1)
@ -809,7 +879,7 @@ class HTML2Text(HTMLParser.HTMLParser):
        nest_count = 0
        if 'margin-left' in style:
            nest_count = int(style['margin-left'][:-2]) \
-                         // self.google_list_indent
+                // self.google_list_indent

        return nest_count

--- a/included_dependencies/html2text/cli.py
+++ b/included_dependencies/html2text/cli.py
@ -158,7 +158,8 @@ def main():
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
-        help="Ignore table-related tags (table, th, td, tr) while keeping rows."
+        help="Ignore table-related tags (table, th, td, tr) "
+             "while keeping rows."
    )
    p.add_option(
        "--single-line-break",
@ -211,7 +212,24 @@ def main():
        action="store",
        type="string",
        default=config.DECODE_ERRORS,
-        help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
+        help="What to do in case of decode errors.'ignore', 'strict' and "
+             "'replace' are acceptable values"
+    )
+    p.add_option(
+        "--open-quote",
+        dest="open_quote",
+        action="store",
+        type="str",
+        default=config.OPEN_QUOTE,
+        help="The character used to open quotes",
+    )
+    p.add_option(
+        "--close-quote",
+        dest="close_quote",
+        action="store",
+        type="str",
+        default=config.CLOSE_QUOTE,
+        help="The character used to close quotes",
    )
    (options, args) = p.parse_args()

@ -226,8 +244,11 @@ def main():
        file_ = args[0]

        if file_.startswith('http://') or file_.startswith('https://'):
-            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
-                    DeprecationWarning)
+            warnings.warn(
+                "Support for retrieving html over network is set for "
+                "deprecation by version (2017, 1, x)",
+                DeprecationWarning
+            )
            baseurl = file_
            j = urllib.urlopen(baseurl)
            data = j.read()
@ -235,7 +256,8 @@ def main():
                try:
                    from feedparser import _getCharacterEncoding as enc
                except ImportError:
-                    enc = lambda x, y: ('utf-8', 1)
+                    def enc(x, y):
+                        return ('utf-8', 1)
                encoding = enc(j.headers, data)[0]
                if encoding == 'us-ascii':
                    encoding = 'utf-8'
@ -245,7 +267,8 @@ def main():
                try:
                    from chardet import detect
                except ImportError:
-                    detect = lambda x: {'encoding': 'utf-8'}
+                    def detect(x):
+                        return {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()
@ -295,5 +318,7 @@ def main():
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables
    h.default_image_alt = options.default_image_alt
+    h.open_quote = options.open_quote
+    h.close_quote = options.close_quote

    wrapwrite(h.handle(data))
--- a/included_dependencies/html2text/compat.py
+++ b/included_dependencies/html2text/compat.py
@ -13,5 +13,9 @@ else:
    import html.parser as HTMLParser
    import urllib.request as urllib
    from html import escape
+
    def html_escape(s):
        return escape(s, quote=False)
+
+
+__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse']
--- a/included_dependencies/html2text/config.py
+++ b/included_dependencies/html2text/config.py
@ -1,6 +1,8 @@
+from __future__ import unicode_literals
+
 import re

-# Use Unicode characters instead of their ascii psuedo-replacements
+# Use Unicode characters instead of their ascii pseudo-replacements
 UNICODE_SNOB = 0

 # Marker to use for marking tables for padding post processing
@ -31,6 +33,9 @@ WRAP_LINKS = True
 # Number of pixels Google indents nested lists
 GOOGLE_LIST_INDENT = 36

+# Values Google and others may use to indicate bold text
+BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900')
+
 IGNORE_ANCHORS = False
 IGNORE_IMAGES = False
 IMAGES_TO_ALT = False
@ -41,7 +46,8 @@ DECODE_ERRORS = 'strict'
 DEFAULT_IMAGE_ALT = ''
 PAD_TABLES = False

-# Convert links with same href and text to <href> format if they are absolute links
+# Convert links with same href and text to <href> format
+# if they are absolute links
 USE_AUTOMATIC_LINKS = True

 # For checking space-only lines on line 771
@ -52,7 +58,10 @@ RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s')
 RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
 RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
 RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
-RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")  # to find links in the text
+
+# to find links in the text
+RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
+
 RE_MD_DOT_MATCHER = re.compile(r"""
    ^             # start of line
    (\s*\d+)      # optional whitespace and a number
@ -126,6 +135,11 @@ BYPASS_TABLES = False
 IGNORE_TABLES = False


-# Use a single line break after a block element rather an two line breaks.
+# Use a single line break after a block element rather than two line breaks.
 # NOTE: Requires body width setting to be 0.
 SINGLE_LINE_BREAK = False
+
+
+# Use double quotation marks when converting the <q> tag.
+OPEN_QUOTE = '"'
+CLOSE_QUOTE = '"'
--- a/included_dependencies/html2text/utils.py
+++ b/included_dependencies/html2text/utils.py
@ -12,7 +12,6 @@ def name2cp(k):


 unifiable_n = {}
-
 for k in config.UNIFIABLE.keys():
    unifiable_n[name2cp(k)] = config.UNIFIABLE[k]

@ -191,7 +190,7 @@ def skipwrap(para, wrap_links):
    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
-    if stripped[0:1] == '-' or stripped[0:1] == '*':
+    if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**':
        return True

    # If the text begins with a single -, *, or +, followed by a space,
@ -245,6 +244,7 @@ def escape_md_section(text, snob=False):

    return text

+
 def reformat_table(lines, right_margin):
    """
    Given the lines of a table
@ -252,11 +252,24 @@ def reformat_table(lines, right_margin):
    """
    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
+    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split('|')]
+        num_cols = len(cols)
+
+        # don't drop any data if colspan attributes result in unequal lengths
+        if num_cols < max_cols:
+            cols += [''] * (max_cols - num_cols)
+        elif max_cols < num_cols:
+            max_width += [
+                len(x) + right_margin for x in
+                cols[-(num_cols - max_cols):]
+            ]
+            max_cols = num_cols
+
        max_width = [max(len(x) + right_margin, old_len)
                     for x, old_len in zip(cols, max_width)]
-    
+
    # reformat
    new_lines = []
    for line in lines:
@ -272,15 +285,16 @@ def reformat_table(lines, right_margin):
        new_lines.append('|'.join(new_cols))
    return new_lines

+
 def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text
    """
    lines = text.split('\n')
-    table_buffer, altered_lines, table_widths, table_started = [], [], [], False
+    table_buffer, table_started = [], False
    new_lines = []
    for line in lines:
-        # Toogle table started
+        # Toggle table started
        if (config.TABLE_MARKER_FOR_PAD in line):
            table_started = not table_started
            if not table_started: