Update the html2text package bundled with the plugin & web service to fix output of literal <, >, and & characters.

Jim Miller 2018-08-01 20:19:46 -05:00
parent 9186c2fae9
commit 8fb7f048b5
5 changed files with 211 additions and 84 deletions
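
A minimal before/after check of the entity handling this commit fixes, assuming the bundled module is importable as html2text (the sample input and expected output are illustrative):

import html2text

h = html2text.HTML2Text()
h.body_width = 0  # disable wrapping so the output is easy to compare
# Character and entity references such as &lt;, &gt; and &amp; are now emitted
# as the literal characters <, > and & instead of being re-escaped in the output.
print(h.handle("<p>if a &lt; b &amp;&amp; b &gt; c</p>"))
# expected (roughly): if a < b && b > c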

View file

@ -2,6 +2,7 @@
# coding: utf-8
"""html2text: Turn HTML into equivalent Markdown-structured text."""
from __future__ import division
from __future__ import unicode_literals
import re
import sys
@ -10,7 +11,7 @@ try:
except ImportError: # pragma: no cover
pass
from html2text.compat import urlparse, HTMLParser, html_escape
from html2text.compat import urlparse, HTMLParser
from html2text import config
from html2text.utils import (
@ -30,7 +31,14 @@ from html2text.utils import (
pad_tables_in_text
)
__version__ = (2016, 9, 19)
try:
chr = unichr
nochr = unicode('')
except NameError:
# python3 uses chr
nochr = str('')
__version__ = (2018, 1, 9)
# TODO:
@ -81,6 +89,8 @@ class HTML2Text(HTMLParser.HTMLParser):
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.tag_callback = None
self.open_quote = config.OPEN_QUOTE # covered in cli
self.close_quote = config.CLOSE_QUOTE # covered in cli
if out is None: # pragma: no cover
self.out = self.outtextf
@ -106,6 +116,7 @@ class HTML2Text(HTMLParser.HTMLParser):
self.pre = 0
self.startpre = 0
self.code = False
self.quote = False
self.br_toggle = ''
self.lastWasNL = 0
self.lastWasList = False
@ -119,6 +130,10 @@ class HTML2Text(HTMLParser.HTMLParser):
self.abbr_data = None # last inner HTML (for abbr being defined)
self.abbr_list = {} # stack of abbreviations to write later
self.baseurl = baseurl
self.stressed = False
self.preceding_stressed = False
self.preceding_data = None
self.current_tag = None
try:
del unifiable_n[name2cp('nbsp')]
@ -147,22 +162,15 @@ class HTML2Text(HTMLParser.HTMLParser):
def close(self):
HTMLParser.HTMLParser.close(self)
try:
nochr = unicode('')
unicode_character = unichr
except NameError:
nochr = str('')
unicode_character = chr
self.pbr()
self.o('', 0, 'end')
outtext = nochr.join(self.outtextlist)
if self.unicode_snob:
nbsp = unicode_character(name2cp('nbsp'))
nbsp = chr(name2cp('nbsp'))
else:
nbsp = unicode_character(32)
nbsp = chr(32)
try:
outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
except NameError:
@ -175,17 +183,10 @@ class HTML2Text(HTMLParser.HTMLParser):
return outtext
def handle_charref(self, c):
charref = self.charref(c)
if not self.code and not self.pre:
charref = html_escape(charref)
self.handle_data(charref, True)
self.handle_data(self.charref(c), True)
def handle_entityref(self, c):
entityref = self.entityref(c)
if (not self.code and not self.pre
and entityref != '&nbsp_place_holder;'):
entityref = html_escape(entityref)
self.handle_data(entityref, True)
self.handle_data(self.entityref(c), True)
def handle_starttag(self, tag, attrs):
self.handle_tag(tag, attrs, 1)
@ -208,10 +209,11 @@ class HTML2Text(HTMLParser.HTMLParser):
i += 1
match = 0
if ('href' in a) and a['href'] == attrs['href']:
if ('title' in a) or ('title' in attrs):
if (('title' in a) and ('title' in attrs) and
a['title'] == attrs['title']):
if 'href' in a and a['href'] == attrs['href']:
if 'title' in a or 'title' in attrs:
if 'title' in a and \
'title' in attrs and \
a['title'] == attrs['title']:
match = True
else:
match = True
@ -229,8 +231,16 @@ class HTML2Text(HTMLParser.HTMLParser):
# handle Google's text emphasis
strikethrough = 'line-through' in \
tag_emphasis and self.hide_strikethrough
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
# google and others may mark a font's weight as `bold` or `700`
bold = False
for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
bold = (bold_marker in tag_emphasis
and bold_marker not in parent_emphasis)
if bold:
break
italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis
fixed = google_fixed_width_font(tag_style) and not \
google_fixed_width_font(parent_style) and not self.pre
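
A short sketch of what the BOLD_TEXT_STYLE_VALUES loop above means in practice for Google Docs exports; the input markup and expected output are illustrative assumptions, not part of the commit:

import html2text

h = html2text.HTML2Text()
h.google_doc = True   # enable Google Docs style handling
h.body_width = 0
# Numeric font weights such as 700/800/900 are now treated as bold,
# in addition to the literal value "bold".
print(h.handle('<span style="font-weight:700">important</span>'))
# expected (roughly): **important**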
@ -282,6 +292,7 @@ class HTML2Text(HTMLParser.HTMLParser):
self.quiet -= 1
def handle_tag(self, tag, attrs, start):
self.current_tag = tag
# attrs is None for endtags
if attrs is None:
attrs = {}
@ -292,10 +303,11 @@ class HTML2Text(HTMLParser.HTMLParser):
if self.tag_callback(self, tag, attrs, start) is True:
return
# first thing inside the anchor tag is another tag that produces some output
if (start and not self.maybe_automatic_link is None
and tag not in ['p', 'div', 'style', 'dl', 'dt']
and (tag != "img" or self.ignore_images)):
# first thing inside the anchor tag is another tag
# that produces some output
if (start and self.maybe_automatic_link is not None and
tag not in ['p', 'div', 'style', 'dl', 'dt'] and
(tag != "img" or self.ignore_images)):
self.o("[")
self.maybe_automatic_link = None
self.empty_link = False
@ -312,7 +324,8 @@ class HTML2Text(HTMLParser.HTMLParser):
tag_style = element_style(attrs, self.style_def, parent_style)
self.tag_stack.append((tag, attrs, tag_style))
else:
dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {})
dummy, attrs, tag_style = self.tag_stack.pop() \
if self.tag_stack else (None, {}, {})
if self.tag_stack:
parent_style = self.tag_stack[-1][2]
@ -331,6 +344,8 @@ class HTML2Text(HTMLParser.HTMLParser):
self.p()
else:
self.soft_br()
elif self.astack and tag == 'div':
pass
else:
self.p()
@ -370,24 +385,49 @@ class HTML2Text(HTMLParser.HTMLParser):
self.blockquote -= 1
self.p()
def no_preceding_space(self):
return (self.preceding_data
and re.match(r'[^\s]', self.preceding_data[-1]))
if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
self.o(self.emphasis_mark)
if tag in ['strong', 'b'] and not self.ignore_emphasis:
self.o(self.strong_mark)
if tag in ['del', 'strike', 's']:
if start:
self.o('~~')
if start and no_preceding_space(self):
emphasis = ' ' + self.emphasis_mark
else:
self.o('~~')
emphasis = self.emphasis_mark
self.o(emphasis)
if start:
self.stressed = True
if tag in ['strong', 'b'] and not self.ignore_emphasis:
if start and no_preceding_space(self):
strong = ' ' + self.strong_mark
else:
strong = self.strong_mark
self.o(strong)
if start:
self.stressed = True
if tag in ['del', 'strike', 's']:
if start and no_preceding_space(self):
strike = ' ~~'
else:
strike = '~~'
self.o(strike)
if start:
self.stressed = True
if self.google_doc:
if not self.inheader:
# handle some font attributes, but leave headers clean
self.handle_emphasis(start, tag_style, parent_style)
if tag in ["code", "tt"] and not self.pre:
if tag in ["kbd", "code", "tt"] and not self.pre:
self.o('`') # TODO: `` `this` ``
self.code = not self.code
if tag == "abbr":
if start:
self.abbr_title = None
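
A small illustration of the no_preceding_space/stressed handling introduced above: when emphasized text butts directly against preceding characters, a space is inserted so the Markdown still renders (expected output is approximate):

import html2text

h = html2text.HTML2Text()
h.body_width = 0
# Without surrounding whitespace, "A_B_C" would not render as emphasis;
# the converter now emits "A _B_ C" instead.
print(h.handle("<p>A<em>B</em>C</p>"))
# expected (roughly): A _B_ C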
@ -400,17 +440,30 @@ class HTML2Text(HTMLParser.HTMLParser):
self.abbr_title = None
self.abbr_data = ''
if tag == "q":
if not self.quote:
self.o(self.open_quote)
else:
self.o(self.close_quote)
self.quote = not self.quote
def link_url(self, link, title=""):
url = urlparse.urljoin(self.baseurl, link)
title = ' "{0}"'.format(title) if title.strip() else ''
self.o(']({url}{title})'.format(url=escape_md(url),
title=title))
if tag == "a" and not self.ignore_links:
if start:
if ('href' in attrs) and \
(attrs['href'] is not None) and \
not (self.skip_internal_links and
attrs['href'].startswith('#')):
if 'href' in attrs and \
attrs['href'] is not None and not \
(self.skip_internal_links and
attrs['href'].startswith('#')):
self.astack.append(attrs)
self.maybe_automatic_link = attrs['href']
self.empty_link = True
if self.protect_links:
attrs['href'] = '<'+attrs['href']+'>'
attrs['href'] = '<' + attrs['href'] + '>'
else:
self.astack.append(None)
else:
@ -425,12 +478,12 @@ class HTML2Text(HTMLParser.HTMLParser):
self.maybe_automatic_link = None
if self.inline_links:
try:
title = escape_md(a['title'])
title = a['title'] if a['title'] else ''
title = escape_md(title)
except KeyError:
self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")")
link_url(self, a['href'], '')
else:
self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href']))
+ ' "' + title + '" )')
link_url(self, a['href'], title)
else:
i = self.previousIndex(a)
if i is not None:
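
The new link_url helper above centralizes inline-link formatting; a hedged round trip with a titled link (the URL and title are made up for illustration):

import html2text

h = html2text.HTML2Text()
h.body_width = 0
print(h.handle('<a href="https://example.com/" title="An example">see this</a>'))
# expected (roughly): [see this](https://example.com/ "An example")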
@ -463,7 +516,7 @@ class HTML2Text(HTMLParser.HTMLParser):
return
# If we have a link to create, output the start
if not self.maybe_automatic_link is None:
if self.maybe_automatic_link is not None:
href = self.maybe_automatic_link
if self.images_to_alt and escape_md(alt) == href and \
self.absolute_url_matcher.match(href):
@ -483,7 +536,16 @@ class HTML2Text(HTMLParser.HTMLParser):
self.o("![" + escape_md(alt) + "]")
if self.inline_links:
href = attrs.get('href') or ''
self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")")
self.o(
"(" +
escape_md(
urlparse.urljoin(
self.baseurl,
href
)
) +
")"
)
else:
i = self.previousIndex(attrs)
if i is not None:
@ -576,11 +638,11 @@ class HTML2Text(HTMLParser.HTMLParser):
if start:
self.table_start = True
if self.pad_tables:
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
self.o(" \n")
else:
if self.pad_tables:
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
self.o(" \n")
if tag in ["td", "th"] and start:
if self.split_next_td:
@ -654,8 +716,9 @@ class HTML2Text(HTMLParser.HTMLParser):
return
if self.startpre:
#self.out(" :") #TODO: not output when already one there
if not data.startswith("\n"): # <pre>stuff...
# self.out(" :") #TODO: not output when already one there
if not data.startswith("\n") and not data.startswith("\r\n"):
# <pre>stuff...
data = "\n" + data
if self.mark_code:
self.out("\n[code]")
@ -668,7 +731,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if self.pre:
if not self.list:
bq += " "
#else: list content is already partially indented
# else: list content is already partially indented
for i in range(len(self.list)):
bq += " "
data = data.replace("\n", "\n" + bq)
@ -700,8 +763,8 @@ class HTML2Text(HTMLParser.HTMLParser):
self.out(' ')
self.space = 0
if self.a and ((self.p_p == 2 and self.links_each_paragraph)
or force == "end"):
if self.a and ((self.p_p == 2 and self.links_each_paragraph) or
force == "end"):
if force == "end":
self.out("\n")
@ -731,13 +794,25 @@ class HTML2Text(HTMLParser.HTMLParser):
self.outcount += 1
def handle_data(self, data, entity_char=False):
if self.stressed:
data = data.strip()
self.stressed = False
self.preceding_stressed = True
elif (self.preceding_stressed
and re.match(r'[^\s.!?]', data[0])
and not hn(self.current_tag)
and self.current_tag not in ['a', 'code', 'pre']):
# should match a letter or common punctuation
data = ' ' + data
self.preceding_stressed = False
if self.style:
self.style_def.update(dumb_css_parser(data))
if not self.maybe_automatic_link is None:
if self.maybe_automatic_link is not None:
href = self.maybe_automatic_link
if (href == data and self.absolute_url_matcher.match(href)
and self.use_automatic_links):
if (href == data and self.absolute_url_matcher.match(href) and
self.use_automatic_links):
self.o("<" + data + ">")
self.empty_link = False
return
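
For reference, the automatic-link branch above still collapses anchors whose text equals their absolute href into the compact <url> form; a quick illustrative call:

import html2text

h = html2text.HTML2Text()
print(h.handle('<a href="https://example.com/">https://example.com/</a>'))
# expected (roughly): <https://example.com/>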
@ -748,6 +823,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if not self.code and not self.pre and not entity_char:
data = escape_md_section(data, snob=self.escape_snob)
self.preceding_data = data
self.o(data, 1)
def unknown_decl(self, data): # pragma: no cover
@ -764,10 +840,7 @@ class HTML2Text(HTMLParser.HTMLParser):
return unifiable_n[c]
else:
try:
try:
return unichr(c)
except NameError: # Python3
return chr(c)
return chr(c)
except ValueError: # invalid unicode
return ''
@ -783,10 +856,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if c == 'nbsp':
return config.UNIFIABLE[c]
else:
try:
return unichr(name2cp(c))
except NameError: # Python3
return chr(name2cp(c))
return chr(name2cp(c))
def replaceEntities(self, s):
s = s.group(1)
@ -809,7 +879,7 @@ class HTML2Text(HTMLParser.HTMLParser):
nest_count = 0
if 'margin-left' in style:
nest_count = int(style['margin-left'][:-2]) \
// self.google_list_indent
// self.google_list_indent
return nest_count

View file

@ -158,7 +158,8 @@ def main():
action="store_true",
dest="ignore_tables",
default=config.IGNORE_TABLES,
help="Ignore table-related tags (table, th, td, tr) while keeping rows."
help="Ignore table-related tags (table, th, td, tr) "
"while keeping rows."
)
p.add_option(
"--single-line-break",
@ -211,7 +212,24 @@ def main():
action="store",
type="string",
default=config.DECODE_ERRORS,
help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
help="What to do in case of decode errors.'ignore', 'strict' and "
"'replace' are acceptable values"
)
p.add_option(
"--open-quote",
dest="open_quote",
action="store",
type="str",
default=config.OPEN_QUOTE,
help="The character used to open quotes",
)
p.add_option(
"--close-quote",
dest="close_quote",
action="store",
type="str",
default=config.CLOSE_QUOTE,
help="The character used to close quotes",
)
(options, args) = p.parse_args()
@ -226,8 +244,11 @@ def main():
file_ = args[0]
if file_.startswith('http://') or file_.startswith('https://'):
warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
DeprecationWarning)
warnings.warn(
"Support for retrieving html over network is set for "
"deprecation by version (2017, 1, x)",
DeprecationWarning
)
baseurl = file_
j = urllib.urlopen(baseurl)
data = j.read()
@ -235,7 +256,8 @@ def main():
try:
from feedparser import _getCharacterEncoding as enc
except ImportError:
enc = lambda x, y: ('utf-8', 1)
def enc(x, y):
return ('utf-8', 1)
encoding = enc(j.headers, data)[0]
if encoding == 'us-ascii':
encoding = 'utf-8'
@ -245,7 +267,8 @@ def main():
try:
from chardet import detect
except ImportError:
detect = lambda x: {'encoding': 'utf-8'}
def detect(x):
return {'encoding': 'utf-8'}
encoding = detect(data)['encoding']
else:
data = wrap_read()
@ -295,5 +318,7 @@ def main():
h.wrap_links = options.wrap_links
h.pad_tables = options.pad_tables
h.default_image_alt = options.default_image_alt
h.open_quote = options.open_quote
h.close_quote = options.close_quote
wrapwrite(h.handle(data))

View file

@ -13,5 +13,9 @@ else:
import html.parser as HTMLParser
import urllib.request as urllib
from html import escape
def html_escape(s):
return escape(s, quote=False)
__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse']
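
Under Python 3 this compat shim simply wraps the standard library's html.escape with quote=False; a tiny illustrative call (note the main converter no longer escapes its entity output after this commit):

from html2text.compat import html_escape

html_escape('<b>&</b>')
# -> '&lt;b&gt;&amp;&lt;/b&gt;'  (quotes are left untouched because quote=False)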

View file

@ -1,6 +1,8 @@
from __future__ import unicode_literals
import re
# Use Unicode characters instead of their ascii psuedo-replacements
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0
# Marker to use for marking tables for padding post processing
@ -31,6 +33,9 @@ WRAP_LINKS = True
# Number of pixels Google indents nested lists
GOOGLE_LIST_INDENT = 36
# Values Google and others may use to indicate bold text
BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900')
IGNORE_ANCHORS = False
IGNORE_IMAGES = False
IMAGES_TO_ALT = False
@ -41,7 +46,8 @@ DECODE_ERRORS = 'strict'
DEFAULT_IMAGE_ALT = ''
PAD_TABLES = False
# Convert links with same href and text to <href> format if they are absolute links
# Convert links with same href and text to <href> format
# if they are absolute links
USE_AUTOMATIC_LINKS = True
# For checking space-only lines on line 771
@ -52,7 +58,10 @@ RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s')
RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text
# to find links in the text
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
RE_MD_DOT_MATCHER = re.compile(r"""
^ # start of line
(\s*\d+) # optional whitespace and a number
@ -126,6 +135,11 @@ BYPASS_TABLES = False
IGNORE_TABLES = False
# Use a single line break after a block element rather an two line breaks.
# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.
SINGLE_LINE_BREAK = False
# Use double quotation marks when converting the <q> tag.
OPEN_QUOTE = '"'
CLOSE_QUOTE = '"'
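
Putting the new quote support together: the CLI gains --open-quote/--close-quote, and the same values can be set on the converter directly. A small sketch (the guillemets are an illustrative choice; the defaults remain plain double quotes):

import html2text

h = html2text.HTML2Text()
h.open_quote = '«'    # library equivalent of --open-quote
h.close_quote = '»'   # library equivalent of --close-quote
print(h.handle('<p>She said <q>hello</q> twice.</p>'))
# expected (roughly): She said «hello» twice.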

View file

@ -12,7 +12,6 @@ def name2cp(k):
unifiable_n = {}
for k in config.UNIFIABLE.keys():
unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
@ -191,7 +190,7 @@ def skipwrap(para, wrap_links):
# I'm not sure what this is for; I thought it was to detect lists,
# but there's a <br>-inside-<span> case in one of the tests that
# also depends upon it.
if stripped[0:1] == '-' or stripped[0:1] == '*':
if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**':
return True
# If the text begins with a single -, *, or +, followed by a space,
@ -245,6 +244,7 @@ def escape_md_section(text, snob=False):
return text
def reformat_table(lines, right_margin):
"""
Given the lines of a table
@ -252,8 +252,21 @@ def reformat_table(lines, right_margin):
"""
# find the maximum width of the columns
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
max_cols = len(max_width)
for line in lines:
cols = [x.rstrip() for x in line.split('|')]
num_cols = len(cols)
# don't drop any data if colspan attributes result in unequal lengths
if num_cols < max_cols:
cols += [''] * (max_cols - num_cols)
elif max_cols < num_cols:
max_width += [
len(x) + right_margin for x in
cols[-(num_cols - max_cols):]
]
max_cols = num_cols
max_width = [max(len(x) + right_margin, old_len)
for x, old_len in zip(cols, max_width)]
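
A hedged sketch of the reformat_table behaviour above when rows have unequal lengths (for example because of colspan); the exact spacing is illustrative, the point being that short rows are padded and wide rows extend the width list instead of losing cells:

import html2text

h = html2text.HTML2Text()
h.pad_tables = True
h.body_width = 0
sample = (
    '<table>'
    '<tr><th colspan="2">totals</th></tr>'
    '<tr><td>books</td><td>3</td></tr>'
    '</table>'
)
print(h.handle(sample))
# rows wider than the first (here caused by the colspan header) are no longer
# truncated during padding; the column width list is extended instead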
@ -272,15 +285,16 @@ def reformat_table(lines, right_margin):
new_lines.append('|'.join(new_cols))
return new_lines
def pad_tables_in_text(text, right_margin=1):
"""
Provide padding for tables in the text
"""
lines = text.split('\n')
table_buffer, altered_lines, table_widths, table_started = [], [], [], False
table_buffer, table_started = [], False
new_lines = []
for line in lines:
# Toogle table started
# Toggle table started
if (config.TABLE_MARKER_FOR_PAD in line):
table_started = not table_started
if not table_started: