Update html2text to (2016, 9, 19).

This commit is contained in:
Jim Miller 2017-04-29 12:19:07 -05:00
parent db7777b161
commit 7f4bc5c36e
5 changed files with 412 additions and 26 deletions

View file

@ -4,14 +4,13 @@
from __future__ import division from __future__ import division
import re import re
import sys import sys
import cgi
try: try:
from textwrap import wrap from textwrap import wrap
except ImportError: # pragma: no cover except ImportError: # pragma: no cover
pass pass
from html2text.compat import urlparse, HTMLParser from html2text.compat import urlparse, HTMLParser, html_escape
from html2text import config from html2text import config
from html2text.utils import ( from html2text.utils import (
@ -27,10 +26,11 @@ from html2text.utils import (
list_numbering_start, list_numbering_start,
dumb_css_parser, dumb_css_parser,
escape_md_section, escape_md_section,
skipwrap skipwrap,
pad_tables_in_text
) )
__version__ = (2016, 4, 2) __version__ = (2016, 9, 19)
# TODO: # TODO:
@ -68,6 +68,7 @@ class HTML2Text(HTMLParser.HTMLParser):
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
self.bypass_tables = config.BYPASS_TABLES # covered in cli self.bypass_tables = config.BYPASS_TABLES # covered in cli
self.ignore_tables = config.IGNORE_TABLES # covered in cli
self.google_doc = False # covered in cli self.google_doc = False # covered in cli
self.ul_item_mark = '*' # covered in cli self.ul_item_mark = '*' # covered in cli
self.emphasis_mark = '_' # covered in cli self.emphasis_mark = '_' # covered in cli
@ -77,6 +78,8 @@ class HTML2Text(HTMLParser.HTMLParser):
self.hide_strikethrough = False # covered in cli self.hide_strikethrough = False # covered in cli
self.mark_code = config.MARK_CODE self.mark_code = config.MARK_CODE
self.wrap_links = config.WRAP_LINKS # covered in cli self.wrap_links = config.WRAP_LINKS # covered in cli
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.tag_callback = None self.tag_callback = None
if out is None: # pragma: no cover if out is None: # pragma: no cover
@ -130,7 +133,11 @@ class HTML2Text(HTMLParser.HTMLParser):
def handle(self, data): def handle(self, data):
self.feed(data) self.feed(data)
self.feed("") self.feed("")
return self.optwrap(self.close()) markdown = self.optwrap(self.close())
if self.pad_tables:
return pad_tables_in_text(markdown)
else:
return markdown
def outtextf(self, s): def outtextf(self, s):
self.outtextlist.append(s) self.outtextlist.append(s)
@ -142,23 +149,20 @@ class HTML2Text(HTMLParser.HTMLParser):
try: try:
nochr = unicode('') nochr = unicode('')
unicode_character = unichr
except NameError: except NameError:
nochr = str('') nochr = str('')
unicode_character = chr
self.pbr() self.pbr()
self.o('', 0, 'end') self.o('', 0, 'end')
outtext = nochr.join(self.outtextlist) outtext = nochr.join(self.outtextlist)
if self.unicode_snob: if self.unicode_snob:
try: nbsp = unicode_character(name2cp('nbsp'))
nbsp = unichr(name2cp('nbsp'))
except NameError:
nbsp = chr(name2cp('nbsp'))
else: else:
try: nbsp = unicode_character(32)
nbsp = unichr(32)
except NameError:
nbsp = chr(32)
try: try:
outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp) outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
except NameError: except NameError:
@ -173,14 +177,14 @@ class HTML2Text(HTMLParser.HTMLParser):
def handle_charref(self, c): def handle_charref(self, c):
charref = self.charref(c) charref = self.charref(c)
if not self.code and not self.pre: if not self.code and not self.pre:
charref = cgi.escape(charref) charref = html_escape(charref)
self.handle_data(charref, True) self.handle_data(charref, True)
def handle_entityref(self, c): def handle_entityref(self, c):
entityref = self.entityref(c) entityref = self.entityref(c)
if (not self.code and not self.pre if (not self.code and not self.pre
and entityref != '&nbsp_place_holder;'): and entityref != '&nbsp_place_holder;'):
entityref = cgi.escape(entityref) entityref = html_escape(entityref)
self.handle_data(entityref, True) self.handle_data(entityref, True)
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
@ -331,6 +335,9 @@ class HTML2Text(HTMLParser.HTMLParser):
self.p() self.p()
if tag == "br" and start: if tag == "br" and start:
if self.blockquote > 0:
self.o(" \n> ")
else:
self.o(" \n") self.o(" \n")
if tag == "hr" and start: if tag == "hr" and start:
@ -439,7 +446,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if 'src' in attrs: if 'src' in attrs:
if not self.images_to_alt: if not self.images_to_alt:
attrs['href'] = attrs['src'] attrs['href'] = attrs['src']
alt = attrs.get('alt') or '' alt = attrs.get('alt') or self.default_image_alt
# If we have images_with_size, write raw html including width, # If we have images_with_size, write raw html including width,
# height, and alt attributes # height, and alt attributes
@ -541,7 +548,16 @@ class HTML2Text(HTMLParser.HTMLParser):
self.start = 1 self.start = 1
if tag in ["table", "tr", "td", "th"]: if tag in ["table", "tr", "td", "th"]:
if self.bypass_tables: if self.ignore_tables:
if tag == 'tr':
if start:
pass
else:
self.soft_br()
else:
pass
elif self.bypass_tables:
if start: if start:
self.soft_br() self.soft_br()
if tag in ["td", "th"]: if tag in ["td", "th"]:
@ -556,8 +572,16 @@ class HTML2Text(HTMLParser.HTMLParser):
self.o('</{0}>'.format(tag)) self.o('</{0}>'.format(tag))
else: else:
if tag == "table" and start: if tag == "table":
if start:
self.table_start = True self.table_start = True
if self.pad_tables:
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
else:
if self.pad_tables:
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
if tag in ["td", "th"] and start: if tag in ["td", "th"] and start:
if self.split_next_td: if self.split_next_td:
self.o("| ") self.o("| ")
@ -707,9 +731,6 @@ class HTML2Text(HTMLParser.HTMLParser):
self.outcount += 1 self.outcount += 1
def handle_data(self, data, entity_char=False): def handle_data(self, data, entity_char=False):
if r'\/script>' in data:
self.quiet -= 1
if self.style: if self.style:
self.style_def.update(dumb_css_parser(data)) self.style_def.update(dumb_css_parser(data))
@ -814,7 +835,9 @@ class HTML2Text(HTMLParser.HTMLParser):
for para in text.split("\n"): for para in text.split("\n"):
if len(para) > 0: if len(para) > 0:
if not skipwrap(para, self.wrap_links): if not skipwrap(para, self.wrap_links):
result += "\n".join(wrap(para, self.body_width)) result += "\n".join(
wrap(para, self.body_width, break_long_words=False)
)
if para.endswith(' '): if para.endswith(' '):
result += " \n" result += " \n"
newlines = 1 newlines = 1

View file

@ -0,0 +1,299 @@
import optparse
import warnings
from html2text.compat import urllib
from html2text import HTML2Text, config, __version__
from html2text.utils import wrapwrite, wrap_read
def main():
    """
    Command-line entry point: convert an HTML document to text.

    Positional arguments (all optional):

    * ``args[0]`` -- a filename or an ``http://`` / ``https://`` URL.  When it
      is missing or ``-`` the input is obtained from ``wrap_read()``.
    * ``args[1]`` -- the encoding used to decode the input (default
      ``utf-8``).

    More than two positional arguments is reported through
    ``OptionParser.error``.  Every command-line option is copied onto an
    ``HTML2Text`` instance and the converted document is written with
    ``wrapwrite``.

    A ``UnicodeDecodeError`` raised while decoding the input is re-raised
    after printing a hint about ``--decode-errors=ignore``.
    """
    baseurl = ''

    class bcolors:  # pragma: no cover
        # ANSI escape sequences used to colour the decode warning.
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser(
        '%prog [(filename|url) [encoding]]',
        version='%prog ' + ".".join(map(str, __version__))
    )
    p.add_option(
        "--default-image-alt",
        dest="default_image_alt",
        action="store",
        type="str",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones")
    p.add_option(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables"
    )
    p.add_option(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        # The flag stores False, i.e. it turns link wrapping off.
        help="don't wrap links during conversion"
    )
    p.add_option(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis"
    )
    p.add_option(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links"
    )
    p.add_option(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links")
    p.add_option(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help=("protect links from line breaks surrounding them " +
              "with angle brackets"))
    p.add_option(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images"
    )
    p.add_option(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text"
    )
    p.add_option(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions"
    )
    p.add_option(
        "-g", "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document"
    )
    p.add_option(
        "-d", "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items"
    )
    p.add_option(
        "-e", "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text"
    )
    p.add_option(
        "-b", "--body-width",
        dest="body_width",
        action="store",
        type="int",
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap"
    )
    p.add_option(
        "-i", "--google-list-indent",
        dest="list_indent",
        action="store",
        type="int",
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists"
    )
    p.add_option(
        "-s", "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
             "specified as well"
    )
    p.add_option(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help="Escape all special characters.  Output is less readable, but "
             "avoids corner case formatting issues."
    )
    p.add_option(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax."
    )
    p.add_option(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) while keeping rows."
    )
    p.add_option(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two "
            "line breaks. NOTE: Requires --body-width=0"
        )
    )
    p.add_option(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document"
    )
    p.add_option(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable"
    )
    p.add_option(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links"
    )
    p.add_option(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document"
    )
    p.add_option(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]"
    )
    p.add_option(
        "--decode-errors",
        dest="decode_errors",
        action="store",
        type="string",
        default=config.DECODE_ERRORS,
        help="What to do in case of decode errors. 'ignore', 'strict' and "
             "'replace' are acceptable values"
    )
    (options, args) = p.parse_args()

    # process input
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]

        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                          DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            try:
                data = j.read()
                # NOTE(review): ``encoding`` defaults to "utf-8" above, so
                # this auto-detection branch is currently never taken.
                if encoding is None:
                    try:
                        from feedparser import _getCharacterEncoding as enc
                    except ImportError:
                        def enc(x, y):
                            return ('utf-8', 1)
                    encoding = enc(j.headers, data)[0]
                    if encoding == 'us-ascii':
                        encoding = 'utf-8'
            finally:
                # Release the connection even if reading fails.
                j.close()
        else:
            # Context manager so the file handle is closed deterministically.
            with open(file_, 'rb') as input_file:
                data = input_file.read()
            # NOTE(review): unreachable for the same reason as above.
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    def detect(x):
                        return {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    # Only objects exposing ``decode`` need converting; anything else is
    # passed to the converter untouched.
    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError:
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
            print(warning)
            # Bare raise keeps the original traceback intact.
            raise

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.ignore_tables = options.ignore_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables
    h.default_image_alt = options.default_image_alt

    wrapwrite(h.handle(data))

View file

@ -6,8 +6,12 @@ if sys.version_info[0] == 2:
import urlparse import urlparse
import HTMLParser import HTMLParser
import urllib import urllib
from cgi import escape as html_escape
else: else:
import urllib.parse as urlparse import urllib.parse as urlparse
import html.entities as htmlentitydefs import html.entities as htmlentitydefs
import html.parser as HTMLParser import html.parser as HTMLParser
import urllib.request as urllib import urllib.request as urllib
from html import escape
def html_escape(s):
    """
    Escape ``&``, ``<`` and ``>`` in ``s`` for use in HTML.

    Quote characters are left untouched (``quote`` is passed as False).
    """
    return escape(s, False)

View file

@ -3,6 +3,8 @@ import re
# Use Unicode characters instead of their ascii pseudo-replacements # Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0 UNICODE_SNOB = 0
# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids # Escape all special characters. Output is less readable, but avoids
# corner case formatting issues. # corner case formatting issues.
ESCAPE_SNOB = 0 ESCAPE_SNOB = 0
@ -36,6 +38,8 @@ IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False IGNORE_EMPHASIS = False
MARK_CODE = False MARK_CODE = False
DECODE_ERRORS = 'strict' DECODE_ERRORS = 'strict'
DEFAULT_IMAGE_ALT = ''
PAD_TABLES = False
# Convert links with same href and text to <href> format if they are absolute links # Convert links with same href and text to <href> format if they are absolute links
USE_AUTOMATIC_LINKS = True USE_AUTOMATIC_LINKS = True
@ -116,7 +120,11 @@ UNIFIABLE = {
'rlm': '' 'rlm': ''
} }
# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False
# Use a single line break after a block element rather than two line breaks. # Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0. # NOTE: Requires body width setting to be 0.

View file

@ -31,7 +31,7 @@ def dumb_property_dict(style):
""" """
:returns: A hash of css attributes :returns: A hash of css attributes
""" """
out = dict([(x.strip(), y.strip()) for x, y in out = dict([(x.strip().lower(), y.strip().lower()) for x, y in
[z.split(':', 1) for z in [z.split(':', 1) for z in
style.split(';') if ':' in z style.split(';') if ':' in z
] ]
@ -149,7 +149,7 @@ def google_fixed_width_font(style):
font_family = '' font_family = ''
if 'font-family' in style: if 'font-family' in style:
font_family = style['font-family'] font_family = style['font-family']
if 'Courier New' == font_family or 'Consolas' == font_family: if 'courier new' == font_family or 'consolas' == font_family:
return True return True
return False return False
@ -244,3 +244,55 @@ def escape_md_section(text, snob=False):
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text) text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
return text return text
def reformat_table(lines, right_margin):
    """
    Pad the cells of a pipe-delimited table so the columns line up.

    :param lines: the rows of one table, each a string whose cells are
        separated by ``|``
    :param right_margin: number of filler characters kept to the right of
        the widest cell in every column
    :returns: a new list with one padded line per input line; an empty
        list when ``lines`` is empty

    Rows may have different numbers of cells (for instance because of
    colspan attributes).  Column widths are tracked for the widest row, so
    no cell is dropped; a shorter row simply keeps its own cell count.
    """
    if not lines:
        return []

    # Trailing whitespace is not part of a cell's content: drop it before
    # measuring and re-padding.
    rows = [[cell.rstrip() for cell in line.split('|')] for line in lines]

    # find the maximum width of the columns
    max_width = [0] * max(len(row) for row in rows)
    for row in rows:
        for index, cell in enumerate(row):
            max_width[index] = max(max_width[index], len(cell) + right_margin)

    # reformat
    new_lines = []
    for line, row in zip(lines, rows):
        # A row made up solely of '-' and '|' is the header separator and
        # is extended with dashes; every other row is padded with spaces.
        filler = '-' if set(line.strip()) == set('-|') else ' '
        new_cols = [cell + filler * (width - len(cell))
                    for cell, width in zip(row, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines
def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text

    Every line containing ``config.TABLE_MARKER_FOR_PAD`` toggles between
    "inside a table" and "outside a table" and is itself removed from the
    output.  The lines collected between an opening and a closing marker
    are passed to ``reformat_table`` and followed by one empty line.

    :param text: the text to process
    :param right_margin: forwarded to ``reformat_table``
    :returns: the text with its marked tables padded
    """
    new_lines = []
    table_buffer = []
    table_started = False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker.  An empty table has nothing to reformat,
                # so skip the call instead of handing over an empty list.
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin)
                    )
                table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # A table that was opened but never closed: emit the buffered lines
    # unchanged rather than silently dropping them.
    if table_buffer:
        new_lines.extend(table_buffer)
    return '\n'.join(new_lines)