Update html2text to (2016, 9, 19).

This commit is contained in:
Jim Miller 2017-04-29 12:19:07 -05:00
parent db7777b161
commit 7f4bc5c36e
5 changed files with 412 additions and 26 deletions

View file

@ -4,14 +4,13 @@
from __future__ import division
import re
import sys
import cgi
try:
from textwrap import wrap
except ImportError: # pragma: no cover
pass
from html2text.compat import urlparse, HTMLParser
from html2text.compat import urlparse, HTMLParser, html_escape
from html2text import config
from html2text.utils import (
@ -27,10 +26,11 @@ from html2text.utils import (
list_numbering_start,
dumb_css_parser,
escape_md_section,
skipwrap
skipwrap,
pad_tables_in_text
)
__version__ = (2016, 4, 2)
__version__ = (2016, 9, 19)
# TODO:
@ -68,6 +68,7 @@ class HTML2Text(HTMLParser.HTMLParser):
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
self.bypass_tables = config.BYPASS_TABLES # covered in cli
self.ignore_tables = config.IGNORE_TABLES # covered in cli
self.google_doc = False # covered in cli
self.ul_item_mark = '*' # covered in cli
self.emphasis_mark = '_' # covered in cli
@ -77,6 +78,8 @@ class HTML2Text(HTMLParser.HTMLParser):
self.hide_strikethrough = False # covered in cli
self.mark_code = config.MARK_CODE
self.wrap_links = config.WRAP_LINKS # covered in cli
self.pad_tables = config.PAD_TABLES # covered in cli
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
self.tag_callback = None
if out is None: # pragma: no cover
@ -130,7 +133,11 @@ class HTML2Text(HTMLParser.HTMLParser):
def handle(self, data):
self.feed(data)
self.feed("")
return self.optwrap(self.close())
markdown = self.optwrap(self.close())
if self.pad_tables:
return pad_tables_in_text(markdown)
else:
return markdown
def outtextf(self, s):
self.outtextlist.append(s)
@ -142,23 +149,20 @@ class HTML2Text(HTMLParser.HTMLParser):
try:
nochr = unicode('')
unicode_character = unichr
except NameError:
nochr = str('')
unicode_character = chr
self.pbr()
self.o('', 0, 'end')
outtext = nochr.join(self.outtextlist)
if self.unicode_snob:
try:
nbsp = unichr(name2cp('nbsp'))
except NameError:
nbsp = chr(name2cp('nbsp'))
nbsp = unicode_character(name2cp('nbsp'))
else:
try:
nbsp = unichr(32)
except NameError:
nbsp = chr(32)
nbsp = unicode_character(32)
try:
outtext = outtext.replace(unicode('&nbsp_place_holder;'), nbsp)
except NameError:
@ -173,14 +177,14 @@ class HTML2Text(HTMLParser.HTMLParser):
def handle_charref(self, c):
charref = self.charref(c)
if not self.code and not self.pre:
charref = cgi.escape(charref)
charref = html_escape(charref)
self.handle_data(charref, True)
def handle_entityref(self, c):
entityref = self.entityref(c)
if (not self.code and not self.pre
and entityref != '&nbsp_place_holder;'):
entityref = cgi.escape(entityref)
entityref = html_escape(entityref)
self.handle_data(entityref, True)
def handle_starttag(self, tag, attrs):
@ -331,6 +335,9 @@ class HTML2Text(HTMLParser.HTMLParser):
self.p()
if tag == "br" and start:
if self.blockquote > 0:
self.o(" \n> ")
else:
self.o(" \n")
if tag == "hr" and start:
@ -439,7 +446,7 @@ class HTML2Text(HTMLParser.HTMLParser):
if 'src' in attrs:
if not self.images_to_alt:
attrs['href'] = attrs['src']
alt = attrs.get('alt') or ''
alt = attrs.get('alt') or self.default_image_alt
# If we have images_with_size, write raw html including width,
# height, and alt attributes
@ -541,7 +548,16 @@ class HTML2Text(HTMLParser.HTMLParser):
self.start = 1
if tag in ["table", "tr", "td", "th"]:
if self.bypass_tables:
if self.ignore_tables:
if tag == 'tr':
if start:
pass
else:
self.soft_br()
else:
pass
elif self.bypass_tables:
if start:
self.soft_br()
if tag in ["td", "th"]:
@ -556,8 +572,16 @@ class HTML2Text(HTMLParser.HTMLParser):
self.o('</{0}>'.format(tag))
else:
if tag == "table" and start:
if tag == "table":
if start:
self.table_start = True
if self.pad_tables:
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
else:
if self.pad_tables:
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
self.o(" \n")
if tag in ["td", "th"] and start:
if self.split_next_td:
self.o("| ")
@ -707,9 +731,6 @@ class HTML2Text(HTMLParser.HTMLParser):
self.outcount += 1
def handle_data(self, data, entity_char=False):
if r'\/script>' in data:
self.quiet -= 1
if self.style:
self.style_def.update(dumb_css_parser(data))
@ -814,7 +835,9 @@ class HTML2Text(HTMLParser.HTMLParser):
for para in text.split("\n"):
if len(para) > 0:
if not skipwrap(para, self.wrap_links):
result += "\n".join(wrap(para, self.body_width))
result += "\n".join(
wrap(para, self.body_width, break_long_words=False)
)
if para.endswith(' '):
result += " \n"
newlines = 1

View file

@ -0,0 +1,299 @@
import optparse
import warnings
from html2text.compat import urllib
from html2text import HTML2Text, config, __version__
from html2text.utils import wrapwrite, wrap_read
def main():
    """
    Command-line entry point for html2text.

    Parses the command-line options, reads HTML from a file, a URL or
    standard input (when no filename is given or it is ``-``), decodes it
    with the requested encoding, configures an ``HTML2Text`` instance from
    the options and writes the converted text with ``wrapwrite``.

    An optional second positional argument selects the input encoding
    (default ``utf-8``); more than two positional arguments is an error.
    """
    baseurl = ''

    class bcolors:  # pragma: no cover
        # ANSI escape sequences used to colour the decode-error hint.
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser(
        '%prog [(filename|url) [encoding]]',
        version='%prog ' + ".".join(map(str, __version__))
    )
    p.add_option(
        "--default-image-alt",
        dest="default_image_alt",
        action="store",
        type="string",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones"
    )
    p.add_option(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables"
    )
    p.add_option(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        # The flag stores False, i.e. it turns link wrapping off.
        help="don't wrap links during conversion"
    )
    p.add_option(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis"
    )
    p.add_option(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links"
    )
    p.add_option(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links"
    )
    p.add_option(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help=("protect links from line breaks surrounding them " +
              "with angle brackets")
    )
    p.add_option(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images"
    )
    p.add_option(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text"
    )
    p.add_option(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions"
    )
    p.add_option(
        "-g", "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document"
    )
    p.add_option(
        "-d", "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items"
    )
    p.add_option(
        "-e", "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text"
    )
    p.add_option(
        "-b", "--body-width",
        dest="body_width",
        action="store",
        type="int",
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap"
    )
    p.add_option(
        "-i", "--google-list-indent",
        dest="list_indent",
        action="store",
        type="int",
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists"
    )
    p.add_option(
        "-s", "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
             "specified as well"
    )
    p.add_option(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help="Escape all special characters.  Output is less readable, but "
             "avoids corner case formatting issues."
    )
    p.add_option(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax."
    )
    p.add_option(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) "
             "while keeping rows."
    )
    p.add_option(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two "
            "line breaks. NOTE: Requires --body-width=0"
        )
    )
    p.add_option(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document"
    )
    p.add_option(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable"
    )
    p.add_option(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links"
    )
    p.add_option(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document"
    )
    p.add_option(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]"
    )
    p.add_option(
        "--decode-errors",
        dest="decode_errors",
        action="store",
        type="string",
        default=config.DECODE_ERRORS,
        help="What to do in case of decode errors.'ignore', 'strict' "
             "and 'replace' are acceptable values"
    )
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding always starts as "utf-8", so the
    # ``encoding is None`` auto-detection branches below never run as
    # written.  They are kept untouched to avoid changing behaviour.
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]

        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn(
                "Support for retrieving html over network is set for "
                "deprecation by version (2017, 1, x)",
                DeprecationWarning
            )
            baseurl = file_
            j = urllib.urlopen(baseurl)
            try:
                data = j.read()
                if encoding is None:
                    try:
                        from feedparser import _getCharacterEncoding as enc
                    except ImportError:
                        def enc(headers, body):
                            return ('utf-8', 1)
                    encoding = enc(j.headers, data)[0]
                    if encoding == 'us-ascii':
                        encoding = 'utf-8'
            finally:
                # Release the connection even if reading fails.
                j.close()
        else:
            # Context manager so the file handle is always closed.
            with open(file_, 'rb') as input_file:
                data = input_file.read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    def detect(raw):
                        return {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    # Only byte strings need decoding; text input is passed through as-is.
    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError:
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
            print(warning)
            # Bare raise re-raises the active exception unchanged.
            raise

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.ignore_tables = options.ignore_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables
    h.default_image_alt = options.default_image_alt

    wrapwrite(h.handle(data))

View file

@ -6,8 +6,12 @@ if sys.version_info[0] == 2:
import urlparse
import HTMLParser
import urllib
from cgi import escape as html_escape
else:
import urllib.parse as urlparse
import html.entities as htmlentitydefs
import html.parser as HTMLParser
import urllib.request as urllib
from html import escape
def html_escape(s):
    """Escape ``&``, ``<`` and ``>`` in *s*; quote characters are left as-is."""
    escaped = escape(s, quote=False)
    return escaped

View file

@ -3,6 +3,8 @@ import re
# Use Unicode characters instead of their ascii pseudo-replacements
UNICODE_SNOB = 0
# Marker to use for marking tables for padding post processing
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
# Escape all special characters. Output is less readable, but avoids
# corner case formatting issues.
ESCAPE_SNOB = 0
@ -36,6 +38,8 @@ IMAGES_WITH_SIZE = False
IGNORE_EMPHASIS = False
MARK_CODE = False
DECODE_ERRORS = 'strict'
DEFAULT_IMAGE_ALT = ''
PAD_TABLES = False
# Convert links with same href and text to <href> format if they are absolute links
USE_AUTOMATIC_LINKS = True
@ -116,7 +120,11 @@ UNIFIABLE = {
'rlm': ''
}
# Format tables in HTML rather than Markdown syntax
BYPASS_TABLES = False
# Ignore table-related tags (table, th, td, tr) while keeping rows
IGNORE_TABLES = False
# Use a single line break after a block element rather than two line breaks.
# NOTE: Requires body width setting to be 0.

View file

@ -31,7 +31,7 @@ def dumb_property_dict(style):
"""
:returns: A hash of css attributes
"""
out = dict([(x.strip(), y.strip()) for x, y in
out = dict([(x.strip().lower(), y.strip().lower()) for x, y in
[z.split(':', 1) for z in
style.split(';') if ':' in z
]
@ -149,7 +149,7 @@ def google_fixed_width_font(style):
font_family = ''
if 'font-family' in style:
font_family = style['font-family']
if 'Courier New' == font_family or 'Consolas' == font_family:
if 'courier new' == font_family or 'consolas' == font_family:
return True
return False
@ -244,3 +244,55 @@ def escape_md_section(text, snob=False):
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
return text
def reformat_table(lines, right_margin):
    """
    Given the lines of a table,
    pad the cells and return the new lines.

    :param lines: table rows as strings whose cells are separated by ``|``
    :param right_margin: number of filler characters kept to the right of
        the widest cell in every column
    :returns: a new list with one padded line per input line; an empty
        list when ``lines`` is empty

    Rows may have different numbers of cells. Column widths are computed
    over every row that has that column, and no cell is ever dropped.
    """
    if not lines:
        return []

    # Split once; trailing whitespace is not part of the cell content.
    rows = [[cell.rstrip() for cell in line.split('|')] for line in lines]

    # find the maximum width of the columns
    max_width = [0] * max(len(row) for row in rows)
    for row in rows:
        for index, cell in enumerate(row):
            max_width[index] = max(max_width[index], len(cell) + right_margin)

    # reformat
    new_lines = []
    for line, row in zip(lines, rows):
        # A line made only of '-' and '|' is the header separator and is
        # padded with dashes; every other line is padded with spaces.
        filler = '-' if set(line.strip()) == set('-|') else ' '
        # zip stops at the row's own length, so short rows keep their
        # original number of cells.
        new_cols = [cell + filler * (width - len(cell))
                    for cell, width in zip(row, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines
def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text.

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle table mode and
    are removed from the output. Lines collected between an opening and a
    closing marker are passed to ``reformat_table`` and followed by one
    empty line.

    :param text: the text to process
    :param right_margin: forwarded to ``reformat_table``
    :returns: the text with marker lines removed and marked tables padded
    """
    new_lines = []
    table_buffer = []
    table_started = False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker: flush the buffered table.  An empty
                # table has nothing to reformat.
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin)
                    )
                table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    # A marker that was opened but never closed must not swallow the rest
    # of the text: emit the buffered lines unchanged.
    new_lines.extend(table_buffer)
    return '\n'.join(new_lines)