mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-07 17:33:05 +01:00
Update html2text to (2016, 9, 19).
This commit is contained in:
parent
db7777b161
commit
7f4bc5c36e
5 changed files with 412 additions and 26 deletions
|
|
@ -4,14 +4,13 @@
|
||||||
from __future__ import division
|
from __future__ import division
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import cgi
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from textwrap import wrap
|
from textwrap import wrap
|
||||||
except ImportError: # pragma: no cover
|
except ImportError: # pragma: no cover
|
||||||
pass
|
pass
|
||||||
|
|
||||||
from html2text.compat import urlparse, HTMLParser
|
from html2text.compat import urlparse, HTMLParser, html_escape
|
||||||
from html2text import config
|
from html2text import config
|
||||||
|
|
||||||
from html2text.utils import (
|
from html2text.utils import (
|
||||||
|
|
@ -27,10 +26,11 @@ from html2text.utils import (
|
||||||
list_numbering_start,
|
list_numbering_start,
|
||||||
dumb_css_parser,
|
dumb_css_parser,
|
||||||
escape_md_section,
|
escape_md_section,
|
||||||
skipwrap
|
skipwrap,
|
||||||
|
pad_tables_in_text
|
||||||
)
|
)
|
||||||
|
|
||||||
__version__ = (2016, 4, 2)
|
__version__ = (2016, 9, 19)
|
||||||
|
|
||||||
|
|
||||||
# TODO:
|
# TODO:
|
||||||
|
|
@ -68,6 +68,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
|
self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
|
||||||
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
|
self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
|
||||||
self.bypass_tables = config.BYPASS_TABLES # covered in cli
|
self.bypass_tables = config.BYPASS_TABLES # covered in cli
|
||||||
|
self.ignore_tables = config.IGNORE_TABLES # covered in cli
|
||||||
self.google_doc = False # covered in cli
|
self.google_doc = False # covered in cli
|
||||||
self.ul_item_mark = '*' # covered in cli
|
self.ul_item_mark = '*' # covered in cli
|
||||||
self.emphasis_mark = '_' # covered in cli
|
self.emphasis_mark = '_' # covered in cli
|
||||||
|
|
@ -77,6 +78,8 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.hide_strikethrough = False # covered in cli
|
self.hide_strikethrough = False # covered in cli
|
||||||
self.mark_code = config.MARK_CODE
|
self.mark_code = config.MARK_CODE
|
||||||
self.wrap_links = config.WRAP_LINKS # covered in cli
|
self.wrap_links = config.WRAP_LINKS # covered in cli
|
||||||
|
self.pad_tables = config.PAD_TABLES # covered in cli
|
||||||
|
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
|
||||||
self.tag_callback = None
|
self.tag_callback = None
|
||||||
|
|
||||||
if out is None: # pragma: no cover
|
if out is None: # pragma: no cover
|
||||||
|
|
@ -130,7 +133,11 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
def handle(self, data):
|
def handle(self, data):
|
||||||
self.feed(data)
|
self.feed(data)
|
||||||
self.feed("")
|
self.feed("")
|
||||||
return self.optwrap(self.close())
|
markdown = self.optwrap(self.close())
|
||||||
|
if self.pad_tables:
|
||||||
|
return pad_tables_in_text(markdown)
|
||||||
|
else:
|
||||||
|
return markdown
|
||||||
|
|
||||||
def outtextf(self, s):
|
def outtextf(self, s):
|
||||||
self.outtextlist.append(s)
|
self.outtextlist.append(s)
|
||||||
|
|
@ -142,23 +149,20 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
nochr = unicode('')
|
nochr = unicode('')
|
||||||
|
unicode_character = unichr
|
||||||
except NameError:
|
except NameError:
|
||||||
nochr = str('')
|
nochr = str('')
|
||||||
|
unicode_character = chr
|
||||||
|
|
||||||
self.pbr()
|
self.pbr()
|
||||||
self.o('', 0, 'end')
|
self.o('', 0, 'end')
|
||||||
|
|
||||||
outtext = nochr.join(self.outtextlist)
|
outtext = nochr.join(self.outtextlist)
|
||||||
|
|
||||||
if self.unicode_snob:
|
if self.unicode_snob:
|
||||||
try:
|
nbsp = unicode_character(name2cp('nbsp'))
|
||||||
nbsp = unichr(name2cp('nbsp'))
|
|
||||||
except NameError:
|
|
||||||
nbsp = chr(name2cp('nbsp'))
|
|
||||||
else:
|
else:
|
||||||
try:
|
nbsp = unicode_character(32)
|
||||||
nbsp = unichr(32)
|
|
||||||
except NameError:
|
|
||||||
nbsp = chr(32)
|
|
||||||
try:
|
try:
|
||||||
outtext = outtext.replace(unicode(' _place_holder;'), nbsp)
|
outtext = outtext.replace(unicode(' _place_holder;'), nbsp)
|
||||||
except NameError:
|
except NameError:
|
||||||
|
|
@ -173,14 +177,14 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
def handle_charref(self, c):
|
def handle_charref(self, c):
|
||||||
charref = self.charref(c)
|
charref = self.charref(c)
|
||||||
if not self.code and not self.pre:
|
if not self.code and not self.pre:
|
||||||
charref = cgi.escape(charref)
|
charref = html_escape(charref)
|
||||||
self.handle_data(charref, True)
|
self.handle_data(charref, True)
|
||||||
|
|
||||||
def handle_entityref(self, c):
|
def handle_entityref(self, c):
|
||||||
entityref = self.entityref(c)
|
entityref = self.entityref(c)
|
||||||
if (not self.code and not self.pre
|
if (not self.code and not self.pre
|
||||||
and entityref != ' _place_holder;'):
|
and entityref != ' _place_holder;'):
|
||||||
entityref = cgi.escape(entityref)
|
entityref = html_escape(entityref)
|
||||||
self.handle_data(entityref, True)
|
self.handle_data(entityref, True)
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
|
|
@ -331,6 +335,9 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.p()
|
self.p()
|
||||||
|
|
||||||
if tag == "br" and start:
|
if tag == "br" and start:
|
||||||
|
if self.blockquote > 0:
|
||||||
|
self.o(" \n> ")
|
||||||
|
else:
|
||||||
self.o(" \n")
|
self.o(" \n")
|
||||||
|
|
||||||
if tag == "hr" and start:
|
if tag == "hr" and start:
|
||||||
|
|
@ -439,7 +446,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
if 'src' in attrs:
|
if 'src' in attrs:
|
||||||
if not self.images_to_alt:
|
if not self.images_to_alt:
|
||||||
attrs['href'] = attrs['src']
|
attrs['href'] = attrs['src']
|
||||||
alt = attrs.get('alt') or ''
|
alt = attrs.get('alt') or self.default_image_alt
|
||||||
|
|
||||||
# If we have images_with_size, write raw html including width,
|
# If we have images_with_size, write raw html including width,
|
||||||
# height, and alt attributes
|
# height, and alt attributes
|
||||||
|
|
@ -541,7 +548,16 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.start = 1
|
self.start = 1
|
||||||
|
|
||||||
if tag in ["table", "tr", "td", "th"]:
|
if tag in ["table", "tr", "td", "th"]:
|
||||||
if self.bypass_tables:
|
if self.ignore_tables:
|
||||||
|
if tag == 'tr':
|
||||||
|
if start:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
self.soft_br()
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif self.bypass_tables:
|
||||||
if start:
|
if start:
|
||||||
self.soft_br()
|
self.soft_br()
|
||||||
if tag in ["td", "th"]:
|
if tag in ["td", "th"]:
|
||||||
|
|
@ -556,8 +572,16 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.o('</{0}>'.format(tag))
|
self.o('</{0}>'.format(tag))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if tag == "table" and start:
|
if tag == "table":
|
||||||
|
if start:
|
||||||
self.table_start = True
|
self.table_start = True
|
||||||
|
if self.pad_tables:
|
||||||
|
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
|
||||||
|
self.o(" \n")
|
||||||
|
else:
|
||||||
|
if self.pad_tables:
|
||||||
|
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
|
||||||
|
self.o(" \n")
|
||||||
if tag in ["td", "th"] and start:
|
if tag in ["td", "th"] and start:
|
||||||
if self.split_next_td:
|
if self.split_next_td:
|
||||||
self.o("| ")
|
self.o("| ")
|
||||||
|
|
@ -707,9 +731,6 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
self.outcount += 1
|
self.outcount += 1
|
||||||
|
|
||||||
def handle_data(self, data, entity_char=False):
|
def handle_data(self, data, entity_char=False):
|
||||||
if r'\/script>' in data:
|
|
||||||
self.quiet -= 1
|
|
||||||
|
|
||||||
if self.style:
|
if self.style:
|
||||||
self.style_def.update(dumb_css_parser(data))
|
self.style_def.update(dumb_css_parser(data))
|
||||||
|
|
||||||
|
|
@ -814,7 +835,9 @@ class HTML2Text(HTMLParser.HTMLParser):
|
||||||
for para in text.split("\n"):
|
for para in text.split("\n"):
|
||||||
if len(para) > 0:
|
if len(para) > 0:
|
||||||
if not skipwrap(para, self.wrap_links):
|
if not skipwrap(para, self.wrap_links):
|
||||||
result += "\n".join(wrap(para, self.body_width))
|
result += "\n".join(
|
||||||
|
wrap(para, self.body_width, break_long_words=False)
|
||||||
|
)
|
||||||
if para.endswith(' '):
|
if para.endswith(' '):
|
||||||
result += " \n"
|
result += " \n"
|
||||||
newlines = 1
|
newlines = 1
|
||||||
|
|
|
||||||
299
included_dependencies/html2text/cli.py
Normal file
299
included_dependencies/html2text/cli.py
Normal file
|
|
@ -0,0 +1,299 @@
|
||||||
|
import optparse
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
from html2text.compat import urllib
|
||||||
|
from html2text import HTML2Text, config, __version__
|
||||||
|
from html2text.utils import wrapwrite, wrap_read
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point: convert an HTML document to Markdown text.

    Usage is ``%prog [(filename|url) [encoding]]``.  The input is read from
    the named file, fetched from an ``http://``/``https://`` URL (deprecated),
    or taken from ``wrap_read()`` when no filename is given or it is ``-``.
    Byte input is decoded with the requested encoding (default ``utf-8``)
    using the ``--decode-errors`` policy, the parsed options are copied onto
    an ``HTML2Text`` instance and the converted text is passed to
    ``wrapwrite()``.

    :raises UnicodeDecodeError: re-raised, after printing a hint about
        ``--decode-errors=ignore``, when the input cannot be decoded.
    """
    baseurl = ''

    class bcolors:  # pragma: no cover
        # ANSI escape sequences used to colour the decode warning below
        HEADER = '\033[95m'
        OKBLUE = '\033[94m'
        OKGREEN = '\033[92m'
        WARNING = '\033[93m'
        FAIL = '\033[91m'
        ENDC = '\033[0m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'

    p = optparse.OptionParser(
        '%prog [(filename|url) [encoding]]',
        version='%prog ' + ".".join(map(str, __version__))
    )
    p.add_option(
        "--default-image-alt",
        dest="default_image_alt",
        action="store",
        type="str",
        default=config.DEFAULT_IMAGE_ALT,
        help="The default alt string for images with missing ones")
    p.add_option(
        "--pad-tables",
        dest="pad_tables",
        action="store_true",
        default=config.PAD_TABLES,
        help="pad the cells to equal column width in tables"
    )
    p.add_option(
        "--no-wrap-links",
        dest="wrap_links",
        action="store_false",
        default=config.WRAP_LINKS,
        help="wrap links during conversion"
    )
    p.add_option(
        "--ignore-emphasis",
        dest="ignore_emphasis",
        action="store_true",
        default=config.IGNORE_EMPHASIS,
        help="don't include any formatting for emphasis"
    )
    p.add_option(
        "--reference-links",
        dest="inline_links",
        action="store_false",
        default=config.INLINE_LINKS,
        help="use reference style links instead of inline links"
    )
    p.add_option(
        "--ignore-links",
        dest="ignore_links",
        action="store_true",
        default=config.IGNORE_ANCHORS,
        help="don't include any formatting for links")
    p.add_option(
        "--protect-links",
        dest="protect_links",
        action="store_true",
        default=config.PROTECT_LINKS,
        help=("protect links from line breaks surrounding them " +
              "with angle brackets"))
    p.add_option(
        "--ignore-images",
        dest="ignore_images",
        action="store_true",
        default=config.IGNORE_IMAGES,
        help="don't include any formatting for images"
    )
    p.add_option(
        "--images-to-alt",
        dest="images_to_alt",
        action="store_true",
        default=config.IMAGES_TO_ALT,
        help="Discard image data, only keep alt text"
    )
    p.add_option(
        "--images-with-size",
        dest="images_with_size",
        action="store_true",
        default=config.IMAGES_WITH_SIZE,
        help="Write image tags with height and width attrs as raw html to "
             "retain dimensions"
    )
    p.add_option(
        "-g", "--google-doc",
        action="store_true",
        dest="google_doc",
        default=False,
        help="convert an html-exported Google Document"
    )
    p.add_option(
        "-d", "--dash-unordered-list",
        action="store_true",
        dest="ul_style_dash",
        default=False,
        help="use a dash rather than a star for unordered list items"
    )
    p.add_option(
        "-e", "--asterisk-emphasis",
        action="store_true",
        dest="em_style_asterisk",
        default=False,
        help="use an asterisk rather than an underscore for emphasized text"
    )
    p.add_option(
        "-b", "--body-width",
        dest="body_width",
        action="store",
        type="int",
        default=config.BODY_WIDTH,
        help="number of characters per output line, 0 for no wrap"
    )
    p.add_option(
        "-i", "--google-list-indent",
        dest="list_indent",
        action="store",
        type="int",
        default=config.GOOGLE_LIST_INDENT,
        help="number of pixels Google indents nested lists"
    )
    p.add_option(
        "-s", "--hide-strikethrough",
        action="store_true",
        dest="hide_strikethrough",
        default=False,
        help="hide strike-through text. only relevant when -g is "
             "specified as well"
    )
    p.add_option(
        "--escape-all",
        action="store_true",
        dest="escape_snob",
        default=False,
        help="Escape all special characters.  Output is less readable, but "
             "avoids corner case formatting issues."
    )
    p.add_option(
        "--bypass-tables",
        action="store_true",
        dest="bypass_tables",
        default=config.BYPASS_TABLES,
        help="Format tables in HTML rather than Markdown syntax."
    )
    p.add_option(
        "--ignore-tables",
        action="store_true",
        dest="ignore_tables",
        default=config.IGNORE_TABLES,
        help="Ignore table-related tags (table, th, td, tr) while keeping rows."
    )
    p.add_option(
        "--single-line-break",
        action="store_true",
        dest="single_line_break",
        default=config.SINGLE_LINE_BREAK,
        help=(
            "Use a single line break after a block element rather than two "
            "line breaks. NOTE: Requires --body-width=0"
        )
    )
    p.add_option(
        "--unicode-snob",
        action="store_true",
        dest="unicode_snob",
        default=config.UNICODE_SNOB,
        help="Use unicode throughout document"
    )
    p.add_option(
        "--no-automatic-links",
        action="store_false",
        dest="use_automatic_links",
        default=config.USE_AUTOMATIC_LINKS,
        help="Do not use automatic links wherever applicable"
    )
    p.add_option(
        "--no-skip-internal-links",
        action="store_false",
        dest="skip_internal_links",
        default=config.SKIP_INTERNAL_LINKS,
        help="Do not skip internal links"
    )
    p.add_option(
        "--links-after-para",
        action="store_true",
        dest="links_each_paragraph",
        default=config.LINKS_EACH_PARAGRAPH,
        help="Put links after each paragraph instead of document"
    )
    p.add_option(
        "--mark-code",
        action="store_true",
        dest="mark_code",
        default=config.MARK_CODE,
        help="Mark program code blocks with [code]...[/code]"
    )
    p.add_option(
        "--decode-errors",
        dest="decode_errors",
        action="store",
        type="string",
        default=config.DECODE_ERRORS,
        help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
    )
    (options, args) = p.parse_args()

    # process input
    # NOTE(review): encoding always starts as "utf-8", so the
    # ``encoding is None`` auto-detection branches below never run as
    # written.  They are kept unchanged to preserve current behaviour.
    encoding = "utf-8"
    if len(args) == 2:
        encoding = args[1]
    elif len(args) > 2:
        p.error('Too many arguments')

    if len(args) > 0 and args[0] != '-':  # pragma: no cover
        file_ = args[0]

        if file_.startswith('http://') or file_.startswith('https://'):
            warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
                          DeprecationWarning)
            baseurl = file_
            j = urllib.urlopen(baseurl)
            # try/finally rather than ``with``: the Python 2 urlopen result
            # is not a context manager.
            try:
                data = j.read()
                if encoding is None:
                    try:
                        from feedparser import _getCharacterEncoding as enc
                    except ImportError:
                        enc = lambda x, y: ('utf-8', 1)
                    encoding = enc(j.headers, data)[0]
                    if encoding == 'us-ascii':
                        encoding = 'utf-8'
            finally:
                j.close()
        else:
            # Close the file deterministically instead of leaking the handle
            with open(file_, 'rb') as input_file:
                data = input_file.read()
            if encoding is None:
                try:
                    from chardet import detect
                except ImportError:
                    detect = lambda x: {'encoding': 'utf-8'}
                encoding = detect(data)['encoding']
    else:
        data = wrap_read()

    # Only byte-like input exposes ``decode``; already-decoded text is
    # passed through untouched.
    if hasattr(data, 'decode'):
        try:
            try:
                data = data.decode(encoding, errors=options.decode_errors)
            except TypeError:
                # python 2.6.x does not have the errors option
                data = data.decode(encoding)
        except UnicodeDecodeError:
            warning = bcolors.WARNING + "Warning:" + bcolors.ENDC
            warning += ' Use the ' + bcolors.OKGREEN
            warning += '--decode-errors=ignore' + bcolors.ENDC + ' flag.'
            print(warning)
            # bare raise keeps the original traceback
            raise

    h = HTML2Text(baseurl=baseurl)
    # handle options
    if options.ul_style_dash:
        h.ul_item_mark = '-'
    if options.em_style_asterisk:
        h.emphasis_mark = '*'
        h.strong_mark = '__'

    h.body_width = options.body_width
    h.google_list_indent = options.list_indent
    h.ignore_emphasis = options.ignore_emphasis
    h.ignore_links = options.ignore_links
    h.protect_links = options.protect_links
    h.ignore_images = options.ignore_images
    h.images_to_alt = options.images_to_alt
    h.images_with_size = options.images_with_size
    h.google_doc = options.google_doc
    h.hide_strikethrough = options.hide_strikethrough
    h.escape_snob = options.escape_snob
    h.bypass_tables = options.bypass_tables
    h.ignore_tables = options.ignore_tables
    h.single_line_break = options.single_line_break
    h.inline_links = options.inline_links
    h.unicode_snob = options.unicode_snob
    h.use_automatic_links = options.use_automatic_links
    h.skip_internal_links = options.skip_internal_links
    h.links_each_paragraph = options.links_each_paragraph
    h.mark_code = options.mark_code
    h.wrap_links = options.wrap_links
    h.pad_tables = options.pad_tables
    h.default_image_alt = options.default_image_alt

    wrapwrite(h.handle(data))
|
||||||
|
|
@ -6,8 +6,12 @@ if sys.version_info[0] == 2:
|
||||||
import urlparse
|
import urlparse
|
||||||
import HTMLParser
|
import HTMLParser
|
||||||
import urllib
|
import urllib
|
||||||
|
from cgi import escape as html_escape
|
||||||
else:
|
else:
|
||||||
import urllib.parse as urlparse
|
import urllib.parse as urlparse
|
||||||
import html.entities as htmlentitydefs
|
import html.entities as htmlentitydefs
|
||||||
import html.parser as HTMLParser
|
import html.parser as HTMLParser
|
||||||
import urllib.request as urllib
|
import urllib.request as urllib
|
||||||
|
from html import escape
|
||||||
|
def html_escape(s):
|
||||||
|
return escape(s, quote=False)
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,8 @@ import re
|
||||||
# Use Unicode characters instead of their ascii pseudo-replacements
|
# Use Unicode characters instead of their ascii pseudo-replacements
|
||||||
UNICODE_SNOB = 0
|
UNICODE_SNOB = 0
|
||||||
|
|
||||||
|
# Marker to use for marking tables for padding post processing
|
||||||
|
TABLE_MARKER_FOR_PAD = "special_marker_for_table_padding"
|
||||||
# Escape all special characters. Output is less readable, but avoids
|
# Escape all special characters. Output is less readable, but avoids
|
||||||
# corner case formatting issues.
|
# corner case formatting issues.
|
||||||
ESCAPE_SNOB = 0
|
ESCAPE_SNOB = 0
|
||||||
|
|
@ -36,6 +38,8 @@ IMAGES_WITH_SIZE = False
|
||||||
IGNORE_EMPHASIS = False
|
IGNORE_EMPHASIS = False
|
||||||
MARK_CODE = False
|
MARK_CODE = False
|
||||||
DECODE_ERRORS = 'strict'
|
DECODE_ERRORS = 'strict'
|
||||||
|
DEFAULT_IMAGE_ALT = ''
|
||||||
|
PAD_TABLES = False
|
||||||
|
|
||||||
# Convert links with same href and text to <href> format if they are absolute links
|
# Convert links with same href and text to <href> format if they are absolute links
|
||||||
USE_AUTOMATIC_LINKS = True
|
USE_AUTOMATIC_LINKS = True
|
||||||
|
|
@ -116,7 +120,11 @@ UNIFIABLE = {
|
||||||
'rlm': ''
|
'rlm': ''
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Format tables in HTML rather than Markdown syntax
|
||||||
BYPASS_TABLES = False
|
BYPASS_TABLES = False
|
||||||
|
# Ignore table-related tags (table, th, td, tr) while keeping rows
|
||||||
|
IGNORE_TABLES = False
|
||||||
|
|
||||||
|
|
||||||
# Use a single line break after a block element rather than two line breaks.
|
# Use a single line break after a block element rather than two line breaks.
|
||||||
# NOTE: Requires body width setting to be 0.
|
# NOTE: Requires body width setting to be 0.
|
||||||
|
|
|
||||||
|
|
@ -31,7 +31,7 @@ def dumb_property_dict(style):
|
||||||
"""
|
"""
|
||||||
:returns: A hash of css attributes
|
:returns: A hash of css attributes
|
||||||
"""
|
"""
|
||||||
out = dict([(x.strip(), y.strip()) for x, y in
|
out = dict([(x.strip().lower(), y.strip().lower()) for x, y in
|
||||||
[z.split(':', 1) for z in
|
[z.split(':', 1) for z in
|
||||||
style.split(';') if ':' in z
|
style.split(';') if ':' in z
|
||||||
]
|
]
|
||||||
|
|
@ -149,7 +149,7 @@ def google_fixed_width_font(style):
|
||||||
font_family = ''
|
font_family = ''
|
||||||
if 'font-family' in style:
|
if 'font-family' in style:
|
||||||
font_family = style['font-family']
|
font_family = style['font-family']
|
||||||
if 'Courier New' == font_family or 'Consolas' == font_family:
|
if 'courier new' == font_family or 'consolas' == font_family:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
@ -244,3 +244,55 @@ def escape_md_section(text, snob=False):
|
||||||
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
|
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def reformat_table(lines, right_margin):
|
||||||
|
"""
|
||||||
|
Given the lines of a table
|
||||||
|
padds the cells and returns the new lines
|
||||||
|
"""
|
||||||
|
# find the maximum width of the columns
|
||||||
|
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split('|')]
|
||||||
|
max_width = [max(len(x) + right_margin, old_len)
|
||||||
|
for x, old_len in zip(cols, max_width)]
|
||||||
|
|
||||||
|
# reformat
|
||||||
|
new_lines = []
|
||||||
|
for line in lines:
|
||||||
|
cols = [x.rstrip() for x in line.split('|')]
|
||||||
|
if set(line.strip()) == set('-|'):
|
||||||
|
filler = '-'
|
||||||
|
new_cols = [x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)]
|
||||||
|
else:
|
||||||
|
filler = ' '
|
||||||
|
new_cols = [x.rstrip() + (filler * (M - len(x.rstrip())))
|
||||||
|
for x, M in zip(cols, max_width)]
|
||||||
|
new_lines.append('|'.join(new_cols))
|
||||||
|
return new_lines
|
||||||
|
|
||||||
|
def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text.

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle "inside table"
    state and are themselves removed from the output.  Lines collected
    between an opening and a closing marker are aligned with
    ``reformat_table`` and followed by one empty line.  All other lines
    are copied through unchanged.

    :param text: the text to process
    :param right_margin: forwarded to ``reformat_table``
    :returns: the text with every marked table padded
    """
    new_lines = []
    table_buffer = []
    table_started = False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # closing marker: flush the buffered table, if any
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin)
                    )
                table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # A table whose closing marker never arrived: emit the buffered lines
    # untouched rather than silently dropping them.  They are not padded
    # because the end of the table cannot be determined.
    new_lines.extend(table_buffer)
    return '\n'.join(new_lines)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue