mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Update the html2text package bundled with the plugin and web service to fix <, > and & in text output.
This commit is contained in:
parent
9186c2fae9
commit
8fb7f048b5
5 changed files with 211 additions and 84 deletions
|
|
@ -2,6 +2,7 @@
|
|||
# coding: utf-8
|
||||
"""html2text: Turn HTML into equivalent Markdown-structured text."""
|
||||
from __future__ import division
|
||||
from __future__ import unicode_literals
|
||||
import re
|
||||
import sys
|
||||
|
||||
|
|
@ -10,7 +11,7 @@ try:
|
|||
except ImportError: # pragma: no cover
|
||||
pass
|
||||
|
||||
from html2text.compat import urlparse, HTMLParser, html_escape
|
||||
from html2text.compat import urlparse, HTMLParser
|
||||
from html2text import config
|
||||
|
||||
from html2text.utils import (
|
||||
|
|
@ -30,7 +31,14 @@ from html2text.utils import (
|
|||
pad_tables_in_text
|
||||
)
|
||||
|
||||
__version__ = (2016, 9, 19)
|
||||
try:
|
||||
chr = unichr
|
||||
nochr = unicode('')
|
||||
except NameError:
|
||||
# python3 uses chr
|
||||
nochr = str('')
|
||||
|
||||
__version__ = (2018, 1, 9)
|
||||
|
||||
|
||||
# TODO:
|
||||
|
|
@ -81,6 +89,8 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.pad_tables = config.PAD_TABLES # covered in cli
|
||||
self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
|
||||
self.tag_callback = None
|
||||
self.open_quote = config.OPEN_QUOTE # covered in cli
|
||||
self.close_quote = config.CLOSE_QUOTE # covered in cli
|
||||
|
||||
if out is None: # pragma: no cover
|
||||
self.out = self.outtextf
|
||||
|
|
@ -106,6 +116,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.pre = 0
|
||||
self.startpre = 0
|
||||
self.code = False
|
||||
self.quote = False
|
||||
self.br_toggle = ''
|
||||
self.lastWasNL = 0
|
||||
self.lastWasList = False
|
||||
|
|
@ -119,6 +130,10 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.abbr_data = None # last inner HTML (for abbr being defined)
|
||||
self.abbr_list = {} # stack of abbreviations to write later
|
||||
self.baseurl = baseurl
|
||||
self.stressed = False
|
||||
self.preceding_stressed = False
|
||||
self.preceding_data = None
|
||||
self.current_tag = None
|
||||
|
||||
try:
|
||||
del unifiable_n[name2cp('nbsp')]
|
||||
|
|
@ -147,22 +162,15 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
def close(self):
|
||||
HTMLParser.HTMLParser.close(self)
|
||||
|
||||
try:
|
||||
nochr = unicode('')
|
||||
unicode_character = unichr
|
||||
except NameError:
|
||||
nochr = str('')
|
||||
unicode_character = chr
|
||||
|
||||
self.pbr()
|
||||
self.o('', 0, 'end')
|
||||
|
||||
outtext = nochr.join(self.outtextlist)
|
||||
|
||||
if self.unicode_snob:
|
||||
nbsp = unicode_character(name2cp('nbsp'))
|
||||
nbsp = chr(name2cp('nbsp'))
|
||||
else:
|
||||
nbsp = unicode_character(32)
|
||||
nbsp = chr(32)
|
||||
try:
|
||||
outtext = outtext.replace(unicode(' _place_holder;'), nbsp)
|
||||
except NameError:
|
||||
|
|
@ -175,17 +183,10 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
return outtext
|
||||
|
||||
def handle_charref(self, c):
|
||||
charref = self.charref(c)
|
||||
if not self.code and not self.pre:
|
||||
charref = html_escape(charref)
|
||||
self.handle_data(charref, True)
|
||||
self.handle_data(self.charref(c), True)
|
||||
|
||||
def handle_entityref(self, c):
|
||||
entityref = self.entityref(c)
|
||||
if (not self.code and not self.pre
|
||||
and entityref != ' _place_holder;'):
|
||||
entityref = html_escape(entityref)
|
||||
self.handle_data(entityref, True)
|
||||
self.handle_data(self.entityref(c), True)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self.handle_tag(tag, attrs, 1)
|
||||
|
|
@ -208,10 +209,11 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
i += 1
|
||||
match = 0
|
||||
|
||||
if ('href' in a) and a['href'] == attrs['href']:
|
||||
if ('title' in a) or ('title' in attrs):
|
||||
if (('title' in a) and ('title' in attrs) and
|
||||
a['title'] == attrs['title']):
|
||||
if 'href' in a and a['href'] == attrs['href']:
|
||||
if 'title' in a or 'title' in attrs:
|
||||
if 'title' in a and \
|
||||
'title' in attrs and \
|
||||
a['title'] == attrs['title']:
|
||||
match = True
|
||||
else:
|
||||
match = True
|
||||
|
|
@ -229,8 +231,16 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
# handle Google's text emphasis
|
||||
strikethrough = 'line-through' in \
|
||||
tag_emphasis and self.hide_strikethrough
|
||||
bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
|
||||
italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
|
||||
|
||||
# google and others may mark a font's weight as `bold` or `700`
|
||||
bold = False
|
||||
for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
|
||||
bold = (bold_marker in tag_emphasis
|
||||
and bold_marker not in parent_emphasis)
|
||||
if bold:
|
||||
break
|
||||
|
||||
italic = 'italic' in tag_emphasis and 'italic' not in parent_emphasis
|
||||
fixed = google_fixed_width_font(tag_style) and not \
|
||||
google_fixed_width_font(parent_style) and not self.pre
|
||||
|
||||
|
|
@ -282,6 +292,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.quiet -= 1
|
||||
|
||||
def handle_tag(self, tag, attrs, start):
|
||||
self.current_tag = tag
|
||||
# attrs is None for endtags
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
|
|
@ -292,10 +303,11 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
if self.tag_callback(self, tag, attrs, start) is True:
|
||||
return
|
||||
|
||||
# first thing inside the anchor tag is another tag that produces some output
|
||||
if (start and not self.maybe_automatic_link is None
|
||||
and tag not in ['p', 'div', 'style', 'dl', 'dt']
|
||||
and (tag != "img" or self.ignore_images)):
|
||||
# first thing inside the anchor tag is another tag
|
||||
# that produces some output
|
||||
if (start and self.maybe_automatic_link is not None and
|
||||
tag not in ['p', 'div', 'style', 'dl', 'dt'] and
|
||||
(tag != "img" or self.ignore_images)):
|
||||
self.o("[")
|
||||
self.maybe_automatic_link = None
|
||||
self.empty_link = False
|
||||
|
|
@ -312,7 +324,8 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
tag_style = element_style(attrs, self.style_def, parent_style)
|
||||
self.tag_stack.append((tag, attrs, tag_style))
|
||||
else:
|
||||
dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {})
|
||||
dummy, attrs, tag_style = self.tag_stack.pop() \
|
||||
if self.tag_stack else (None, {}, {})
|
||||
if self.tag_stack:
|
||||
parent_style = self.tag_stack[-1][2]
|
||||
|
||||
|
|
@ -331,6 +344,8 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.p()
|
||||
else:
|
||||
self.soft_br()
|
||||
elif self.astack and tag == 'div':
|
||||
pass
|
||||
else:
|
||||
self.p()
|
||||
|
||||
|
|
@ -370,24 +385,49 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.blockquote -= 1
|
||||
self.p()
|
||||
|
||||
def no_preceding_space(self):
    """Tell whether ``self.preceding_data`` ends in a non-whitespace character.

    The result is intended for truth testing only.  A falsy
    ``preceding_data`` (``None`` or an empty string) is handed back
    unchanged; otherwise the value is the ``re.match`` result for the
    final character: a match object when it is not whitespace, ``None``
    when it is.
    """
    previous = self.preceding_data
    if not previous:
        # Nothing recorded yet (or an empty chunk): report it as-is.
        return previous
    last_char = previous[-1]
    return re.match(r'[^\s]', last_char)
|
||||
|
||||
if tag in ['em', 'i', 'u'] and not self.ignore_emphasis:
|
||||
self.o(self.emphasis_mark)
|
||||
if tag in ['strong', 'b'] and not self.ignore_emphasis:
|
||||
self.o(self.strong_mark)
|
||||
if tag in ['del', 'strike', 's']:
|
||||
if start:
|
||||
self.o('~~')
|
||||
if start and no_preceding_space(self):
|
||||
emphasis = ' ' + self.emphasis_mark
|
||||
else:
|
||||
self.o('~~')
|
||||
emphasis = self.emphasis_mark
|
||||
|
||||
self.o(emphasis)
|
||||
if start:
|
||||
self.stressed = True
|
||||
|
||||
if tag in ['strong', 'b'] and not self.ignore_emphasis:
|
||||
if start and no_preceding_space(self):
|
||||
strong = ' ' + self.strong_mark
|
||||
else:
|
||||
strong = self.strong_mark
|
||||
|
||||
self.o(strong)
|
||||
if start:
|
||||
self.stressed = True
|
||||
|
||||
if tag in ['del', 'strike', 's']:
|
||||
if start and no_preceding_space(self):
|
||||
strike = ' ~~'
|
||||
else:
|
||||
strike = '~~'
|
||||
|
||||
self.o(strike)
|
||||
if start:
|
||||
self.stressed = True
|
||||
|
||||
if self.google_doc:
|
||||
if not self.inheader:
|
||||
# handle some font attributes, but leave headers clean
|
||||
self.handle_emphasis(start, tag_style, parent_style)
|
||||
|
||||
if tag in ["code", "tt"] and not self.pre:
|
||||
if tag in ["kbd", "code", "tt"] and not self.pre:
|
||||
self.o('`') # TODO: `` `this` ``
|
||||
self.code = not self.code
|
||||
|
||||
if tag == "abbr":
|
||||
if start:
|
||||
self.abbr_title = None
|
||||
|
|
@ -400,17 +440,30 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.abbr_title = None
|
||||
self.abbr_data = ''
|
||||
|
||||
if tag == "q":
|
||||
if not self.quote:
|
||||
self.o(self.open_quote)
|
||||
else:
|
||||
self.o(self.close_quote)
|
||||
self.quote = not self.quote
|
||||
|
||||
def link_url(self, link, title=""):
    """Write the closing part of an inline Markdown link through ``self.o``.

    ``link`` is resolved against ``self.baseurl`` with ``urljoin`` and the
    result is passed through ``escape_md``.  A ``title`` that is not blank
    is appended after one space, wrapped in double quotes; a blank title
    is left out, giving ``](url)``.  ``title`` must be a string because
    ``.strip()`` is called on it.
    """
    resolved = urlparse.urljoin(self.baseurl, link)
    if title.strip():
        title_part = ' "{0}"'.format(title)
    else:
        title_part = ''
    closing = ']({url}{title})'.format(url=escape_md(resolved),
                                       title=title_part)
    self.o(closing)
|
||||
|
||||
if tag == "a" and not self.ignore_links:
|
||||
if start:
|
||||
if ('href' in attrs) and \
|
||||
(attrs['href'] is not None) and \
|
||||
not (self.skip_internal_links and
|
||||
attrs['href'].startswith('#')):
|
||||
if 'href' in attrs and \
|
||||
attrs['href'] is not None and not \
|
||||
(self.skip_internal_links and
|
||||
attrs['href'].startswith('#')):
|
||||
self.astack.append(attrs)
|
||||
self.maybe_automatic_link = attrs['href']
|
||||
self.empty_link = True
|
||||
if self.protect_links:
|
||||
attrs['href'] = '<'+attrs['href']+'>'
|
||||
attrs['href'] = '<' + attrs['href'] + '>'
|
||||
else:
|
||||
self.astack.append(None)
|
||||
else:
|
||||
|
|
@ -425,12 +478,12 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.maybe_automatic_link = None
|
||||
if self.inline_links:
|
||||
try:
|
||||
title = escape_md(a['title'])
|
||||
title = a['title'] if a['title'] else ''
|
||||
title = escape_md(title)
|
||||
except KeyError:
|
||||
self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")")
|
||||
link_url(self, a['href'], '')
|
||||
else:
|
||||
self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href']))
|
||||
+ ' "' + title + '" )')
|
||||
link_url(self, a['href'], title)
|
||||
else:
|
||||
i = self.previousIndex(a)
|
||||
if i is not None:
|
||||
|
|
@ -463,7 +516,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
return
|
||||
|
||||
# If we have a link to create, output the start
|
||||
if not self.maybe_automatic_link is None:
|
||||
if self.maybe_automatic_link is not None:
|
||||
href = self.maybe_automatic_link
|
||||
if self.images_to_alt and escape_md(alt) == href and \
|
||||
self.absolute_url_matcher.match(href):
|
||||
|
|
@ -483,7 +536,16 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.o("![" + escape_md(alt) + "]")
|
||||
if self.inline_links:
|
||||
href = attrs.get('href') or ''
|
||||
self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")")
|
||||
self.o(
|
||||
"(" +
|
||||
escape_md(
|
||||
urlparse.urljoin(
|
||||
self.baseurl,
|
||||
href
|
||||
)
|
||||
) +
|
||||
")"
|
||||
)
|
||||
else:
|
||||
i = self.previousIndex(attrs)
|
||||
if i is not None:
|
||||
|
|
@ -576,11 +638,11 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
if start:
|
||||
self.table_start = True
|
||||
if self.pad_tables:
|
||||
self.o("<"+config.TABLE_MARKER_FOR_PAD+">")
|
||||
self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
|
||||
self.o(" \n")
|
||||
else:
|
||||
if self.pad_tables:
|
||||
self.o("</"+config.TABLE_MARKER_FOR_PAD+">")
|
||||
self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
|
||||
self.o(" \n")
|
||||
if tag in ["td", "th"] and start:
|
||||
if self.split_next_td:
|
||||
|
|
@ -654,8 +716,9 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
return
|
||||
|
||||
if self.startpre:
|
||||
#self.out(" :") #TODO: not output when already one there
|
||||
if not data.startswith("\n"): # <pre>stuff...
|
||||
# self.out(" :") #TODO: not output when already one there
|
||||
if not data.startswith("\n") and not data.startswith("\r\n"):
|
||||
# <pre>stuff...
|
||||
data = "\n" + data
|
||||
if self.mark_code:
|
||||
self.out("\n[code]")
|
||||
|
|
@ -668,7 +731,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
if self.pre:
|
||||
if not self.list:
|
||||
bq += " "
|
||||
#else: list content is already partially indented
|
||||
# else: list content is already partially indented
|
||||
for i in range(len(self.list)):
|
||||
bq += " "
|
||||
data = data.replace("\n", "\n" + bq)
|
||||
|
|
@ -700,8 +763,8 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.out(' ')
|
||||
self.space = 0
|
||||
|
||||
if self.a and ((self.p_p == 2 and self.links_each_paragraph)
|
||||
or force == "end"):
|
||||
if self.a and ((self.p_p == 2 and self.links_each_paragraph) or
|
||||
force == "end"):
|
||||
if force == "end":
|
||||
self.out("\n")
|
||||
|
||||
|
|
@ -731,13 +794,25 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
self.outcount += 1
|
||||
|
||||
def handle_data(self, data, entity_char=False):
|
||||
if self.stressed:
|
||||
data = data.strip()
|
||||
self.stressed = False
|
||||
self.preceding_stressed = True
|
||||
elif (self.preceding_stressed
|
||||
and re.match(r'[^\s.!?]', data[0])
|
||||
and not hn(self.current_tag)
|
||||
and self.current_tag not in ['a', 'code', 'pre']):
|
||||
# should match a letter or common punctuation
|
||||
data = ' ' + data
|
||||
self.preceding_stressed = False
|
||||
|
||||
if self.style:
|
||||
self.style_def.update(dumb_css_parser(data))
|
||||
|
||||
if not self.maybe_automatic_link is None:
|
||||
if self.maybe_automatic_link is not None:
|
||||
href = self.maybe_automatic_link
|
||||
if (href == data and self.absolute_url_matcher.match(href)
|
||||
and self.use_automatic_links):
|
||||
if (href == data and self.absolute_url_matcher.match(href) and
|
||||
self.use_automatic_links):
|
||||
self.o("<" + data + ">")
|
||||
self.empty_link = False
|
||||
return
|
||||
|
|
@ -748,6 +823,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
|
||||
if not self.code and not self.pre and not entity_char:
|
||||
data = escape_md_section(data, snob=self.escape_snob)
|
||||
self.preceding_data = data
|
||||
self.o(data, 1)
|
||||
|
||||
def unknown_decl(self, data): # pragma: no cover
|
||||
|
|
@ -764,10 +840,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
return unifiable_n[c]
|
||||
else:
|
||||
try:
|
||||
try:
|
||||
return unichr(c)
|
||||
except NameError: # Python3
|
||||
return chr(c)
|
||||
return chr(c)
|
||||
except ValueError: # invalid unicode
|
||||
return ''
|
||||
|
||||
|
|
@ -783,10 +856,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
if c == 'nbsp':
|
||||
return config.UNIFIABLE[c]
|
||||
else:
|
||||
try:
|
||||
return unichr(name2cp(c))
|
||||
except NameError: # Python3
|
||||
return chr(name2cp(c))
|
||||
return chr(name2cp(c))
|
||||
|
||||
def replaceEntities(self, s):
|
||||
s = s.group(1)
|
||||
|
|
@ -809,7 +879,7 @@ class HTML2Text(HTMLParser.HTMLParser):
|
|||
nest_count = 0
|
||||
if 'margin-left' in style:
|
||||
nest_count = int(style['margin-left'][:-2]) \
|
||||
// self.google_list_indent
|
||||
// self.google_list_indent
|
||||
|
||||
return nest_count
|
||||
|
||||
|
|
|
|||
|
|
@ -158,7 +158,8 @@ def main():
|
|||
action="store_true",
|
||||
dest="ignore_tables",
|
||||
default=config.IGNORE_TABLES,
|
||||
help="Ignore table-related tags (table, th, td, tr) while keeping rows."
|
||||
help="Ignore table-related tags (table, th, td, tr) "
|
||||
"while keeping rows."
|
||||
)
|
||||
p.add_option(
|
||||
"--single-line-break",
|
||||
|
|
@ -211,7 +212,24 @@ def main():
|
|||
action="store",
|
||||
type="string",
|
||||
default=config.DECODE_ERRORS,
|
||||
help="What to do in case of decode errors.'ignore', 'strict' and 'replace' are acceptable values"
|
||||
help="What to do in case of decode errors.'ignore', 'strict' and "
|
||||
"'replace' are acceptable values"
|
||||
)
|
||||
p.add_option(
|
||||
"--open-quote",
|
||||
dest="open_quote",
|
||||
action="store",
|
||||
type="str",
|
||||
default=config.OPEN_QUOTE,
|
||||
help="The character used to open quotes",
|
||||
)
|
||||
p.add_option(
|
||||
"--close-quote",
|
||||
dest="close_quote",
|
||||
action="store",
|
||||
type="str",
|
||||
default=config.CLOSE_QUOTE,
|
||||
help="The character used to close quotes",
|
||||
)
|
||||
(options, args) = p.parse_args()
|
||||
|
||||
|
|
@ -226,8 +244,11 @@ def main():
|
|||
file_ = args[0]
|
||||
|
||||
if file_.startswith('http://') or file_.startswith('https://'):
|
||||
warnings.warn("Support for retrieving html over network is set for deprecation by version (2017, 1, x)",
|
||||
DeprecationWarning)
|
||||
warnings.warn(
|
||||
"Support for retrieving html over network is set for "
|
||||
"deprecation by version (2017, 1, x)",
|
||||
DeprecationWarning
|
||||
)
|
||||
baseurl = file_
|
||||
j = urllib.urlopen(baseurl)
|
||||
data = j.read()
|
||||
|
|
@ -235,7 +256,8 @@ def main():
|
|||
try:
|
||||
from feedparser import _getCharacterEncoding as enc
|
||||
except ImportError:
|
||||
enc = lambda x, y: ('utf-8', 1)
|
||||
def enc(x, y):
    """Stand-in used when feedparser's encoding detector cannot be imported.

    Both arguments are ignored; the result is always the pair
    ``('utf-8', 1)``.
    """
    fallback = 'utf-8', 1
    return fallback
|
||||
encoding = enc(j.headers, data)[0]
|
||||
if encoding == 'us-ascii':
|
||||
encoding = 'utf-8'
|
||||
|
|
@ -245,7 +267,8 @@ def main():
|
|||
try:
|
||||
from chardet import detect
|
||||
except ImportError:
|
||||
detect = lambda x: {'encoding': 'utf-8'}
|
||||
def detect(x):
    """Stand-in used when ``chardet.detect`` cannot be imported.

    The argument is ignored; the returned mapping always names UTF-8
    under the ``'encoding'`` key.
    """
    return dict(encoding='utf-8')
|
||||
encoding = detect(data)['encoding']
|
||||
else:
|
||||
data = wrap_read()
|
||||
|
|
@ -295,5 +318,7 @@ def main():
|
|||
h.wrap_links = options.wrap_links
|
||||
h.pad_tables = options.pad_tables
|
||||
h.default_image_alt = options.default_image_alt
|
||||
h.open_quote = options.open_quote
|
||||
h.close_quote = options.close_quote
|
||||
|
||||
wrapwrite(h.handle(data))
|
||||
|
|
|
|||
|
|
@ -13,5 +13,9 @@ else:
|
|||
import html.parser as HTMLParser
|
||||
import urllib.request as urllib
|
||||
from html import escape
|
||||
|
||||
def html_escape(s):
    """Return *s* with ``&``, ``<`` and ``>`` replaced by HTML entities.

    Quote characters are left untouched (``quote=False``).
    """
    escaped = escape(s, quote=False)
    return escaped
|
||||
|
||||
|
||||
__all__ = ['HTMLParser', 'html_escape', 'htmlentitydefs', 'urllib', 'urlparse']
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
# Use Unicode characters instead of their ascii psuedo-replacements
|
||||
# Use Unicode characters instead of their ascii pseudo-replacements
|
||||
UNICODE_SNOB = 0
|
||||
|
||||
# Marker to use for marking tables for padding post processing
|
||||
|
|
@ -31,6 +33,9 @@ WRAP_LINKS = True
|
|||
# Number of pixels Google indents nested lists
|
||||
GOOGLE_LIST_INDENT = 36
|
||||
|
||||
# Values Google and others may use to indicate bold text
|
||||
BOLD_TEXT_STYLE_VALUES = ('bold', '700', '800', '900')
|
||||
|
||||
IGNORE_ANCHORS = False
|
||||
IGNORE_IMAGES = False
|
||||
IMAGES_TO_ALT = False
|
||||
|
|
@ -41,7 +46,8 @@ DECODE_ERRORS = 'strict'
|
|||
DEFAULT_IMAGE_ALT = ''
|
||||
PAD_TABLES = False
|
||||
|
||||
# Convert links with same href and text to <href> format if they are absolute links
|
||||
# Convert links with same href and text to <href> format
|
||||
# if they are absolute links
|
||||
USE_AUTOMATIC_LINKS = True
|
||||
|
||||
# For checking space-only lines on line 771
|
||||
|
|
@ -52,7 +58,10 @@ RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s')
|
|||
RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
|
||||
RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
|
||||
RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
|
||||
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text
|
||||
|
||||
# to find links in the text
|
||||
RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")
|
||||
|
||||
RE_MD_DOT_MATCHER = re.compile(r"""
|
||||
^ # start of line
|
||||
(\s*\d+) # optional whitespace and a number
|
||||
|
|
@ -126,6 +135,11 @@ BYPASS_TABLES = False
|
|||
IGNORE_TABLES = False
|
||||
|
||||
|
||||
# Use a single line break after a block element rather an two line breaks.
|
||||
# Use a single line break after a block element rather than two line breaks.
|
||||
# NOTE: Requires body width setting to be 0.
|
||||
SINGLE_LINE_BREAK = False
|
||||
|
||||
|
||||
# Use double quotation marks when converting the <q> tag.
|
||||
OPEN_QUOTE = '"'
|
||||
CLOSE_QUOTE = '"'
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ def name2cp(k):
|
|||
|
||||
|
||||
unifiable_n = {}
|
||||
|
||||
for k in config.UNIFIABLE.keys():
|
||||
unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
|
||||
|
||||
|
|
@ -191,7 +190,7 @@ def skipwrap(para, wrap_links):
|
|||
# I'm not sure what this is for; I thought it was to detect lists,
|
||||
# but there's a <br>-inside-<span> case in one of the tests that
|
||||
# also depends upon it.
|
||||
if stripped[0:1] == '-' or stripped[0:1] == '*':
|
||||
if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**':
|
||||
return True
|
||||
|
||||
# If the text begins with a single -, *, or +, followed by a space,
|
||||
|
|
@ -245,6 +244,7 @@ def escape_md_section(text, snob=False):
|
|||
|
||||
return text
|
||||
|
||||
|
||||
def reformat_table(lines, right_margin):
|
||||
"""
|
||||
Given the lines of a table
|
||||
|
|
@ -252,11 +252,24 @@ def reformat_table(lines, right_margin):
|
|||
"""
|
||||
# find the maximum width of the columns
|
||||
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
|
||||
max_cols = len(max_width)
|
||||
for line in lines:
|
||||
cols = [x.rstrip() for x in line.split('|')]
|
||||
num_cols = len(cols)
|
||||
|
||||
# don't drop any data if colspan attributes result in unequal lengths
|
||||
if num_cols < max_cols:
|
||||
cols += [''] * (max_cols - num_cols)
|
||||
elif max_cols < num_cols:
|
||||
max_width += [
|
||||
len(x) + right_margin for x in
|
||||
cols[-(num_cols - max_cols):]
|
||||
]
|
||||
max_cols = num_cols
|
||||
|
||||
max_width = [max(len(x) + right_margin, old_len)
|
||||
for x, old_len in zip(cols, max_width)]
|
||||
|
||||
|
||||
# reformat
|
||||
new_lines = []
|
||||
for line in lines:
|
||||
|
|
@ -272,15 +285,16 @@ def reformat_table(lines, right_margin):
|
|||
new_lines.append('|'.join(new_cols))
|
||||
return new_lines
|
||||
|
||||
|
||||
def pad_tables_in_text(text, right_margin=1):
|
||||
"""
|
||||
Provide padding for tables in the text
|
||||
"""
|
||||
lines = text.split('\n')
|
||||
table_buffer, altered_lines, table_widths, table_started = [], [], [], False
|
||||
table_buffer, table_started = [], False
|
||||
new_lines = []
|
||||
for line in lines:
|
||||
# Toogle table started
|
||||
# Toggle table started
|
||||
if (config.TABLE_MARKER_FOR_PAD in line):
|
||||
table_started = not table_started
|
||||
if not table_started:
|
||||
|
|
|
|||
Loading…
Reference in a new issue