diff --git a/fanficfare/html2text.py b/fanficfare/html2text.py deleted file mode 100644 index 20296a49..00000000 --- a/fanficfare/html2text.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "2.37" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] - -# TODO: -# Support decoded entities with unifiable. - -if not hasattr(__builtins__, 'True'): True, False = 1, 0 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types -import sgmllib -import urlparse -sgmllib.charref = re.compile('([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') - -try: from textwrap import wrap -except: pass - -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. 
-SKIP_INTERNAL_LINKS = False - -### Entity Nonsense ### - -def name2cp(k): - if k == 'apos': return ord("'") - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - else: - k = htmlentitydefs.entitydefs[k] - if k.startswith("") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 - return ord(codecs.latin_1_decode(k)[0]) - -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', -'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} - -unifiable_n = {} - -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] - -def charref(name): - if name[0] in ['x','X']: - c = int(name[1:], 16) - else: - c = int(name) - - if not UNICODE_SNOB and c in unifiable_n.keys(): - return unifiable_n[c] - else: - return unichr(c) - -def entityref(c): - if not UNICODE_SNOB and c in unifiable.keys(): - return unifiable[c] - else: - try: name2cp(c) - except KeyError: return "&" + c - else: return unichr(name2cp(c)) - -def replaceEntities(s): - s = s.group(1) - if s[0] == "#": - return charref(s[1:]) - else: return entityref(s) - -r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") -def unescape(s): - return r_unescape.sub(replaceEntities, s) - -def fixattrs(attrs): - # Fix bug in sgmllib.py - if not attrs: return attrs - newattrs = [] - for attr in attrs: - newattrs.append((attr[0], unescape(attr[1]))) - return newattrs - -### End Entity Nonsense ### - -def onlywhite(line): - """Return true if the line does only consist of whitespace characters.""" - for c in line: - if c is not ' ' and c is not ' ': - return c is ' ' - 
return line - -def optwrap(text,wrap_width=BODY_WIDTH): - """Wrap all paragraphs in the provided text.""" - - if not wrap_width: - return text - - assert wrap, "Requires Python 2.3." - result = '' - newlines = 0 - for para in text.split("\n"): - if len(para) > 0: - if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': - for line in wrap(para, wrap_width): - result += line + "\n" - result += "\n" - newlines = 2 - else: - if not onlywhite(para): - result += para + "\n" - newlines = 1 - else: - if newlines < 2: - result += "\n" - newlines += 1 - return result - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): return n - except ValueError: return 0 - -class _html2text(sgmllib.SGMLParser): - def __init__(self, out=None, baseurl=''): - sgmllib.SGMLParser.__init__(self) - - if out is None: self.out = self.outtextf - else: self.out = out - self.outtext = u'' - self.quiet = 0 - self.p_p = 0 - self.outcount = 0 - self.start = 1 - self.space = 0 - self.a = [] - self.astack = [] - self.acount = 0 - self.list = [] - self.blockquote = 0 - self.pre = 0 - self.startpre = 0 - self.lastWasNL = 0 - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later - self.baseurl = baseurl - - def outtextf(self, s): - self.outtext += s - - def close(self): - sgmllib.SGMLParser.close(self) - - self.pbr() - self.o('', 0, 'end') - - return self.outtext - - def handle_charref(self, c): - self.o(charref(c)) - - def handle_entityref(self, c): - self.o(entityref(c)) - - def unknown_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) - - def unknown_endtag(self, tag): - self.handle_tag(tag, None, 0) - - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not 
attrs.has_attr('href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if a.has_attr('href') and a['href'] == attrs['href']: - if a.has_attr('title') or attrs.has_attr('title'): - if (a.has_attr('title') and attrs.has_attr('title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - - def handle_tag(self, tag, attrs, start): - attrs = fixattrs(attrs) - - if hn(tag): - self.p() - if start: self.o(hn(tag)*"#" + ' ') - - if tag in ['p', 'div']: self.p() - - if tag == "br" and start: self.o(" \n") - - if tag == "hr" and start: - self.p() - self.o("* * *") - self.p() - - if tag in ["head", "style", 'script']: - if start: self.quiet += 1 - else: self.quiet -= 1 - - if tag in ["body"]: - self.quiet = 0 # sites like 9rules.com never close
- - if tag == "blockquote": - if start: - self.p(); self.o('> ', 0, 1); self.start = 1 - self.blockquote += 1 - else: - self.blockquote -= 1 - self.p() - - if tag in ['em', 'i', 'u']: self.o("_") - if tag in ['strong', 'b']: self.o("**") - if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` - if tag == "abbr": - if start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - - self.abbr_title = None - self.abbr_data = '' - if attrs.has_attr('title'): - self.abbr_title = attrs['title'] - else: - if self.abbr_title != None: - self.abbr_list[self.abbr_data] = self.abbr_title - self.abbr_title = None - self.abbr_data = '' - - if tag == "a": - if start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - if attrs.has_attr('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): - self.astack.append(attrs) - self.o("[") - else: - self.astack.append(None) - else: - if self.astack: - a = self.astack.pop() - if a: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + `a['count']` + "]") - - if tag == "img" and start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - if attrs.has_attr('src'): - attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("![") - self.o(alt) - self.o("]["+`attrs['count']`+"]") - - if tag == 'dl' and start: self.p() - if tag == 'dt' and not start: self.pbr() - if tag == 'dd' and start: self.o(' ') - if tag == 'dd' and not start: self.pbr() - - if tag in ["ol", "ul"]: - if start: - self.list.append({'name':tag, 'num':0}) - else: - if self.list: self.list.pop() - - self.p() - - if tag == 'li': - if start: - self.pbr() - if self.list: li = 
self.list[-1] - else: li = {'name':'ul', 'num':0} - self.o(" "*len(self.list)) #TODO: line upstuff...
+ data = "\n" + data
+ if self.mark_code:
+ self.out("\n[code]")
+ self.p_p = 0
+
+ bq = (">" * self.blockquote)
+ if not (force and data and data[0] == ">") and self.blockquote:
+ bq += " "
+
+ if self.pre:
+ if not self.list:
+ bq += " "
+ #else: list content is already partially indented
+ for i in range(len(self.list)):
+ bq += " "
+ data = data.replace("\n", "\n" + bq)
+
+ if self.startpre:
+ self.startpre = 0
+ if self.list:
+ # use existing initial indentation
+ data = data.lstrip("\n")
+
+ if self.start:
+ self.space = 0
+ self.p_p = 0
+ self.start = 0
+
+ if force == 'end':
+ # It's the end.
+ self.p_p = 0
+ self.out("\n")
+ self.space = 0
+
+ if self.p_p:
+ self.out((self.br_toggle + '\n' + bq) * self.p_p)
+ self.space = 0
+ self.br_toggle = ''
+
+ if self.space:
+ if not self.lastWasNL:
+ self.out(' ')
+ self.space = 0
+
+ if self.a and ((self.p_p == 2 and self.links_each_paragraph)
+ or force == "end"):
+ if force == "end":
+ self.out("\n")
+
+ newa = []
+ for link in self.a:
+ if self.outcount > link['outcount']:
+ self.out(" [" + str(link['count']) + "]: " +
+ urlparse.urljoin(self.baseurl, link['href']))
+ if 'title' in link:
+ self.out(" (" + link['title'] + ")")
+ self.out("\n")
+ else:
+ newa.append(link)
+
+ # Don't need an extra line when nothing was done.
+ if self.a != newa:
+ self.out("\n")
+
+ self.a = newa
+
+ if self.abbr_list and force == "end":
+ for abbr, definition in self.abbr_list.items():
+ self.out(" *[" + abbr + "]: " + definition + "\n")
+
+ self.p_p = 0
+ self.out(data)
+ self.outcount += 1
+
+ def handle_data(self, data, entity_char=False):
+ if r'\/script>' in data:
+ self.quiet -= 1
+
+ if self.style:
+ self.style_def.update(dumb_css_parser(data))
+
+ if not self.maybe_automatic_link is None:
+ href = self.maybe_automatic_link
+ if (href == data and self.absolute_url_matcher.match(href)
+ and self.use_automatic_links):
+ self.o("<" + data + ">")
+ self.empty_link = False
+ return
+ else:
+ self.o("[")
+ self.maybe_automatic_link = None
+ self.empty_link = False
+
+ if not self.code and not self.pre and not entity_char:
+ data = escape_md_section(data, snob=self.escape_snob)
+ self.o(data, 1)
+
+ def unknown_decl(self, data): # pragma: no cover
+ # TODO: what is this doing here?
+ pass
+
+ def charref(self, name):
+ if name[0] in ['x', 'X']:
+ c = int(name[1:], 16)
+ else:
+ c = int(name)
+
+ if not self.unicode_snob and c in unifiable_n.keys():
+ return unifiable_n[c]
+ else:
+ try:
+ try:
+ return unichr(c)
+ except NameError: # Python3
+ return chr(c)
+ except ValueError: # invalid unicode
+ return ''
+
+ def entityref(self, c):
+ if not self.unicode_snob and c in config.UNIFIABLE.keys():
+ return config.UNIFIABLE[c]
+ else:
+ try:
+ name2cp(c)
+ except KeyError:
+ return "&" + c + ';'
+ else:
+ if c == 'nbsp':
+ return config.UNIFIABLE[c]
+ else:
+ try:
+ return unichr(name2cp(c))
+ except NameError: # Python3
+ return chr(name2cp(c))
+
+ def replaceEntities(self, s):
+ s = s.group(1)
+ if s[0] == "#":
+ return self.charref(s[1:])
+ else:
+ return self.entityref(s)
+
+ def unescape(self, s):
+ return config.RE_UNESCAPE.sub(self.replaceEntities, s)
+
+ def google_nest_count(self, style):
+ """
+ Calculate the nesting count of google doc lists
+
+ :type style: dict
+
+ :rtype: int
+ """
+ nest_count = 0
+ if 'margin-left' in style:
+ nest_count = int(style['margin-left'][:-2]) \
+ // self.google_list_indent
+
+ return nest_count
+
+ def optwrap(self, text):
+ """
+ Wrap all paragraphs in the provided text.
+
+ :type text: str
+
+ :rtype: str
+ """
+ if not self.body_width:
+ return text
+
+ assert wrap, "Requires Python 2.3."
+ result = ''
+ newlines = 0
+ # I cannot think of a better solution for now.
+ # To avoid the non-wrap behaviour for entire paras
+ # because of the presence of a link in it
+ if not self.wrap_links:
+ self.inline_links = False
+ for para in text.split("\n"):
+ if len(para) > 0:
+ if not skipwrap(para, self.wrap_links):
+ result += "\n".join(wrap(para, self.body_width))
+ if para.endswith(' '):
+ result += " \n"
+ newlines = 1
+ else:
+ result += "\n\n"
+ newlines = 2
+ else:
+ # Warning for the tempted!!!
+ # Replacing this check with para.isspace() changes behaviour:
+ # config.RE_SPACE is r'\s\+' (one whitespace character followed by
+ # a literal '+'), so it is not a whitespace-only test.
+ if not config.RE_SPACE.match(para):
+ result += para + "\n"
+ newlines = 1
+ else:
+ if newlines < 2:
+ result += "\n"
+ newlines += 1
+ return result
+
+
+def html2text(html, baseurl='', bodywidth=None):
+ if bodywidth is None:
+ bodywidth = config.BODY_WIDTH
+ h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
+
+ return h.handle(html)
+
+
+def unescape(s, unicode_snob=False):
+ h = HTML2Text()
+ h.unicode_snob = unicode_snob
+
+ return h.unescape(s)
+
+
+if __name__ == "__main__":
+ from html2text.cli import main
+
+ main()
diff --git a/included_dependencies/html2text/compat.py b/included_dependencies/html2text/compat.py
new file mode 100644
index 00000000..2120a41b
--- /dev/null
+++ b/included_dependencies/html2text/compat.py
@@ -0,0 +1,13 @@
+import sys
+
+
+if sys.version_info[0] == 2:
+ import htmlentitydefs
+ import urlparse
+ import HTMLParser
+ import urllib
+else:
+ import urllib.parse as urlparse
+ import html.entities as htmlentitydefs
+ import html.parser as HTMLParser
+ import urllib.request as urllib
diff --git a/included_dependencies/html2text/config.py b/included_dependencies/html2text/config.py
new file mode 100644
index 00000000..85bf47dc
--- /dev/null
+++ b/included_dependencies/html2text/config.py
@@ -0,0 +1,123 @@
+import re
+
+# Use Unicode characters instead of their ASCII pseudo-replacements
+UNICODE_SNOB = 0
+
+# Escape all special characters. Output is less readable, but avoids
+# corner case formatting issues.
+ESCAPE_SNOB = 0
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = 0
+
+# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
+BODY_WIDTH = 78
+
+# Don't show internal links (href="#local-anchor") -- corresponding link
+# targets won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = True
+
+# Use inline, rather than reference, formatting for images and links
+INLINE_LINKS = True
+
+# Protect links from line breaks surrounding them with angle brackets (in
+# addition to their square brackets)
+PROTECT_LINKS = False
+# WRAP_LINKS = True
+WRAP_LINKS = True
+
+# Number of pixels Google indents nested lists
+GOOGLE_LIST_INDENT = 36
+
+IGNORE_ANCHORS = False
+IGNORE_IMAGES = False
+IMAGES_TO_ALT = False
+IMAGES_WITH_SIZE = False
+IGNORE_EMPHASIS = False
+MARK_CODE = False
+DECODE_ERRORS = 'strict'
+
+# Convert links with same href and text to <href> format if they are absolute links
+USE_AUTOMATIC_LINKS = True
+
+# For checking space-only lines in optwrap()
+RE_SPACE = re.compile(r'\s\+')
+
+RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s')
+RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
+RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
+RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
+RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)") # to find links in the text
+RE_MD_DOT_MATCHER = re.compile(r"""
+ ^ # start of line
+ (\s*\d+) # optional whitespace and a number
+ (\.) # dot
+ (?=\s) # lookahead assert whitespace
+ """, re.MULTILINE | re.VERBOSE)
+RE_MD_PLUS_MATCHER = re.compile(r"""
+ ^
+ (\s*)
+ (\+)
+ (?=\s)
+ """, flags=re.MULTILINE | re.VERBOSE)
+RE_MD_DASH_MATCHER = re.compile(r"""
+ ^
+ (\s*)
+ (-)
+ (?=\s|\-) # followed by whitespace (bullet list, or spaced out hr)
+ # or another dash (header or hr)
+ """, flags=re.MULTILINE | re.VERBOSE)
+RE_SLASH_CHARS = r'\`*_{}[]()#+-.!'
+RE_MD_BACKSLASH_MATCHER = re.compile(r'''
+ (\\) # match one slash
+ (?=[%s]) # followed by a char that requires escaping
+ ''' % re.escape(RE_SLASH_CHARS),
+ flags=re.VERBOSE)
+
+UNIFIABLE = {
+ 'rsquo': "'",
+ 'lsquo': "'",
+ 'rdquo': '"',
+ 'ldquo': '"',
+ 'copy': '(C)',
+ 'mdash': '--',
+ 'nbsp': ' ',
+ 'rarr': '->',
+ 'larr': '<-',
+ 'middot': '*',
+ 'ndash': '-',
+ 'oelig': 'oe',
+ 'aelig': 'ae',
+ 'agrave': 'a',
+ 'aacute': 'a',
+ 'acirc': 'a',
+ 'atilde': 'a',
+ 'auml': 'a',
+ 'aring': 'a',
+ 'egrave': 'e',
+ 'eacute': 'e',
+ 'ecirc': 'e',
+ 'euml': 'e',
+ 'igrave': 'i',
+ 'iacute': 'i',
+ 'icirc': 'i',
+ 'iuml': 'i',
+ 'ograve': 'o',
+ 'oacute': 'o',
+ 'ocirc': 'o',
+ 'otilde': 'o',
+ 'ouml': 'o',
+ 'ugrave': 'u',
+ 'uacute': 'u',
+ 'ucirc': 'u',
+ 'uuml': 'u',
+ 'lrm': '',
+ 'rlm': ''
+}
+
+BYPASS_TABLES = False
+
+# Use a single line break after a block element rather than two line breaks.
+# NOTE: Requires body width setting to be 0.
+SINGLE_LINE_BREAK = False
diff --git a/included_dependencies/html2text/utils.py b/included_dependencies/html2text/utils.py
new file mode 100644
index 00000000..418c89cb
--- /dev/null
+++ b/included_dependencies/html2text/utils.py
@@ -0,0 +1,246 @@
+import sys
+
+from html2text import config
+from html2text.compat import htmlentitydefs
+
+
+def name2cp(k):
+ """Return the Unicode code point for the HTML entity name k"""
+ if k == 'apos':
+ return ord("'")
+ return htmlentitydefs.name2codepoint[k]
+
+
+unifiable_n = {}
+
+for k in config.UNIFIABLE.keys():
+ unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
+
+
+def hn(tag):
+ if tag[0] == 'h' and len(tag) == 2:
+ try:
+ n = int(tag[1])
+ if n in range(1, 10): # pragma: no branch
+ return n
+ except ValueError:
+ return 0
+
+
+def dumb_property_dict(style):
+ """
+ :returns: A hash of css attributes
+ """
+ out = dict([(x.strip(), y.strip()) for x, y in
+ [z.split(':', 1) for z in
+ style.split(';') if ':' in z
+ ]
+ ]
+ )
+
+ return out
+
+
+def dumb_css_parser(data):
+ """
+ :type data: str
+
+ :returns: A hash of css selectors, each of which contains a hash of
+ css attributes.
+ :rtype: dict
+ """
+ # remove @import sentences
+ data += ';'
+ importIndex = data.find('@import')
+ while importIndex != -1:
+ data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
+ importIndex = data.find('@import')
+
+ # parse the css. reverted from dictionary comprehension in order to
+ # support older pythons
+ elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
+ try:
+ elements = dict([(a.strip(), dumb_property_dict(b))
+ for a, b in elements])
+ except ValueError: # pragma: no cover
+ elements = {} # not that important
+
+ return elements
+
+
+def element_style(attrs, style_def, parent_style):
+ """
+ :type attrs: dict
+ :type style_def: dict
+ :type parent_style: dict
+
+ :returns: A hash of the 'final' style attributes of the element
+ :rtype: dict
+ """
+ style = parent_style.copy()
+ if 'class' in attrs:
+ for css_class in attrs['class'].split():
+ css_style = style_def.get('.' + css_class, {})
+ style.update(css_style)
+ if 'style' in attrs:
+ immediate_style = dumb_property_dict(attrs['style'])
+ style.update(immediate_style)
+
+ return style
+
+
+def google_list_style(style):
+ """
+ Finds out whether this is an ordered or unordered list
+
+ :type style: dict
+
+ :rtype: str
+ """
+ if 'list-style-type' in style:
+ list_style = style['list-style-type']
+ if list_style in ['disc', 'circle', 'square', 'none']:
+ return 'ul'
+
+ return 'ol'
+
+
+def google_has_height(style):
+ """
+ Check if the style of the element has the 'height' attribute
+ explicitly defined
+
+ :type style: dict
+
+ :rtype: bool
+ """
+ if 'height' in style:
+ return True
+
+ return False
+
+
+def google_text_emphasis(style):
+ """
+ :type style: dict
+
+ :returns: A list of all emphasis modifiers of the element
+ :rtype: list
+ """
+ emphasis = []
+ if 'text-decoration' in style:
+ emphasis.append(style['text-decoration'])
+ if 'font-style' in style:
+ emphasis.append(style['font-style'])
+ if 'font-weight' in style:
+ emphasis.append(style['font-weight'])
+
+ return emphasis
+
+
+def google_fixed_width_font(style):
+ """
+ Check if the css of the current element defines a fixed width font
+
+ :type style: dict
+
+ :rtype: bool
+ """
+ font_family = ''
+ if 'font-family' in style:
+ font_family = style['font-family']
+ if 'Courier New' == font_family or 'Consolas' == font_family:
+ return True
+
+ return False
+
+
+def list_numbering_start(attrs):
+ """
+ Extract numbering from list element attributes
+
+ :type attrs: dict
+
+ :rtype: int or None
+ """
+ if 'start' in attrs:
+ try:
+ return int(attrs['start']) - 1
+ except ValueError:
+ pass
+
+ return 0
+
+
+def skipwrap(para, wrap_links):
+ # If it appears to contain a link
+ # don't wrap
+ if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links:
+ return True
+ # If the text begins with four spaces or one tab, it's a code block;
+ # don't wrap
+ if para[0:4] == ' ' or para[0] == '\t':
+ return True
+
+ # If the text begins with only two "--", possibly preceded by
+ # whitespace, that's an emdash; so wrap.
+ stripped = para.lstrip()
+ if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
+ return False
+
+ # I'm not sure what this is for; I thought it was to detect lists,
+ # but there's a <br>-inside-<span> case in one of the tests that
+ # also depends upon it.
+ if stripped[0:1] == '-' or stripped[0:1] == '*':
+ return True
+
+ # If the text begins with a single -, *, or +, followed by a space,
+ # or an integer, followed by a ., followed by a space (in either
+ # case optionally preceded by whitespace), it's a list; don't wrap.
+ if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
+ config.RE_UNORDERED_LIST_MATCHER.match(stripped):
+ return True
+
+ return False
+
+
+def wrapwrite(text):
+ text = text.encode('utf-8')
+ try: # Python3
+ sys.stdout.buffer.write(text)
+ except AttributeError:
+ sys.stdout.write(text)
+
+
+def wrap_read(): # pragma: no cover
+ """
+ :rtype: str
+ """
+ try:
+ return sys.stdin.read()
+ except AttributeError:
+ return sys.stdin.buffer.read()
+
+
+def escape_md(text):
+ """
+ Escapes markdown-sensitive characters within other markdown
+ constructs.
+ """
+ return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
+
+
+def escape_md_section(text, snob=False):
+ """
+ Escapes markdown-sensitive characters across whole document sections.
+ """
+ text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
+
+ if snob:
+ text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
+
+ text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
+ text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
+ text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
+
+ return text
diff --git a/makeplugin.py b/makeplugin.py
index 9350d6a8..bef6dc5e 100644
--- a/makeplugin.py
+++ b/makeplugin.py
@@ -36,7 +36,7 @@ if __name__=="__main__":
os.chdir('../included_dependencies')
# 'a' for append
- files=['gif.py','six.py','bs4','html5lib','chardet']
+ files=['gif.py','six.py','bs4','html5lib','chardet','html2text']
createZipFile("../"+filename,"a",
files,
exclude=exclude)
diff --git a/setup.py b/setup.py
index 9b37b4d2..0fc49337 100644
--- a/setup.py
+++ b/setup.py
@@ -81,7 +81,7 @@ setup(
# your project is installed. For an analysis of "install_requires" vs pip's
# requirements files see:
# https://packaging.python.org/en/latest/requirements.html
- install_requires=['beautifulsoup4','chardet','html5lib'], # html5lib requires 'six'.
+ install_requires=['beautifulsoup4','chardet','html5lib','html2text'], # html5lib requires 'six'.
# List additional groups of dependencies here (e.g. development
# dependencies). You can install these using the following syntax,