diff --git a/fanficfare/html2text.py b/fanficfare/html2text.py deleted file mode 100644 index 20296a49..00000000 --- a/fanficfare/html2text.py +++ /dev/null @@ -1,453 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""html2text: Turn HTML into equivalent Markdown-structured text.""" -__version__ = "2.37" -__author__ = "Aaron Swartz (me@aaronsw.com)" -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] - -# TODO: -# Support decoded entities with unifiable. - -if not hasattr(__builtins__, 'True'): True, False = 1, 0 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types -import sgmllib -import urlparse -sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') - -try: from textwrap import wrap -except: pass - -# Use Unicode characters instead of their ascii psuedo-replacements -UNICODE_SNOB = 0 - -# Put the links after each paragraph instead of at the end. -LINKS_EACH_PARAGRAPH = 0 - -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) -BODY_WIDTH = 78 - -# Don't show internal links (href="#local-anchor") -- corresponding link targets -# won't be visible in the plain text file anyway. 
-SKIP_INTERNAL_LINKS = False - -### Entity Nonsense ### - -def name2cp(k): - if k == 'apos': return ord("'") - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 - return htmlentitydefs.name2codepoint[k] - else: - k = htmlentitydefs.entitydefs[k] - if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 - return ord(codecs.latin_1_decode(k)[0]) - -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', -'ndash':'-', 'oelig':'oe', 'aelig':'ae', -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} - -unifiable_n = {} - -for k in unifiable.keys(): - unifiable_n[name2cp(k)] = unifiable[k] - -def charref(name): - if name[0] in ['x','X']: - c = int(name[1:], 16) - else: - c = int(name) - - if not UNICODE_SNOB and c in unifiable_n.keys(): - return unifiable_n[c] - else: - return unichr(c) - -def entityref(c): - if not UNICODE_SNOB and c in unifiable.keys(): - return unifiable[c] - else: - try: name2cp(c) - except KeyError: return "&" + c - else: return unichr(name2cp(c)) - -def replaceEntities(s): - s = s.group(1) - if s[0] == "#": - return charref(s[1:]) - else: return entityref(s) - -r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") -def unescape(s): - return r_unescape.sub(replaceEntities, s) - -def fixattrs(attrs): - # Fix bug in sgmllib.py - if not attrs: return attrs - newattrs = [] - for attr in attrs: - newattrs.append((attr[0], unescape(attr[1]))) - return newattrs - -### End Entity Nonsense ### - -def onlywhite(line): - """Return true if the line does only consist of whitespace characters.""" - for c in line: - if c is not ' ' and c is not ' ': - return c is ' ' - 
return line - -def optwrap(text,wrap_width=BODY_WIDTH): - """Wrap all paragraphs in the provided text.""" - - if not wrap_width: - return text - - assert wrap, "Requires Python 2.3." - result = '' - newlines = 0 - for para in text.split("\n"): - if len(para) > 0: - if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': - for line in wrap(para, wrap_width): - result += line + "\n" - result += "\n" - newlines = 2 - else: - if not onlywhite(para): - result += para + "\n" - newlines = 1 - else: - if newlines < 2: - result += "\n" - newlines += 1 - return result - -def hn(tag): - if tag[0] == 'h' and len(tag) == 2: - try: - n = int(tag[1]) - if n in range(1, 10): return n - except ValueError: return 0 - -class _html2text(sgmllib.SGMLParser): - def __init__(self, out=None, baseurl=''): - sgmllib.SGMLParser.__init__(self) - - if out is None: self.out = self.outtextf - else: self.out = out - self.outtext = u'' - self.quiet = 0 - self.p_p = 0 - self.outcount = 0 - self.start = 1 - self.space = 0 - self.a = [] - self.astack = [] - self.acount = 0 - self.list = [] - self.blockquote = 0 - self.pre = 0 - self.startpre = 0 - self.lastWasNL = 0 - self.abbr_title = None # current abbreviation definition - self.abbr_data = None # last inner HTML (for abbr being defined) - self.abbr_list = {} # stack of abbreviations to write later - self.baseurl = baseurl - - def outtextf(self, s): - self.outtext += s - - def close(self): - sgmllib.SGMLParser.close(self) - - self.pbr() - self.o('', 0, 'end') - - return self.outtext - - def handle_charref(self, c): - self.o(charref(c)) - - def handle_entityref(self, c): - self.o(entityref(c)) - - def unknown_starttag(self, tag, attrs): - self.handle_tag(tag, attrs, 1) - - def unknown_endtag(self, tag): - self.handle_tag(tag, None, 0) - - def previousIndex(self, attrs): - """ returns the index of certain set of attributes (of a link) in the - self.a list - - If the set of attributes is not found, returns None - """ - if not 
attrs.has_attr('href'): return None - - i = -1 - for a in self.a: - i += 1 - match = 0 - - if a.has_attr('href') and a['href'] == attrs['href']: - if a.has_attr('title') or attrs.has_attr('title'): - if (a.has_attr('title') and attrs.has_attr('title') and - a['title'] == attrs['title']): - match = True - else: - match = True - - if match: return i - - def handle_tag(self, tag, attrs, start): - attrs = fixattrs(attrs) - - if hn(tag): - self.p() - if start: self.o(hn(tag)*"#" + ' ') - - if tag in ['p', 'div']: self.p() - - if tag == "br" and start: self.o(" \n") - - if tag == "hr" and start: - self.p() - self.o("* * *") - self.p() - - if tag in ["head", "style", 'script']: - if start: self.quiet += 1 - else: self.quiet -= 1 - - if tag in ["body"]: - self.quiet = 0 # sites like 9rules.com never close - - if tag == "blockquote": - if start: - self.p(); self.o('> ', 0, 1); self.start = 1 - self.blockquote += 1 - else: - self.blockquote -= 1 - self.p() - - if tag in ['em', 'i', 'u']: self.o("_") - if tag in ['strong', 'b']: self.o("**") - if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` `` - if tag == "abbr": - if start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - - self.abbr_title = None - self.abbr_data = '' - if attrs.has_attr('title'): - self.abbr_title = attrs['title'] - else: - if self.abbr_title != None: - self.abbr_list[self.abbr_data] = self.abbr_title - self.abbr_title = None - self.abbr_data = '' - - if tag == "a": - if start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - if attrs.has_attr('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): - self.astack.append(attrs) - self.o("[") - else: - self.astack.append(None) - else: - if self.astack: - a = self.astack.pop() - if a: - i = self.previousIndex(a) - if i is not None: - a = self.a[i] - else: - self.acount += 1 - a['count'] = self.acount - a['outcount'] = self.outcount - self.a.append(a) - self.o("][" + `a['count']` + "]") - 
- if tag == "img" and start: - attrsD = {} - for (x, y) in attrs: attrsD[x] = y - attrs = attrsD - if attrs.has_attr('src'): - attrs['href'] = attrs['src'] - alt = attrs.get('alt', '') - i = self.previousIndex(attrs) - if i is not None: - attrs = self.a[i] - else: - self.acount += 1 - attrs['count'] = self.acount - attrs['outcount'] = self.outcount - self.a.append(attrs) - self.o("![") - self.o(alt) - self.o("]["+`attrs['count']`+"]") - - if tag == 'dl' and start: self.p() - if tag == 'dt' and not start: self.pbr() - if tag == 'dd' and start: self.o(' ') - if tag == 'dd' and not start: self.pbr() - - if tag in ["ol", "ul"]: - if start: - self.list.append({'name':tag, 'num':0}) - else: - if self.list: self.list.pop() - - self.p() - - if tag == 'li': - if start: - self.pbr() - if self.list: li = self.list[-1] - else: li = {'name':'ul', 'num':0} - self.o(" "*len(self.list)) #TODO: line up
  1. s > 9 correctly. - if li['name'] == "ul": self.o("* ") - elif li['name'] == "ol": - li['num'] += 1 - self.o(`li['num']`+". ") - self.start = 1 - else: - self.pbr() - - if tag in ["table", "tr"] and start: self.p() - if tag == 'td': self.pbr() - - if tag == "pre": - if start: - self.startpre = 1 - self.pre = 1 - else: - self.pre = 0 - self.p() - - def pbr(self): - if self.p_p == 0: self.p_p = 1 - - def p(self): self.p_p = 2 - - def o(self, data, puredata=0, force=0): - if self.abbr_data is not None: self.abbr_data += data - - if not self.quiet: - if puredata and not self.pre: - data = re.sub('\s+', ' ', data) - if data and data[0] == ' ': - self.space = 1 - data = data[1:] - if not data and not force: return - - if self.startpre: - #self.out(" :") #TODO: not output when already one there - self.startpre = 0 - - bq = (">" * self.blockquote) - if not (force and data and data[0] == ">") and self.blockquote: bq += " " - - if self.pre: - bq += " " - data = data.replace("\n", "\n"+bq) - - if self.start: - self.space = 0 - self.p_p = 0 - self.start = 0 - - if force == 'end': - # It's the end. - self.p_p = 0 - self.out("\n") - self.space = 0 - - - if self.p_p: - self.out(('\n'+bq)*self.p_p) - self.space = 0 - - if self.space: - if not self.lastWasNL: self.out(' ') - self.space = 0 - - if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): - if force == "end": self.out("\n") - - newa = [] - for link in self.a: - if self.outcount > link['outcount']: - self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) - if link.has_attr('title'): self.out(" ("+link['title']+")") - self.out("\n") - else: - newa.append(link) - - if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. 
- - self.a = newa - - if self.abbr_list and force == "end": - for abbr, definition in self.abbr_list.items(): - self.out(" *[" + abbr + "]: " + definition + "\n") - - self.p_p = 0 - self.out(data) - self.lastWasNL = data and data[-1] == '\n' - self.outcount += 1 - - def handle_data(self, data): - if r'\/script>' in data: self.quiet -= 1 - self.o(data, 1) - - def unknown_decl(self, data): pass - -def wrapwrite(text): sys.stdout.write(text.encode('utf8')) - -def html2text_file(html, out=wrapwrite, baseurl=''): - h = _html2text(out, baseurl) - h.feed(html) - h.feed("") - return h.close() - -def html2text(html, baseurl='', wrap_width=BODY_WIDTH): - return optwrap(html2text_file(html, None, baseurl),wrap_width) - -if __name__ == "__main__": - baseurl = '' - if sys.argv[1:]: - arg = sys.argv[1] - if arg.startswith('http://'): - baseurl = arg - j = urllib.urlopen(baseurl) - try: - from feedparser import _getCharacterEncoding as enc - except ImportError: - enc = lambda x, y: ('utf-8', 1) - text = j.read() - encoding = enc(j.headers, text)[0] - if encoding == 'us-ascii': encoding = 'utf-8' - data = text.decode(encoding) - - else: - encoding = 'utf8' - if len(sys.argv) > 2: - encoding = sys.argv[2] - data = open(arg, 'r').read().decode(encoding) - else: - data = sys.stdin.read().decode('utf8') - wrapwrite(html2text(data, baseurl)) diff --git a/fanficfare/writers/writer_txt.py b/fanficfare/writers/writer_txt.py index 91b3ac52..afffc995 100644 --- a/fanficfare/writers/writer_txt.py +++ b/fanficfare/writers/writer_txt.py @@ -21,7 +21,7 @@ from textwrap import wrap from base_writer import * -from ..html2text import html2text +from html2text import html2text ## In BaseStoryWriter, we define _write to encode objects ## back into for true output. But txt needs to write the @@ -109,7 +109,7 @@ End file. 
self.wrap_width = self.getConfig('wrap_width') if self.wrap_width == '' or self.wrap_width == '0': - self.wrap_width = None + self.wrap_width = 0 else: self.wrap_width = int(self.wrap_width) @@ -159,7 +159,7 @@ End file. logging.debug('Writing chapter text for: %s' % chap.title) vals={'url':chap.url, 'chapter':chap.title, 'index':"%04d"%(index+1), 'number':index+1} self._write(out,self.lineends(self.wraplines(removeAllEntities(CHAPTER_START.substitute(vals))))) - self._write(out,self.lineends(html2text(chap.html,wrap_width=self.wrap_width))) + self._write(out,self.lineends(html2text(chap.html,bodywidth=self.wrap_width))) self._write(out,self.lineends(self.wraplines(removeAllEntities(CHAPTER_END.substitute(vals))))) self._write(out,self.lineends(self.wraplines(FILE_END.substitute(self.story.getAllMetadata())))) diff --git a/included_dependencies/html2text/__init__.py b/included_dependencies/html2text/__init__.py new file mode 100644 index 00000000..e7b88be3 --- /dev/null +++ b/included_dependencies/html2text/__init__.py @@ -0,0 +1,857 @@ +#!/usr/bin/env python +# coding: utf-8 +"""html2text: Turn HTML into equivalent Markdown-structured text.""" +from __future__ import division +import re +import sys +import cgi + +try: + from textwrap import wrap +except ImportError: # pragma: no cover + pass + +from html2text.compat import urlparse, HTMLParser +from html2text import config + +from html2text.utils import ( + name2cp, + unifiable_n, + google_text_emphasis, + google_fixed_width_font, + element_style, + hn, + google_has_height, + escape_md, + google_list_style, + list_numbering_start, + dumb_css_parser, + escape_md_section, + skipwrap +) + +__version__ = (2016, 4, 2) + + +# TODO: +# Support decoded entities with UNIFIABLE. + + +class HTML2Text(HTMLParser.HTMLParser): + def __init__(self, out=None, baseurl='', bodywidth=config.BODY_WIDTH): + """ + Input parameters: + out: possible custom replacement for self.outtextf (which + appends lines of text). 
+ baseurl: base URL of the document we process + """ + kwargs = {} + if sys.version_info >= (3, 4): + kwargs['convert_charrefs'] = False + HTMLParser.HTMLParser.__init__(self, **kwargs) + + # Config options + self.split_next_td = False + self.td_count = 0 + self.table_start = False + self.unicode_snob = config.UNICODE_SNOB # covered in cli + self.escape_snob = config.ESCAPE_SNOB # covered in cli + self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH + self.body_width = bodywidth # covered in cli + self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli + self.inline_links = config.INLINE_LINKS # covered in cli + self.protect_links = config.PROTECT_LINKS # covered in cli + self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli + self.ignore_links = config.IGNORE_ANCHORS # covered in cli + self.ignore_images = config.IGNORE_IMAGES # covered in cli + self.images_to_alt = config.IMAGES_TO_ALT # covered in cli + self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli + self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli + self.bypass_tables = config.BYPASS_TABLES # covered in cli + self.google_doc = False # covered in cli + self.ul_item_mark = '*' # covered in cli + self.emphasis_mark = '_' # covered in cli + self.strong_mark = '**' + self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli + self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli + self.hide_strikethrough = False # covered in cli + self.mark_code = config.MARK_CODE + self.wrap_links = config.WRAP_LINKS # covered in cli + self.tag_callback = None + + if out is None: # pragma: no cover + self.out = self.outtextf + else: # pragma: no cover + self.out = out + + # empty list to store output characters before they are "joined" + self.outtextlist = [] + + self.quiet = 0 + self.p_p = 0 # number of newline character to print before next output + self.outcount = 0 + self.start = 1 + self.space = 0 + self.a = [] + self.astack = [] + 
self.maybe_automatic_link = None + self.empty_link = False + self.absolute_url_matcher = re.compile(r'^[a-zA-Z+]+://') + self.acount = 0 + self.list = [] + self.blockquote = 0 + self.pre = 0 + self.startpre = 0 + self.code = False + self.br_toggle = '' + self.lastWasNL = 0 + self.lastWasList = False + self.style = 0 + self.style_def = {} + self.tag_stack = [] + self.emphasis = 0 + self.drop_white_space = 0 + self.inheader = False + self.abbr_title = None # current abbreviation definition + self.abbr_data = None # last inner HTML (for abbr being defined) + self.abbr_list = {} # stack of abbreviations to write later + self.baseurl = baseurl + + try: + del unifiable_n[name2cp('nbsp')] + except KeyError: + pass + config.UNIFIABLE['nbsp'] = ' _place_holder;' + + def feed(self, data): + data = data.replace("", "") + HTMLParser.HTMLParser.feed(self, data) + + def handle(self, data): + self.feed(data) + self.feed("") + return self.optwrap(self.close()) + + def outtextf(self, s): + self.outtextlist.append(s) + if s: + self.lastWasNL = s[-1] == '\n' + + def close(self): + HTMLParser.HTMLParser.close(self) + + try: + nochr = unicode('') + except NameError: + nochr = str('') + + self.pbr() + self.o('', 0, 'end') + + outtext = nochr.join(self.outtextlist) + if self.unicode_snob: + try: + nbsp = unichr(name2cp('nbsp')) + except NameError: + nbsp = chr(name2cp('nbsp')) + else: + try: + nbsp = unichr(32) + except NameError: + nbsp = chr(32) + try: + outtext = outtext.replace(unicode(' _place_holder;'), nbsp) + except NameError: + outtext = outtext.replace(' _place_holder;', nbsp) + + # Clear self.outtextlist to avoid memory leak of its content to + # the next handling. 
+ self.outtextlist = [] + + return outtext + + def handle_charref(self, c): + charref = self.charref(c) + if not self.code and not self.pre: + charref = cgi.escape(charref) + self.handle_data(charref, True) + + def handle_entityref(self, c): + entityref = self.entityref(c) + if (not self.code and not self.pre + and entityref != ' _place_holder;'): + entityref = cgi.escape(entityref) + self.handle_data(entityref, True) + + def handle_starttag(self, tag, attrs): + self.handle_tag(tag, attrs, 1) + + def handle_endtag(self, tag): + self.handle_tag(tag, None, 0) + + def previousIndex(self, attrs): + """ + :type attrs: dict + + :returns: The index of certain set of attributes (of a link) in the + self.a list. If the set of attributes is not found, returns None + :rtype: int + """ + if 'href' not in attrs: # pragma: no cover + return None + i = -1 + for a in self.a: + i += 1 + match = 0 + + if ('href' in a) and a['href'] == attrs['href']: + if ('title' in a) or ('title' in attrs): + if (('title' in a) and ('title' in attrs) and + a['title'] == attrs['title']): + match = True + else: + match = True + + if match: + return i + + def handle_emphasis(self, start, tag_style, parent_style): + """ + Handles various text emphases + """ + tag_emphasis = google_text_emphasis(tag_style) + parent_emphasis = google_text_emphasis(parent_style) + + # handle Google's text emphasis + strikethrough = 'line-through' in \ + tag_emphasis and self.hide_strikethrough + bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis + italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis + fixed = google_fixed_width_font(tag_style) and not \ + google_fixed_width_font(parent_style) and not self.pre + + if start: + # crossed-out text must be handled before other attributes + # in order not to output qualifiers unnecessarily + if bold or italic or fixed: + self.emphasis += 1 + if strikethrough: + self.quiet += 1 + if italic: + self.o(self.emphasis_mark) + self.drop_white_space += 1 + 
if bold: + self.o(self.strong_mark) + self.drop_white_space += 1 + if fixed: + self.o('`') + self.drop_white_space += 1 + self.code = True + else: + if bold or italic or fixed: + # there must not be whitespace before closing emphasis mark + self.emphasis -= 1 + self.space = 0 + if fixed: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o('`') + self.code = False + if bold: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.strong_mark) + if italic: + if self.drop_white_space: + # empty emphasis, drop it + self.drop_white_space -= 1 + else: + self.o(self.emphasis_mark) + # space is only allowed after *all* emphasis marks + if (bold or italic) and not self.emphasis: + self.o(" ") + if strikethrough: + self.quiet -= 1 + + def handle_tag(self, tag, attrs, start): + # attrs is None for endtags + if attrs is None: + attrs = {} + else: + attrs = dict(attrs) + + if self.tag_callback is not None: + if self.tag_callback(self, tag, attrs, start) is True: + return + + # first thing inside the anchor tag is another tag that produces some output + if (start and not self.maybe_automatic_link is None + and tag not in ['p', 'div', 'style', 'dl', 'dt'] + and (tag != "img" or self.ignore_images)): + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + if self.google_doc: + # the attrs parameter is empty for a closing tag. in addition, we + # need the attributes of the parent nodes in order to get a + # complete style description for the current element. we assume + # that google docs export well formed html. 
+ parent_style = {} + if start: + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + tag_style = element_style(attrs, self.style_def, parent_style) + self.tag_stack.append((tag, attrs, tag_style)) + else: + dummy, attrs, tag_style = self.tag_stack.pop() if self.tag_stack else (None, {}, {}) + if self.tag_stack: + parent_style = self.tag_stack[-1][2] + + if hn(tag): + self.p() + if start: + self.inheader = True + self.o(hn(tag) * "#" + ' ') + else: + self.inheader = False + return # prevent redundant emphasis marks on headers + + if tag in ['p', 'div']: + if self.google_doc: + if start and google_has_height(tag_style): + self.p() + else: + self.soft_br() + else: + self.p() + + if tag == "br" and start: + self.o(" \n") + + if tag == "hr" and start: + self.p() + self.o("* * *") + self.p() + + if tag in ["head", "style", 'script']: + if start: + self.quiet += 1 + else: + self.quiet -= 1 + + if tag == "style": + if start: + self.style += 1 + else: + self.style -= 1 + + if tag in ["body"]: + self.quiet = 0 # sites like 9rules.com never close + + if tag == "blockquote": + if start: + self.p() + self.o('> ', 0, 1) + self.start = 1 + self.blockquote += 1 + else: + self.blockquote -= 1 + self.p() + + if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: + self.o(self.emphasis_mark) + if tag in ['strong', 'b'] and not self.ignore_emphasis: + self.o(self.strong_mark) + if tag in ['del', 'strike', 's']: + if start: + self.o('~~') + else: + self.o('~~') + + if self.google_doc: + if not self.inheader: + # handle some font attributes, but leave headers clean + self.handle_emphasis(start, tag_style, parent_style) + + if tag in ["code", "tt"] and not self.pre: + self.o('`') # TODO: `` `this` `` + self.code = not self.code + if tag == "abbr": + if start: + self.abbr_title = None + self.abbr_data = '' + if ('title' in attrs): + self.abbr_title = attrs['title'] + else: + if self.abbr_title is not None: + self.abbr_list[self.abbr_data] = self.abbr_title + self.abbr_title = 
None + self.abbr_data = '' + + if tag == "a" and not self.ignore_links: + if start: + if ('href' in attrs) and \ + (attrs['href'] is not None) and \ + not (self.skip_internal_links and + attrs['href'].startswith('#')): + self.astack.append(attrs) + self.maybe_automatic_link = attrs['href'] + self.empty_link = True + if self.protect_links: + attrs['href'] = '<'+attrs['href']+'>' + else: + self.astack.append(None) + else: + if self.astack: + a = self.astack.pop() + if self.maybe_automatic_link and not self.empty_link: + self.maybe_automatic_link = None + elif a: + if self.empty_link: + self.o("[") + self.empty_link = False + self.maybe_automatic_link = None + if self.inline_links: + try: + title = escape_md(a['title']) + except KeyError: + self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + ")") + else: + self.o("](" + escape_md(urlparse.urljoin(self.baseurl, a['href'])) + + ' "' + title + '" )') + else: + i = self.previousIndex(a) + if i is not None: + a = self.a[i] + else: + self.acount += 1 + a['count'] = self.acount + a['outcount'] = self.outcount + self.a.append(a) + self.o("][" + str(a['count']) + "]") + + if tag == "img" and start and not self.ignore_images: + if 'src' in attrs: + if not self.images_to_alt: + attrs['href'] = attrs['src'] + alt = attrs.get('alt') or '' + + # If we have images_with_size, write raw html including width, + # height, and alt attributes + if self.images_with_size and \ + ("width" in attrs or "height" in attrs): + self.o("") + return + + # If we have a link to create, output the start + if not self.maybe_automatic_link is None: + href = self.maybe_automatic_link + if self.images_to_alt and escape_md(alt) == href and \ + self.absolute_url_matcher.match(href): + self.o("<" + escape_md(alt) + ">") + self.empty_link = False + return + else: + self.o("[") + self.maybe_automatic_link = None + self.empty_link = False + + # If we have images_to_alt, we discard the image itself, + # considering only the alt text. 
+ if self.images_to_alt: + self.o(escape_md(alt)) + else: + self.o("![" + escape_md(alt) + "]") + if self.inline_links: + href = attrs.get('href') or '' + self.o("(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")") + else: + i = self.previousIndex(attrs) + if i is not None: + attrs = self.a[i] + else: + self.acount += 1 + attrs['count'] = self.acount + attrs['outcount'] = self.outcount + self.a.append(attrs) + self.o("[" + str(attrs['count']) + "]") + + if tag == 'dl' and start: + self.p() + if tag == 'dt' and not start: + self.pbr() + if tag == 'dd' and start: + self.o(' ') + if tag == 'dd' and not start: + self.pbr() + + if tag in ["ol", "ul"]: + # Google Docs create sub lists as top level lists + if (not self.list) and (not self.lastWasList): + self.p() + if start: + if self.google_doc: + list_style = google_list_style(tag_style) + else: + list_style = tag + numbering_start = list_numbering_start(attrs) + self.list.append({ + 'name': list_style, + 'num': numbering_start + }) + else: + if self.list: + self.list.pop() + if (not self.google_doc) and (not self.list): + self.o('\n') + self.lastWasList = True + else: + self.lastWasList = False + + if tag == 'li': + self.pbr() + if start: + if self.list: + li = self.list[-1] + else: + li = {'name': 'ul', 'num': 0} + if self.google_doc: + nest_count = self.google_nest_count(tag_style) + else: + nest_count = len(self.list) + # TODO: line up
    1. s > 9 correctly. + self.o(" " * nest_count) + if li['name'] == "ul": + self.o(self.ul_item_mark + " ") + elif li['name'] == "ol": + li['num'] += 1 + self.o(str(li['num']) + ". ") + self.start = 1 + + if tag in ["table", "tr", "td", "th"]: + if self.bypass_tables: + if start: + self.soft_br() + if tag in ["td", "th"]: + if start: + self.o('<{0}>\n\n'.format(tag)) + else: + self.o('\n'.format(tag)) + else: + if start: + self.o('<{0}>'.format(tag)) + else: + self.o(''.format(tag)) + + else: + if tag == "table" and start: + self.table_start = True + if tag in ["td", "th"] and start: + if self.split_next_td: + self.o("| ") + self.split_next_td = True + + if tag == "tr" and start: + self.td_count = 0 + if tag == "tr" and not start: + self.split_next_td = False + self.soft_br() + if tag == "tr" and not start and self.table_start: + # Underline table header + self.o("|".join(["---"] * self.td_count)) + self.soft_br() + self.table_start = False + if tag in ["td", "th"] and start: + self.td_count += 1 + + if tag == "pre": + if start: + self.startpre = 1 + self.pre = 1 + else: + self.pre = 0 + if self.mark_code: + self.out("\n[/code]") + self.p() + + # TODO: Add docstring for these one letter functions + def pbr(self): + "Pretty print has a line break" + if self.p_p == 0: + self.p_p = 1 + + def p(self): + "Set pretty print to 1 or 2 lines" + self.p_p = 1 if self.single_line_break else 2 + + def soft_br(self): + "Soft breaks" + self.pbr() + self.br_toggle = ' ' + + def o(self, data, puredata=0, force=0): + """ + Deal with indentation and whitespace + """ + if self.abbr_data is not None: + self.abbr_data += data + + if not self.quiet: + if self.google_doc: + # prevent white space immediately after 'begin emphasis' + # marks ('**' and '_') + lstripped_data = data.lstrip() + if self.drop_white_space and not (self.pre or self.code): + data = lstripped_data + if lstripped_data != '': + self.drop_white_space = 0 + + if puredata and not self.pre: + # This is a very dangerous 
call ... it could mess up + # all handling of   when not handled properly + # (see entityref) + data = re.sub(r'\s+', r' ', data) + if data and data[0] == ' ': + self.space = 1 + data = data[1:] + if not data and not force: + return + + if self.startpre: + #self.out(" :") #TODO: not output when already one there + if not data.startswith("\n"): #
      stuff...
      +                    data = "\n" + data
      +                if self.mark_code:
      +                    self.out("\n[code]")
      +                    self.p_p = 0
      +
      +            bq = (">" * self.blockquote)
      +            if not (force and data and data[0] == ">") and self.blockquote:
      +                bq += " "
      +
      +            if self.pre:
      +                if not self.list:
      +                    bq += "    "
      +                #else: list content is already partially indented
      +                for i in range(len(self.list)):
      +                    bq += "    "
      +                data = data.replace("\n", "\n" + bq)
      +
      +            if self.startpre:
      +                self.startpre = 0
      +                if self.list:
      +                    # use existing initial indentation
      +                    data = data.lstrip("\n")
      +
      +            if self.start:
      +                self.space = 0
      +                self.p_p = 0
      +                self.start = 0
      +
      +            if force == 'end':
      +                # It's the end.
      +                self.p_p = 0
      +                self.out("\n")
      +                self.space = 0
      +
      +            if self.p_p:
      +                self.out((self.br_toggle + '\n' + bq) * self.p_p)
      +                self.space = 0
      +                self.br_toggle = ''
      +
      +            if self.space:
      +                if not self.lastWasNL:
      +                    self.out(' ')
      +                self.space = 0
      +
      +            if self.a and ((self.p_p == 2 and self.links_each_paragraph)
      +                           or force == "end"):
      +                if force == "end":
      +                    self.out("\n")
      +
      +                newa = []
      +                for link in self.a:
      +                    if self.outcount > link['outcount']:
      +                        self.out("   [" + str(link['count']) + "]: " +
      +                                 urlparse.urljoin(self.baseurl, link['href']))
      +                        if 'title' in link:
      +                            self.out(" (" + link['title'] + ")")
      +                        self.out("\n")
      +                    else:
      +                        newa.append(link)
      +
      +                # Don't need an extra line when nothing was done.
      +                if self.a != newa:
      +                    self.out("\n")
      +
      +                self.a = newa
      +
      +            if self.abbr_list and force == "end":
      +                for abbr, definition in self.abbr_list.items():
      +                    self.out("  *[" + abbr + "]: " + definition + "\n")
      +
      +            self.p_p = 0
      +            self.out(data)
      +            self.outcount += 1
      +
      +    def handle_data(self, data, entity_char=False):
      +        if r'\/script>' in data:
      +            self.quiet -= 1
      +
      +        if self.style:
      +            self.style_def.update(dumb_css_parser(data))
      +
      +        if not self.maybe_automatic_link is None:
      +            href = self.maybe_automatic_link
      +            if (href == data and self.absolute_url_matcher.match(href)
      +                    and self.use_automatic_links):
      +                self.o("<" + data + ">")
      +                self.empty_link = False
      +                return
      +            else:
      +                self.o("[")
      +                self.maybe_automatic_link = None
      +                self.empty_link = False
      +
      +        if not self.code and not self.pre and not entity_char:
      +            data = escape_md_section(data, snob=self.escape_snob)
      +        self.o(data, 1)
      +
      +    def unknown_decl(self, data):  # pragma: no cover
      +        # TODO: what is this doing here?
      +        pass
      +
      +    def charref(self, name):
      +        if name[0] in ['x', 'X']:
      +            c = int(name[1:], 16)
      +        else:
      +            c = int(name)
      +
      +        if not self.unicode_snob and c in unifiable_n.keys():
      +            return unifiable_n[c]
      +        else:
      +            try:
      +                try:
      +                    return unichr(c)
      +                except NameError:  # Python3
      +                    return chr(c)
      +            except ValueError:  # invalid unicode
      +                return ''
      +
      +    def entityref(self, c):
      +        if not self.unicode_snob and c in config.UNIFIABLE.keys():
      +            return config.UNIFIABLE[c]
      +        else:
      +            try:
      +                name2cp(c)
      +            except KeyError:
      +                return "&" + c + ';'
      +            else:
      +                if c == 'nbsp':
      +                    return config.UNIFIABLE[c]
      +                else:
      +                    try:
      +                        return unichr(name2cp(c))
      +                    except NameError:  # Python3
      +                        return chr(name2cp(c))
      +
      +    def replaceEntities(self, s):
      +        s = s.group(1)
      +        if s[0] == "#":
      +            return self.charref(s[1:])
      +        else:
      +            return self.entityref(s)
      +
      +    def unescape(self, s):
      +        return config.RE_UNESCAPE.sub(self.replaceEntities, s)
      +
      +    def google_nest_count(self, style):
      +        """
      +        Calculate the nesting count of google doc lists
      +
      +        :type style: dict
      +
      +        :rtype: int
      +        """
      +        nest_count = 0
      +        if 'margin-left' in style:
      +            nest_count = int(style['margin-left'][:-2]) \
      +                         // self.google_list_indent
      +
      +        return nest_count
      +
      +    def optwrap(self, text):
      +        """
      +        Wrap all paragraphs in the provided text.
      +
      +        :type text: str
      +
      +        :rtype: str
      +        """
      +        if not self.body_width:
      +            return text
      +
      +        assert wrap, "Requires Python 2.3."
      +        result = ''
      +        newlines = 0
      +        # I cannot think of a better solution for now.
      +        # To avoid the non-wrap behaviour for entire paras
      +        # because of the presence of a link in it
      +        if not self.wrap_links:
      +            self.inline_links = False
      +        for para in text.split("\n"):
      +            if len(para) > 0:
      +                if not skipwrap(para, self.wrap_links):
      +                    result += "\n".join(wrap(para, self.body_width))
      +                    if para.endswith('  '):
      +                        result += "  \n"
      +                        newlines = 1
      +                    else:
      +                        result += "\n\n"
      +                        newlines = 2
      +                else:
      +                    # Warning for the tempted!!!
      +                    # Be aware that obvious replacement of this with
      +                    # line.isspace()
      +                    # DOES NOT work! Explanations are welcome.
      +                    if not config.RE_SPACE.match(para):
      +                        result += para + "\n"
      +                        newlines = 1
      +            else:
      +                if newlines < 2:
      +                    result += "\n"
      +                    newlines += 1
      +        return result
      +
      +
      +def html2text(html, baseurl='', bodywidth=None):
      +    if bodywidth is None:
      +        bodywidth = config.BODY_WIDTH
      +    h = HTML2Text(baseurl=baseurl, bodywidth=bodywidth)
      +
      +    return h.handle(html)
      +
      +
      +def unescape(s, unicode_snob=False):
      +    h = HTML2Text()
      +    h.unicode_snob = unicode_snob
      +
      +    return h.unescape(s)
      +
      +
      +if __name__ == "__main__":
      +    from html2text.cli import main
      +
      +    main()
      diff --git a/included_dependencies/html2text/compat.py b/included_dependencies/html2text/compat.py
      new file mode 100644
      index 00000000..2120a41b
      --- /dev/null
      +++ b/included_dependencies/html2text/compat.py
      @@ -0,0 +1,13 @@
      +import sys
      +
      +
      +if sys.version_info[0] == 2:
      +    import htmlentitydefs
      +    import urlparse
      +    import HTMLParser
      +    import urllib
      +else:
      +    import urllib.parse as urlparse
      +    import html.entities as htmlentitydefs
      +    import html.parser as HTMLParser
      +    import urllib.request as urllib
      diff --git a/included_dependencies/html2text/config.py b/included_dependencies/html2text/config.py
      new file mode 100644
      index 00000000..85bf47dc
      --- /dev/null
      +++ b/included_dependencies/html2text/config.py
      @@ -0,0 +1,123 @@
      +import re
      +
+# Use Unicode characters instead of their ascii pseudo-replacements
      +UNICODE_SNOB = 0
      +
      +# Escape all special characters.  Output is less readable, but avoids
      +# corner case formatting issues.
      +ESCAPE_SNOB = 0
      +
      +# Put the links after each paragraph instead of at the end.
      +LINKS_EACH_PARAGRAPH = 0
      +
      +# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
      +BODY_WIDTH = 78
      +
      +# Don't show internal links (href="#local-anchor") -- corresponding link
      +# targets won't be visible in the plain text file anyway.
      +SKIP_INTERNAL_LINKS = True
      +
      +# Use inline, rather than reference, formatting for images and links
      +INLINE_LINKS = True
      +
      +# Protect links from line breaks surrounding them with angle brackets (in
      +# addition to their square brackets)
      +PROTECT_LINKS = False
      +# WRAP_LINKS = True
      +WRAP_LINKS = True
      +
      +# Number of pixels Google indents nested lists
      +GOOGLE_LIST_INDENT = 36
      +
      +IGNORE_ANCHORS = False
      +IGNORE_IMAGES = False
      +IMAGES_TO_ALT = False
      +IMAGES_WITH_SIZE = False
      +IGNORE_EMPHASIS = False
      +MARK_CODE = False
      +DECODE_ERRORS = 'strict'
      +
+# Convert links with same href and text to <href> format if they are absolute links
      +USE_AUTOMATIC_LINKS = True
      +
      +# For checking space-only lines on line 771
      +RE_SPACE = re.compile(r'\s\+')
      +
      +RE_UNESCAPE = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
      +RE_ORDERED_LIST_MATCHER = re.compile(r'\d+\.\s')
      +RE_UNORDERED_LIST_MATCHER = re.compile(r'[-\*\+]\s')
      +RE_MD_CHARS_MATCHER = re.compile(r"([\\\[\]\(\)])")
      +RE_MD_CHARS_MATCHER_ALL = re.compile(r"([`\*_{}\[\]\(\)#!])")
      +RE_LINK = re.compile(r"(\[.*?\] ?\(.*?\))|(\[.*?\]:.*?)")  # to find links in the text
      +RE_MD_DOT_MATCHER = re.compile(r"""
      +    ^             # start of line
      +    (\s*\d+)      # optional whitespace and a number
      +    (\.)          # dot
      +    (?=\s)        # lookahead assert whitespace
      +    """, re.MULTILINE | re.VERBOSE)
      +RE_MD_PLUS_MATCHER = re.compile(r"""
      +    ^
      +    (\s*)
      +    (\+)
      +    (?=\s)
      +    """, flags=re.MULTILINE | re.VERBOSE)
      +RE_MD_DASH_MATCHER = re.compile(r"""
      +    ^
      +    (\s*)
      +    (-)
      +    (?=\s|\-)     # followed by whitespace (bullet list, or spaced out hr)
      +                  # or another dash (header or hr)
      +    """, flags=re.MULTILINE | re.VERBOSE)
      +RE_SLASH_CHARS = r'\`*_{}[]()#+-.!'
      +RE_MD_BACKSLASH_MATCHER = re.compile(r'''
      +    (\\)          # match one slash
      +    (?=[%s])      # followed by a char that requires escaping
      +    ''' % re.escape(RE_SLASH_CHARS),
      +    flags=re.VERBOSE)
      +
      +UNIFIABLE = {
      +    'rsquo': "'",
      +    'lsquo': "'",
      +    'rdquo': '"',
      +    'ldquo': '"',
      +    'copy': '(C)',
      +    'mdash': '--',
      +    'nbsp': ' ',
      +    'rarr': '->',
      +    'larr': '<-',
      +    'middot': '*',
      +    'ndash': '-',
      +    'oelig': 'oe',
      +    'aelig': 'ae',
      +    'agrave': 'a',
      +    'aacute': 'a',
      +    'acirc': 'a',
      +    'atilde': 'a',
      +    'auml': 'a',
      +    'aring': 'a',
      +    'egrave': 'e',
      +    'eacute': 'e',
      +    'ecirc': 'e',
      +    'euml': 'e',
      +    'igrave': 'i',
      +    'iacute': 'i',
      +    'icirc': 'i',
      +    'iuml': 'i',
      +    'ograve': 'o',
      +    'oacute': 'o',
      +    'ocirc': 'o',
      +    'otilde': 'o',
      +    'ouml': 'o',
      +    'ugrave': 'u',
      +    'uacute': 'u',
      +    'ucirc': 'u',
      +    'uuml': 'u',
      +    'lrm': '',
      +    'rlm': ''
      +}
      +
      +BYPASS_TABLES = False
      +
      +# Use a single line break after a block element rather an two line breaks.
      +# NOTE: Requires body width setting to be 0.
      +SINGLE_LINE_BREAK = False
      diff --git a/included_dependencies/html2text/utils.py b/included_dependencies/html2text/utils.py
      new file mode 100644
      index 00000000..418c89cb
      --- /dev/null
      +++ b/included_dependencies/html2text/utils.py
      @@ -0,0 +1,246 @@
      +import sys
      +
      +from html2text import config
      +from html2text.compat import htmlentitydefs
      +
      +
      +def name2cp(k):
+    """Return entity name to codepoint"""
      +    if k == 'apos':
      +        return ord("'")
      +    return htmlentitydefs.name2codepoint[k]
      +
      +
      +unifiable_n = {}
      +
      +for k in config.UNIFIABLE.keys():
      +    unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
      +
      +
      +def hn(tag):
      +    if tag[0] == 'h' and len(tag) == 2:
      +        try:
      +            n = int(tag[1])
      +            if n in range(1, 10):  # pragma: no branch
      +                return n
      +        except ValueError:
      +            return 0
      +
      +
      +def dumb_property_dict(style):
      +    """
      +    :returns: A hash of css attributes
      +    """
      +    out = dict([(x.strip(), y.strip()) for x, y in
      +                [z.split(':', 1) for z in
      +                 style.split(';') if ':' in z
      +                 ]
      +                ]
      +               )
      +
      +    return out
      +
      +
      +def dumb_css_parser(data):
      +    """
      +    :type data: str
      +
      +    :returns: A hash of css selectors, each of which contains a hash of
      +    css attributes.
      +    :rtype: dict
      +    """
      +    # remove @import sentences
      +    data += ';'
      +    importIndex = data.find('@import')
      +    while importIndex != -1:
      +        data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
      +        importIndex = data.find('@import')
      +
      +    # parse the css. reverted from dictionary comprehension in order to
      +    # support older pythons
      +    elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
      +    try:
      +        elements = dict([(a.strip(), dumb_property_dict(b))
      +                         for a, b in elements])
      +    except ValueError:  # pragma: no cover
      +        elements = {}  # not that important
      +
      +    return elements
      +
      +
      +def element_style(attrs, style_def, parent_style):
      +    """
      +    :type attrs: dict
      +    :type style_def: dict
      +    :type style_def: dict
      +
      +    :returns: A hash of the 'final' style attributes of the element
      +    :rtype: dict
      +    """
      +    style = parent_style.copy()
      +    if 'class' in attrs:
      +        for css_class in attrs['class'].split():
      +            css_style = style_def.get('.' + css_class, {})
      +            style.update(css_style)
      +    if 'style' in attrs:
      +        immediate_style = dumb_property_dict(attrs['style'])
      +        style.update(immediate_style)
      +
      +    return style
      +
      +
      +def google_list_style(style):
      +    """
      +    Finds out whether this is an ordered or unordered list
      +
      +    :type style: dict
      +
      +    :rtype: str
      +    """
      +    if 'list-style-type' in style:
      +        list_style = style['list-style-type']
      +        if list_style in ['disc', 'circle', 'square', 'none']:
      +            return 'ul'
      +
      +    return 'ol'
      +
      +
      +def google_has_height(style):
      +    """
      +    Check if the style of the element has the 'height' attribute
      +    explicitly defined
      +
      +    :type style: dict
      +
      +    :rtype: bool
      +    """
      +    if 'height' in style:
      +        return True
      +
      +    return False
      +
      +
      +def google_text_emphasis(style):
      +    """
      +    :type style: dict
      +
      +    :returns: A list of all emphasis modifiers of the element
      +    :rtype: list
      +    """
      +    emphasis = []
      +    if 'text-decoration' in style:
      +        emphasis.append(style['text-decoration'])
      +    if 'font-style' in style:
      +        emphasis.append(style['font-style'])
      +    if 'font-weight' in style:
      +        emphasis.append(style['font-weight'])
      +
      +    return emphasis
      +
      +
      +def google_fixed_width_font(style):
      +    """
      +    Check if the css of the current element defines a fixed width font
      +
      +    :type style: dict
      +
      +    :rtype: bool
      +    """
      +    font_family = ''
      +    if 'font-family' in style:
      +        font_family = style['font-family']
      +    if 'Courier New' == font_family or 'Consolas' == font_family:
      +        return True
      +
      +    return False
      +
      +
      +def list_numbering_start(attrs):
      +    """
      +    Extract numbering from list element attributes
      +
      +    :type attrs: dict
      +
      +    :rtype: int or None
      +    """
      +    if 'start' in attrs:
      +        try:
      +            return int(attrs['start']) - 1
      +        except ValueError:
      +            pass
      +
      +    return 0
      +
      +
      +def skipwrap(para, wrap_links):
      +    # If it appears to contain a link
      +    # don't wrap
      +    if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links:
      +        return True
      +    # If the text begins with four spaces or one tab, it's a code block;
      +    # don't wrap
      +    if para[0:4] == '    ' or para[0] == '\t':
      +        return True
      +
      +    # If the text begins with only two "--", possibly preceded by
      +    # whitespace, that's an emdash; so wrap.
      +    stripped = para.lstrip()
      +    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
      +        return False
      +
      +    # I'm not sure what this is for; I thought it was to detect lists,
+    # but there's a <br>-inside-<span> case in one of the tests that
+    # also depends upon it.
+    if stripped[0:1] == '-' or stripped[0:1] == '*':
+        return True
+
+    # If the text begins with a single -, *, or +, followed by a space,
+    # or an integer, followed by a ., followed by a space (in either
+    # case optionally proceeded by whitespace), it's a list; don't wrap.
+    if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
+            config.RE_UNORDERED_LIST_MATCHER.match(stripped):
+        return True
+
+    return False
+
+
+def wrapwrite(text):
+    text = text.encode('utf-8')
+    try:  # Python3
+        sys.stdout.buffer.write(text)
+    except AttributeError:
+        sys.stdout.write(text)
+
+
+def wrap_read():  # pragma: no cover
+    """
+    :rtype: str
+    """
+    try:
+        return sys.stdin.read()
+    except AttributeError:
+        return sys.stdin.buffer.read()
+
+
+def escape_md(text):
+    """
+    Escapes markdown-sensitive characters within other markdown
+    constructs.
+    """
+    return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
+
+
+def escape_md_section(text, snob=False):
+    """
+    Escapes markdown-sensitive characters across whole document sections.
+    """
+    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
+
+    if snob:
+        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
+
+    text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
+    text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
+    text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
+
+    return text
diff --git a/makeplugin.py b/makeplugin.py
index 9350d6a8..bef6dc5e 100644
--- a/makeplugin.py
+++ b/makeplugin.py
@@ -36,7 +36,7 @@ if __name__=="__main__":
     os.chdir('../included_dependencies')
 
     # 'a' for append
-    files=['gif.py','six.py','bs4','html5lib','chardet']
+    files=['gif.py','six.py','bs4','html5lib','chardet','html2text']
 
     createZipFile("../"+filename,"a", files, exclude=exclude)
 
diff --git a/setup.py b/setup.py
index 9b37b4d2..0fc49337 100644
--- a/setup.py
+++ b/setup.py
@@ -81,7 +81,7 @@ setup(
     # your project is installed. For an analysis of "install_requires" vs pip's
     # requirements files see:
     # https://packaging.python.org/en/latest/requirements.html
-    install_requires=['beautifulsoup4','chardet','html5lib'], # html5lib requires 'six'.
+    install_requires=['beautifulsoup4','chardet','html5lib','html2text'], # html5lib requires 'six'.
 
     # List additional groups of dependencies here (e.g. development
     # dependencies). You can install these using the following syntax,