import sys
from html2text import config
from html2text.compat import htmlentitydefs
def name2cp(k):
    """
    Translate an HTML entity name into its code point.

    :type k: str
    :returns: The integer code point for the entity name
    :rtype: int
    :raises KeyError: if the name is not in ``htmlentitydefs.name2codepoint``
    """
    # 'apos' is answered directly instead of going through the lookup table.
    if k != 'apos':
        return htmlentitydefs.name2codepoint[k]
    return ord("'")
# Same replacements as config.UNIFIABLE, but keyed by code point (via
# name2cp) instead of by entity name.
unifiable_n = dict(
    (name2cp(name), config.UNIFIABLE[name])
    for name in config.UNIFIABLE.keys()
)
def hn(tag):
    """
    Return the heading level encoded in a tag name.

    A heading tag is exactly two characters long: an ``h`` followed by a
    digit from 1 to 9.

    :type tag: str
    :returns: The heading level (1-9), or 0 when the tag is not a heading
    :rtype: int
    """
    # Checking the length first keeps an empty tag from raising IndexError.
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        level = int(tag[1])
    except ValueError:
        # e.g. 'hr': second character is not a digit
        return 0
    if 1 <= level <= 9:
        return level
    # 'h0' is not a heading
    return 0
def dumb_property_dict(style):
    """
    Parse a CSS declaration list such as ``"color: red; margin: 0"``.

    Declarations are separated by ';' and split at their first ':'.
    Fragments without a ':' are ignored.  Names and values are stripped
    and lower-cased; a repeated name keeps its last value.

    :type style: str
    :returns: A hash of css attributes
    :rtype: dict
    """
    properties = {}
    for declaration in style.split(';'):
        if ':' not in declaration:
            continue
        name, _, value = declaration.partition(':')
        properties[name.strip().lower()] = value.strip().lower()
    return properties
def dumb_css_parser(data):
    """
    Parse a stylesheet into per-selector property dictionaries.

    :type data: str
    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    :rtype: dict
    """
    # Remove @import statements: cut from each '@import' up to and including
    # the next ';'.  A ';' is appended first so the final statement is
    # terminated too.
    data += ';'
    start = data.find('@import')
    while start != -1:
        end = data.find(';', start)
        data = data[:start] + data[end + 1:]
        start = data.find('@import')

    # Parse the css.  Written without a dictionary comprehension in order to
    # support older pythons.
    rules = {}
    try:
        for chunk in data.split('}'):
            if '{' not in chunk:
                continue
            # More than one '{' in a chunk makes the unpacking raise.
            selector, body = chunk.split('{')
            rules[selector.strip()] = dumb_property_dict(body)
    except ValueError:  # pragma: no cover
        return {}  # not that important
    return rules
def element_style(attrs, style_def, parent_style):
    """
    Compute the effective style of an element.

    Starts from a copy of the parent's style, applies the rules of every
    CSS class named in ``attrs['class']`` (in the order listed), and
    finally applies the inline ``style`` attribute, which therefore wins
    over both.  ``parent_style`` itself is not modified.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict
    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    style = parent_style.copy()
    # An attribute may be present with no value (the standard library HTML
    # parser reports those as None); treat that the same as an absent one
    # instead of failing on None.split().
    css_classes = attrs.get('class')
    if css_classes:
        for css_class in css_classes.split():
            style.update(style_def.get('.' + css_class, {}))
    inline_style = attrs.get('style')
    if inline_style:
        style.update(dumb_property_dict(inline_style))
    return style
def google_list_style(style):
    """
    Classify a list as ordered or unordered from its 'list-style-type'.

    :type style: dict
    :returns: 'ul' for the disc, circle, square and none types; 'ol' for
        any other type or when the property is absent
    :rtype: str
    """
    unordered_types = ('disc', 'circle', 'square', 'none')
    if style.get('list-style-type') in unordered_types:
        return 'ul'
    return 'ol'
def google_has_height(style):
    """
    Tell whether the element's style explicitly defines 'height'.

    :type style: dict
    :rtype: bool
    """
    return 'height' in style
def google_text_emphasis(style):
    """
    Collect the values of the emphasis-related properties of an element.

    :type style: dict
    :returns: A list of all emphasis modifiers of the element, in the
        order text-decoration, font-style, font-weight
    :rtype: list
    """
    emphasis_properties = ('text-decoration', 'font-style', 'font-weight')
    return [style[prop] for prop in emphasis_properties if prop in style]
def google_fixed_width_font(style):
    """
    Tell whether the element's 'font-family' is a fixed width font.

    Only the exact values 'courier new' and 'consolas' are recognised.

    :type style: dict
    :rtype: bool
    """
    return style.get('font-family', '') in ('courier new', 'consolas')
def list_numbering_start(attrs):
    """
    Extract numbering from list element attributes

    :type attrs: dict
    :returns: The value of the 'start' attribute minus one, or 0 when the
        attribute is absent or cannot be converted to an integer
    :rtype: int
    """
    try:
        return int(attrs['start']) - 1
    except (KeyError, TypeError, ValueError):
        # KeyError: no 'start' attribute; TypeError: attribute present
        # without a value (None); ValueError: value is not an integer.
        return 0
def skipwrap(para, wrap_links):
    """
    Decide whether a paragraph must be emitted without re-wrapping.

    :type para: str
    :type wrap_links: bool
    :returns: True when the paragraph should be left as it is, False when
        it may be wrapped
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap
    if para.startswith(('    ', '\t')):
        return True
    stripped = para.lstrip()
    # An empty or whitespace-only paragraph cannot start with any of the
    # markers tested below.
    if not stripped:
        return False
    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False
    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a br-inside-span case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ('-', '*') and not stripped[0:2] == '**':
        return True
    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
            config.RE_UNORDERED_LIST_MATCHER.match(stripped):
        return True
    return False
def wrapwrite(text):
    """
    Write text to standard output as UTF-8 encoded bytes.

    :type text: str
    """
    payload = text.encode('utf-8')
    try:
        # Python3: bytes go to the underlying binary buffer
        sys.stdout.buffer.write(payload)
    except AttributeError:
        # No usable ``buffer`` attribute: write to the stream itself
        sys.stdout.write(payload)
def wrap_read():  # pragma: no cover
    """
    Read everything available on standard input.

    :rtype: str
    """
    try:
        content = sys.stdin.read()
    except AttributeError:
        # Fall back to the underlying buffer when the stream itself
        # cannot be read this way.
        content = sys.stdin.buffer.read()
    return content
def escape_md(text):
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.

    Every match of ``config.RE_MD_CHARS_MATCHER`` is replaced by a
    backslash followed by the pattern's first group.
    """
    matcher = config.RE_MD_CHARS_MATCHER
    escaped = matcher.sub(r"\\\1", text)
    return escaped
def escape_md_section(text, snob=False):
    """
    Escapes markdown-sensitive characters across whole document sections.

    Backslash matches are escaped first.  When ``snob`` is true, every
    match of ``config.RE_MD_CHARS_MATCHER_ALL`` is escaped as well.  The
    dot, plus and dash matchers are then applied in that order, each
    inserting a backslash between its first and second group.
    """
    escaped = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
    if snob:
        escaped = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", escaped)
    escaped = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", escaped)
    escaped = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", escaped)
    return config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", escaped)
def reformat_table(lines, right_margin):
    """
    Given the lines of a table
    pads the cells and returns the new lines

    Cells are separated by '|'.  Every column is widened to its longest
    cell (ignoring trailing whitespace) plus ``right_margin``.  A line made
    up solely of '-' and '|' characters is padded with '-', any other line
    with spaces.

    :type lines: list
    :type right_margin: int
    :returns: The padded lines, one per input line; an empty list for an
        empty table
    :rtype: list
    """
    # An empty table has no first row to measure.
    if not lines:
        return []

    # Split and strip every line once; both passes below use the result.
    rows = [[cell.rstrip() for cell in line.split('|')] for line in lines]

    # find the maximum width of the columns
    max_width = [len(cell) + right_margin for cell in rows[0]]
    for cols in rows:
        missing = len(max_width) - len(cols)
        # don't drop any data if colspan attributes result in unequal lengths
        if missing > 0:
            # Short row: measure it as if it had empty trailing cells so
            # that zip() below does not truncate max_width.
            measured = cols + [''] * missing
        else:
            measured = cols
            if missing < 0:
                # Long row: start tracking the columns not seen before.
                max_width += [len(cell) + right_margin
                              for cell in cols[len(max_width):]]
        max_width = [max(len(cell) + right_margin, old_len)
                     for cell, old_len in zip(measured, max_width)]

    # reformat
    new_lines = []
    for line, cols in zip(lines, rows):
        if set(line.strip()) == set('-|'):
            filler = '-'
        else:
            filler = ' '
        new_cols = [cell + filler * (width - len(cell))
                    for cell, width in zip(cols, max_width)]
        new_lines.append('|'.join(new_cols))
    return new_lines
def pad_tables_in_text(text, right_margin=1):
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle "inside a
    table" on and off and are themselves removed.  The lines between an
    opening and a closing marker are reformatted with ``reformat_table``
    and followed by an empty line; all other lines are kept unchanged.

    :type text: str
    :type right_margin: int
    :returns: The text with its marked tables padded
    :rtype: str
    """
    new_lines = []
    table_buffer, table_started = [], False
    for line in text.split('\n'):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker: emit the collected table.  A table with
                # no lines is skipped, reformat_table needs a first row.
                if table_buffer:
                    new_lines.extend(
                        reformat_table(table_buffer, right_margin))
                    table_buffer = []
                new_lines.append('')
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)
    # A table that was opened but never closed: keep its lines unchanged
    # rather than silently dropping them.
    new_lines.extend(table_buffer)
    return '\n'.join(new_lines)