mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
298 lines
7.7 KiB
Python
298 lines
7.7 KiB
Python
import sys
|
|
|
|
from html2text import config
|
|
from html2text.compat import htmlentitydefs
|
|
|
|
|
|
def name2cp(k):
|
|
"""Return sname to codepoint"""
|
|
if k == 'apos':
|
|
return ord("'")
|
|
return htmlentitydefs.name2codepoint[k]
|
|
|
|
|
|
unifiable_n = {}
|
|
|
|
for k in config.UNIFIABLE.keys():
|
|
unifiable_n[name2cp(k)] = config.UNIFIABLE[k]
|
|
|
|
|
|
def hn(tag):
|
|
if tag[0] == 'h' and len(tag) == 2:
|
|
try:
|
|
n = int(tag[1])
|
|
if n in range(1, 10): # pragma: no branch
|
|
return n
|
|
except ValueError:
|
|
return 0
|
|
|
|
|
|
def dumb_property_dict(style):
|
|
"""
|
|
:returns: A hash of css attributes
|
|
"""
|
|
out = dict([(x.strip().lower(), y.strip().lower()) for x, y in
|
|
[z.split(':', 1) for z in
|
|
style.split(';') if ':' in z
|
|
]
|
|
]
|
|
)
|
|
|
|
return out
|
|
|
|
|
|
def dumb_css_parser(data):
|
|
"""
|
|
:type data: str
|
|
|
|
:returns: A hash of css selectors, each of which contains a hash of
|
|
css attributes.
|
|
:rtype: dict
|
|
"""
|
|
# remove @import sentences
|
|
data += ';'
|
|
importIndex = data.find('@import')
|
|
while importIndex != -1:
|
|
data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
|
|
importIndex = data.find('@import')
|
|
|
|
# parse the css. reverted from dictionary comprehension in order to
|
|
# support older pythons
|
|
elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
|
|
try:
|
|
elements = dict([(a.strip(), dumb_property_dict(b))
|
|
for a, b in elements])
|
|
except ValueError: # pragma: no cover
|
|
elements = {} # not that important
|
|
|
|
return elements
|
|
|
|
|
|
def element_style(attrs, style_def, parent_style):
|
|
"""
|
|
:type attrs: dict
|
|
:type style_def: dict
|
|
:type style_def: dict
|
|
|
|
:returns: A hash of the 'final' style attributes of the element
|
|
:rtype: dict
|
|
"""
|
|
style = parent_style.copy()
|
|
if 'class' in attrs:
|
|
for css_class in attrs['class'].split():
|
|
css_style = style_def.get('.' + css_class, {})
|
|
style.update(css_style)
|
|
if 'style' in attrs:
|
|
immediate_style = dumb_property_dict(attrs['style'])
|
|
style.update(immediate_style)
|
|
|
|
return style
|
|
|
|
|
|
def google_list_style(style):
|
|
"""
|
|
Finds out whether this is an ordered or unordered list
|
|
|
|
:type style: dict
|
|
|
|
:rtype: str
|
|
"""
|
|
if 'list-style-type' in style:
|
|
list_style = style['list-style-type']
|
|
if list_style in ['disc', 'circle', 'square', 'none']:
|
|
return 'ul'
|
|
|
|
return 'ol'
|
|
|
|
|
|
def google_has_height(style):
|
|
"""
|
|
Check if the style of the element has the 'height' attribute
|
|
explicitly defined
|
|
|
|
:type style: dict
|
|
|
|
:rtype: bool
|
|
"""
|
|
if 'height' in style:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def google_text_emphasis(style):
|
|
"""
|
|
:type style: dict
|
|
|
|
:returns: A list of all emphasis modifiers of the element
|
|
:rtype: list
|
|
"""
|
|
emphasis = []
|
|
if 'text-decoration' in style:
|
|
emphasis.append(style['text-decoration'])
|
|
if 'font-style' in style:
|
|
emphasis.append(style['font-style'])
|
|
if 'font-weight' in style:
|
|
emphasis.append(style['font-weight'])
|
|
|
|
return emphasis
|
|
|
|
|
|
def google_fixed_width_font(style):
|
|
"""
|
|
Check if the css of the current element defines a fixed width font
|
|
|
|
:type style: dict
|
|
|
|
:rtype: bool
|
|
"""
|
|
font_family = ''
|
|
if 'font-family' in style:
|
|
font_family = style['font-family']
|
|
if 'courier new' == font_family or 'consolas' == font_family:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def list_numbering_start(attrs):
|
|
"""
|
|
Extract numbering from list element attributes
|
|
|
|
:type attrs: dict
|
|
|
|
:rtype: int or None
|
|
"""
|
|
if 'start' in attrs:
|
|
try:
|
|
return int(attrs['start']) - 1
|
|
except ValueError:
|
|
pass
|
|
|
|
return 0
|
|
|
|
|
|
def skipwrap(para, wrap_links):
|
|
# If it appears to contain a link
|
|
# don't wrap
|
|
if (len(config.RE_LINK.findall(para)) > 0) and not wrap_links:
|
|
return True
|
|
# If the text begins with four spaces or one tab, it's a code block;
|
|
# don't wrap
|
|
if para[0:4] == ' ' or para[0] == '\t':
|
|
return True
|
|
|
|
# If the text begins with only two "--", possibly preceded by
|
|
# whitespace, that's an emdash; so wrap.
|
|
stripped = para.lstrip()
|
|
if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
|
|
return False
|
|
|
|
# I'm not sure what this is for; I thought it was to detect lists,
|
|
# but there's a <br>-inside-<span> case in one of the tests that
|
|
# also depends upon it.
|
|
if stripped[0:1] == '-' or stripped[0:1] == '*':
|
|
return True
|
|
|
|
# If the text begins with a single -, *, or +, followed by a space,
|
|
# or an integer, followed by a ., followed by a space (in either
|
|
# case optionally proceeded by whitespace), it's a list; don't wrap.
|
|
if config.RE_ORDERED_LIST_MATCHER.match(stripped) or \
|
|
config.RE_UNORDERED_LIST_MATCHER.match(stripped):
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
def wrapwrite(text):
|
|
text = text.encode('utf-8')
|
|
try: # Python3
|
|
sys.stdout.buffer.write(text)
|
|
except AttributeError:
|
|
sys.stdout.write(text)
|
|
|
|
|
|
def wrap_read(): # pragma: no cover
|
|
"""
|
|
:rtype: str
|
|
"""
|
|
try:
|
|
return sys.stdin.read()
|
|
except AttributeError:
|
|
return sys.stdin.buffer.read()
|
|
|
|
|
|
def escape_md(text):
|
|
"""
|
|
Escapes markdown-sensitive characters within other markdown
|
|
constructs.
|
|
"""
|
|
return config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
|
|
|
|
|
|
def escape_md_section(text, snob=False):
|
|
"""
|
|
Escapes markdown-sensitive characters across whole document sections.
|
|
"""
|
|
text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
|
|
|
|
if snob:
|
|
text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)
|
|
|
|
text = config.RE_MD_DOT_MATCHER.sub(r"\1\\\2", text)
|
|
text = config.RE_MD_PLUS_MATCHER.sub(r"\1\\\2", text)
|
|
text = config.RE_MD_DASH_MATCHER.sub(r"\1\\\2", text)
|
|
|
|
return text
|
|
|
|
def reformat_table(lines, right_margin):
|
|
"""
|
|
Given the lines of a table
|
|
padds the cells and returns the new lines
|
|
"""
|
|
# find the maximum width of the columns
|
|
max_width = [len(x.rstrip()) + right_margin for x in lines[0].split('|')]
|
|
for line in lines:
|
|
cols = [x.rstrip() for x in line.split('|')]
|
|
max_width = [max(len(x) + right_margin, old_len)
|
|
for x, old_len in zip(cols, max_width)]
|
|
|
|
# reformat
|
|
new_lines = []
|
|
for line in lines:
|
|
cols = [x.rstrip() for x in line.split('|')]
|
|
if set(line.strip()) == set('-|'):
|
|
filler = '-'
|
|
new_cols = [x.rstrip() + (filler * (M - len(x.rstrip())))
|
|
for x, M in zip(cols, max_width)]
|
|
else:
|
|
filler = ' '
|
|
new_cols = [x.rstrip() + (filler * (M - len(x.rstrip())))
|
|
for x, M in zip(cols, max_width)]
|
|
new_lines.append('|'.join(new_cols))
|
|
return new_lines
|
|
|
|
def pad_tables_in_text(text, right_margin=1):
|
|
"""
|
|
Provide padding for tables in the text
|
|
"""
|
|
lines = text.split('\n')
|
|
table_buffer, altered_lines, table_widths, table_started = [], [], [], False
|
|
new_lines = []
|
|
for line in lines:
|
|
# Toogle table started
|
|
if (config.TABLE_MARKER_FOR_PAD in line):
|
|
table_started = not table_started
|
|
if not table_started:
|
|
table = reformat_table(table_buffer, right_margin)
|
|
new_lines.extend(table)
|
|
table_buffer = []
|
|
new_lines.append('')
|
|
continue
|
|
# Process lines
|
|
if table_started:
|
|
table_buffer.append(line)
|
|
else:
|
|
new_lines.append(line)
|
|
new_text = '\n'.join(new_lines)
|
|
return new_text
|