Add markers and check to prevent replace_br_with_p running more than once on the same text.

This commit is contained in:
Jim Miller 2017-07-19 11:12:02 -05:00
parent c243b1db3e
commit b3a32ae240

View file

@ -36,7 +36,13 @@ def replace_br_with_p(body):
logger.debug("replace_br_with_p time:%s"%(datetime.now() - start))
return retval
was_run_marker=u'FFF_replace_br_with_p_has_been_run'
def _replace_br_with_p(body):
if was_run_marker in body:
logger.debug("replace_br_with_p previously applied, skipping.")
return body
# Character xA0 (decimal 160) is the non-breaking space in Latin-1 and Unicode; it is outside the 7-bit ASCII range.
# However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
# .strip() so "\n<div>" at beginning is also recognized.
@ -256,23 +262,19 @@ def _replace_br_with_p(body):
body = re.sub(r'XAMP;(.+?);', r'&\1;', body)
body = body.strip()
## strip off extra <div> nestings that have built up over time.
# b='<div>'
# e='</div>'
# while body.startswith(b) and body.endswith(e):
# body = body[len(b):-len(e)].strip()
# re-wrap in div tag.
body = u'<div>\n' + body + u'</div>\n'
# return body
return tag_sanitizer(body)
body = u'<div id="' +was_run_marker+ u'">\n' + body + u'</div>\n'
# return body after tag_sanitizer with 'replace_br_with_p done' marker.
## marker included twice because the comment & id could each be
## removed by different 'clean ups'. I hope it's less likely both
## will be.
return u'<!-- ' +was_run_marker+ u' -->\n' + tag_sanitizer(body)
# Return True when the unicode text of `block` begins with '<' but does not
# begin with '<!' (find(...) == 0 means "found at the very first character").
# NOTE(review): the '<!' exclusion presumably rejects comments/declarations
# such as '<!-- ... -->' -- confirm against callers.
def is_valid_block(block):
return unicode(block).find('<') == 0 and unicode(block).find('<!') != 0
def soup_up_div(body):
blockTags = ['address', 'blockquote', 'del', 'div', 'dl', 'fieldset', 'form', 'ins', 'noscript', 'ol', 'p', 'pre', 'table', 'ul']
blockTags = ['address', 'aside', 'blockquote', 'del', 'div', 'dl', 'fieldset', 'form', 'ins', 'noscript', 'ol', 'p', 'pre', 'table', 'ul']
recurseTags = ['blockquote', 'div', 'noscript']
tag = body[:body.index('>')+1]