Add markers and check to prevent replace_br_with_p running more than once on the same text.

2025-12-28 11:45:19 +01:00 · 2017-07-19 11:12:02 -05:00 · 2017-07-19 11:12:02 -05:00 · b3a32ae240
commit b3a32ae240
parent c243b1db3e
1 changed files with 13 additions and 11 deletions
--- a/fanficfare/htmlheuristics.py
+++ b/fanficfare/htmlheuristics.py
@ -36,7 +36,13 @@ def replace_br_with_p(body):
    logger.debug("replace_br_with_p time:%s"%(datetime.now() - start))
    return retval

+was_run_marker=u'FFF_replace_br_with_p_has_been_run'
 def _replace_br_with_p(body):
+
+    if was_run_marker in body:
+        logger.debug("replace_br_with_p previously applied, skipping.")
+        return body
+
    # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
    # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
    # .strip() so "\n<div>" at beginning is also recognized.
@ -256,23 +262,19 @@ def _replace_br_with_p(body):
    body = re.sub(r'XAMP;(.+?);', r'&\1;', body)
    body = body.strip()

-    ## strip off extra <div> nestings that have built up over time.
-    # b='<div>'
-    # e='</div>'
-    # while body.startswith(b) and body.endswith(e):
-    #     body = body[len(b):-len(e)].strip()
-
    # re-wrap in div tag.
-    body = u'<div>\n' + body + u'</div>\n'
-
-    # return body
-    return tag_sanitizer(body)
+    body = u'<div id="' +was_run_marker+ u'">\n' + body + u'</div>\n'
+    # return body after tag_sanitizer with 'replace_br_with_p done' marker.
+    ## marker included twice because the comment & id could each be
+    ## removed by different 'clean ups'.  I hope it's less likely both
+    ## will be.
+    return u'<!-- ' +was_run_marker+ u' -->\n' + tag_sanitizer(body)

 def is_valid_block(block):
    return unicode(block).find('<') == 0 and unicode(block).find('<!') != 0

 def soup_up_div(body):
-    blockTags = ['address', 'blockquote', 'del', 'div', 'dl', 'fieldset', 'form', 'ins', 'noscript', 'ol', 'p', 'pre', 'table', 'ul']
+    blockTags = ['address', 'aside', 'blockquote', 'del', 'div', 'dl', 'fieldset', 'form', 'ins', 'noscript', 'ol', 'p', 'pre', 'table', 'ul']
    recurseTags = ['blockquote', 'div', 'noscript']

    tag = body[:body.index('>')+1]