Added "shielding" for break tags inside paragraphs, pre and blockquote tags. These should be left alone.

2026-01-05 07:34:36 +01:00 · 2013-11-06 11:37:56 +01:00 · 2013-11-06 11:37:56 +01:00 · e8e4180621
commit e8e4180621
parent 7d29b281b7
1 changed files with 29 additions and 6 deletions
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@ -23,6 +23,7 @@ from . import exceptions as exceptions

 def replace_br_with_p(body):

+
    # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
    # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
    body = body.replace(u'\xa0', u' ')
@ -32,12 +33,33 @@ def replace_br_with_p(body):

    # logger.debug(u'BODY start.: ' + body[:250])
    # logger.debug(u'BODY end...: ' + body[-250:])
-    # logger.debug(u'BODY: ' + body)

    # change surrounding div to a p and remove attrs Top surrounding
    # tag in all cases now should be div, to just strip the first and
    # last tags.
-    body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'
+    body = body[body.index('>')+1:body.rindex("<")]
+
+    # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
+    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
+
+    # Find all existing blocks with p, pre and blockquote tags; we need to leave those alone.
+    blocksRegex = re.compile(r'(\s*<br\ */*>\s*)*\s*<(pre|p|blockquote)([^>]*)>(.+?)</\2>\s*(\s*<br\ */*>\s*)*', re.DOTALL)
+    body = blocksRegex.sub(r'\n<\2\3>\4</\2>\n', body)
+
+    blocks = blocksRegex.finditer(body)
+    # For our replacements to work, we need to work backwards, so we reverse the iterator.
+    blocksList = []
+    for match in blocks:
+        blocksList.insert(0, match)
+
+    for match in blocksList:
+        group4 =  match.group(4).replace(u'<br />', u'{br /}')
+        body = body[:match.start(4)] + group4 + body[match.end(4):]
+
+    # change surrounding div to a p and remove attrs Top surrounding
+    # tag in all cases now should be div, to just strip the first and
+    # last tags.
+    body = u'<p>' + body + u'</p>'

    # Nuke div tags surrounding a HR tag.
    body = re.sub(r'<div[^>]+>\s*<hr[^>]+>\s*</div>', r'\n<hr />\n', body)
@ -46,9 +68,6 @@ def replace_br_with_p(body):
    # This nukes the hr tag attributes.
    body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)

-    # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
-    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
-
    # Remove leading and trailing breaks from HR tags
    body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)
    # Nuking breaks leading paragraphs that may be in the body. They are eventually treated as <p><br /></p>
@ -117,7 +136,6 @@ def replace_br_with_p(body):
        breaksMaxIndex = 0
        breaksMax = breaksCount[0]

-
    logger.debug(u'---')
    logger.debug(u'breaks 1: ' + str(breaksCount[0]))
    logger.debug(u'breaks 2: ' + str(breaksCount[1]))
@ -162,6 +180,9 @@ def replace_br_with_p(body):
    # change empty p tags to include a br to force spacing.
    body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)

+    # Strip any div tags still wrapping an hr tag, leaving the bare hr on its own line
+    body = re.sub(r'(<div[^>]+>)*\s*<hr\ \/>\s*(</div>)*', r'\n<hr />\n', body)
+
    # Clean up hr tags, and add inverted p tag pairs
    body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)

@ -180,6 +201,8 @@ def replace_br_with_p(body):
    # Remove empty tag pairs
    body = re.sub(r'\s*<(\S+)[^>]*>\s*</\1>', r'', body)

+    body = body.replace(u'{br /}', u'<br />')
+    
    # re-wrap in div tag.
    body = u'<div>\n' + body + u'\n</div>'