From e8e418062196abdd056e4fe23bb2bbef76478b31 Mon Sep 17 00:00:00 2001
From: asbjorn grandt <asbjorn.grandt@gmail.com>
Date: Wed, 6 Nov 2013 11:37:56 +0100
Subject: [PATCH] Added "shielding" for break tags inside paragraphs, pre and
 blockquote tags. These should be left alone.

---
 fanficdownloader/htmlheuristics.py | 35 +++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)
diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py
index 426b3fcb..d33e2b20 100644
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@@ -23,6 +23,7 @@ from . import exceptions as exceptions
 
 def replace_br_with_p(body):
 
+
     # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
     # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a reagular space.
     body = body.replace(u'\xa0', u' ')
@@ -32,12 +33,33 @@ def replace_br_with_p(body):
 
     # logger.debug(u'BODY start.: ' + body[:250])
     # logger.debug(u'BODY end...: ' + body[-250:])
-    # logger.debug(u'BODY: ' + body)
 
     # change surrounding div to a p and remove attrs Top surrounding
     # tag in all cases now should be div, to just strip the first and
     # last tags.
-    body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'
+    body = body[body.index('>')+1:body.rindex("<")]
+
+    # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
+    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
+
+    # Find all existing blocks with p, pre and blockquote tags; we need to leave those alone.
+    blocksRegex = re.compile(r'(\s*<br\ */*>\s*)*\s*<(pre|p|blockquote)([^>]*)>(.+?)</\2>\s*(\s*<br\ */*>\s*)*', re.DOTALL)
+    body = blocksRegex.sub(r'\n<\2\3>\4</\2>\n', body)
+
+    blocks = blocksRegex.finditer(body)
+    # For our replacements to work, we need to work backwards, so we reverse the iterator.
+    blocksList = []
+    for match in blocks:
+        blocksList.insert(0, match)
+
+    for match in blocksList:
+        group4 =  match.group(4).replace(u'<br />', u'{br /}')
+        body = body[:match.start(4)] + group4 + body[match.end(4):]
+
+    # The surrounding tag (a div in all cases now) was already stripped
+    # above, together with its attrs, so all that is left to do here is
+    # to wrap the remaining body in a plain p tag.
+    body = u'<p>' + body + u'</p>'
 
     # Nuke div tags surrounding a HR tag.
     body = re.sub(r'<div[^>]+>\s*<hr[^>]+>\s*</div>', r'\n<hr />\n', body)
@@ -46,9 +68,6 @@ def replace_br_with_p(body):
     # This nukes the hr tag attributes.
     body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)
 
-    # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
-    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
-
     # Remove leading and trailing breaks from HR tags
     body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)
     # Nuking breaks leading paragraps that may be in the body. They are eventually treated as <p><br /></p>
@@ -117,7 +136,6 @@ def replace_br_with_p(body):
         breaksMaxIndex = 0
         breaksMax = breaksCount[0]
 
-
     logger.debug(u'---')
     logger.debug(u'breaks 1: ' + str(breaksCount[0]))
     logger.debug(u'breaks 2: ' + str(breaksCount[1]))
@@ -162,6 +180,9 @@ def replace_br_with_p(body):
     # change empty p tags to include a br to force spacing.
     body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)
 
+    # Strip div tags directly surrounding an hr tag, leaving the hr on its own line.
+    body = re.sub(r'(<div[^>]+>)*\s*<hr\ \/>\s*(</div>)*', r'\n<hr />\n', body)
+
     # Clean up hr tags, and add inverted p tag pairs
     body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)
 
@@ -180,6 +201,8 @@ def replace_br_with_p(body):
     # Remove empty tag pairs
     body = re.sub(r'\s*<(\S+)[^>]*>\s*</\1>', r'', body)
 
+    body = body.replace(u'{br /}', u'<br />')
+    
     # re-wrap in div tag.
     body = u'<div>\n' + body + u'\n</div>'