From 3cd6019ce3e2f2b02ccc7a2717bb8e36ade0d548 Mon Sep 17 00:00:00 2001
From: asbjorn grandt <asbjorn.grandt@gmail.com>
Date: Fri, 23 Jan 2015 22:57:36 +0100
Subject: [PATCH] Initial investigation into why some stories on AO3 aren't
 reformatted by Br_to_p. The cause is that the function shields breaks inside
 existing paragraphs, since these are seen as preformatted text blocks. Sadly, AO3
 encapsulates story bodies in a p tag...

---
 fanficdownloader/htmlheuristics.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py
index 52a78df4..7087c393 100644
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@@ -33,9 +33,12 @@ def replace_br_with_p(body):
     if body.find('>') == -1 or body.rfind('<') == -1:
         return body
 
+    # logger.debug(u'---')
     # logger.debug(u'BODY start.: ' + body[:250])
+    # logger.debug(u'--')
     # logger.debug(u'BODY end...: ' + body[-250:])
     # logger.debug(u'BODY.......: ' + body)
+    # logger.debug(u'---')
 
     # clean breaks (<br />), removing whitespaces between them.
     body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
@@ -43,14 +46,14 @@ def replace_br_with_p(body):
     # change surrounding div to a p and remove attrs Top surrounding
     # tag in all cases now should be div, to just strip the first and
     # last tags.
-    if is_valid_block(body) and body.find('<div') == 0:
-        body = body[body.index('>')+1:body.rindex('<')]
+    while is_valid_block(body) and body.find('<div') == 0:
+        body = body[body.index('>')+1:body.rindex('<')].strip()
 
     body = soup_up_div(u'<div>' + body + u'</div>')
 
     body = body[body.index('>')+1:body.rindex('<')]
 
-    # Find all bexisting blocks with p, pre and blockquote tags, we need to shields break tags inside those.
+    # Find all existing blocks with p, pre, blockquote and table tags; we need to shield the break tags inside those.
     # This is for "lenient" mode, however it is also used to clear break tags before and after the block elements.
     blocksRegex = re.compile(r'(\s*<br\ />\s*)*\s*<(pre|p|blockquote|table)([^>]*)>(.+?)</\2>\s*(\s*<br\ />\s*)*', re.DOTALL)
     body = blocksRegex.sub(r'\n<\2\3>\4</\2>\n', body)
@@ -90,6 +93,13 @@ def replace_br_with_p(body):
     # Nuking breaks trailing paragraps that may be in the body. They are eventually treated as <p><br /></p>
     body = re.sub(r'</p>\s*(<br\ \/>)+\s*', r'</p>\n<p></p>\n', body)
 
+    # logger.debug(u'--- 2 ---')
+    # logger.debug(u'BODY start.: ' + body[:250])
+    # logger.debug(u'--')
+    # logger.debug(u'BODY end...: ' + body[-250:])
+    # logger.debug(u'BODY.......: ' + body)
+    # logger.debug(u'--- 2 ---')
+
     # Because a leading or trailing non break tag will break the following code, we have to mess around rather badly for a few lines.
     body = body.replace(u'[',u'&squareBracketStart;')
     body = body.replace(u']',u'&squareBracketEnd;')
@@ -149,6 +159,10 @@ def replace_br_with_p(body):
     logger.debug(u'contentLinesSum...: ' + unicode(contentLinesSum))
     logger.debug(u'longestLineLength.: ' + unicode(longestLineLength))
     logger.debug(u'averageLineLength.: ' + unicode(averageLineLength))
+    logger.debug(u'---')
+    logger.debug(u'breaksMaxIndex....: ' + unicode(breaksMaxIndex))
+    logger.debug(u'len(breaksCount)-1: ' + unicode(len(breaksCount)-1))
+    logger.debug(u'breaksMax.........: ' + unicode(breaksMax))
 
     if breaksMaxIndex == len(breaksCount)-1 and breaksMax < 2:
         breaksMaxIndex = 0