diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py
index 426b3fcb..d33e2b20 100644
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@@ -23,6 +23,7 @@ from . import exceptions as exceptions
def replace_br_with_p(body):
+
# Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
# However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
body = body.replace(u'\xa0', u' ')
@@ -32,12 +33,33 @@ def replace_br_with_p(body):
# logger.debug(u'BODY start.: ' + body[:250])
# logger.debug(u'BODY end...: ' + body[-250:])
- # logger.debug(u'BODY: ' + body)
# change surrounding div to a p and remove attrs Top surrounding
# tag in all cases now should be div, to just strip the first and
# last tags.
- body = u'
'+body[body.index('>')+1:body.rindex("<")]+u'
'
+ body = body[body.index('>')+1:body.rindex("<")]
+
+ # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (
).
+ body = re.sub(r'\s*
]*>\s*', r'
', body)
+
+ # Find all existing blocks with p, pre and blockquote tags, we need to leave those alone.
+ blocksRegex = re.compile(r'(\s*
\s*)*\s*<(pre|p|blockquote)([^>]*)>(.+?)\2>\s*(\s*
\s*)*', re.DOTALL)
+ body = blocksRegex.sub(r'\n<\2\3>\4\2>\n', body)
+
+ blocks = blocksRegex.finditer(body)
+ # For our replacements to work, we need to work backwards, so we reverse the iterator.
+ blocksList = []
+ for match in blocks:
+ blocksList.insert(0, match)
+
+ for match in blocksList:
+ group4 = match.group(4).replace(u'
', u'{br /}')
+ body = body[:match.start(4)] + group4 + body[match.end(4):]
+
+ # change surrounding div to a p and remove attrs Top surrounding
+ # tag in all cases now should be div, to just strip the first and
+ # last tags.
+ body = u'' + body + u'
'
# Nuke div tags surrounding a HR tag.
body = re.sub(r']+>\s*
]+>\s*', r'\n
\n', body)
@@ -46,9 +68,6 @@ def replace_br_with_p(body):
# This nukes the hr tag attributes.
body = re.sub(r'\s*
]+>\s*', r'\n
\n', body)
- # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (
).
- body = re.sub(r'\s*
]*>\s*', r'
', body)
-
# Remove leading and trailing breaks from HR tags
body = re.sub(r'\s*(
)*\s*
\s*(
)*\s*', r'\n
\n', body)
# Nuking breaks leading paragraphs that may be in the body. They are eventually treated as
@@ -117,7 +136,6 @@ def replace_br_with_p(body):
breaksMaxIndex = 0
breaksMax = breaksCount[0]
-
logger.debug(u'---')
logger.debug(u'breaks 1: ' + str(breaksCount[0]))
logger.debug(u'breaks 2: ' + str(breaksCount[1]))
@@ -162,6 +180,9 @@ def replace_br_with_p(body):
# change empty p tags to include a br to force spacing.
body = re.sub(r'\s*
', r'
', body)
+ # Clean up hr tags, and add inverted p tag pairs
+ body = re.sub(r'(]+>)*\s*
\s*()*', r'\n
\n', body)
+
# Clean up hr tags, and add inverted p tag pairs
body = re.sub(r'\s*
\s*', r'\n
\n', body)
@@ -180,6 +201,8 @@ def replace_br_with_p(body):
# Remove empty tag pairs
body = re.sub(r'\s*<(\S+)[^>]*>\s*\1>', r'', body)
+ body = body.replace(u'{br /}', u'
')
+
# re-wrap in div tag.
body = u'
\n' + body + u'\n
'