diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py index 426b3fcb..d33e2b20 100644 --- a/fanficdownloader/htmlheuristics.py +++ b/fanficdownloader/htmlheuristics.py @@ -23,6 +23,7 @@ from . import exceptions as exceptions def replace_br_with_p(body): + # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160. # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a reagular space. body = body.replace(u'\xa0', u' ') @@ -32,12 +33,33 @@ def replace_br_with_p(body): # logger.debug(u'BODY start.: ' + body[:250]) # logger.debug(u'BODY end...: ' + body[-250:]) - # logger.debug(u'BODY: ' + body) # change surrounding div to a p and remove attrs Top surrounding # tag in all cases now should be div, to just strip the first and # last tags. - body = u'

'+body[body.index('>')+1:body.rindex("<")]+u'

' + body = body[body.index('>')+1:body.rindex("<")] + + # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (
). + body = re.sub(r'\s*]*>\s*', r'
', body) + + # Find all existing blocks with p, pre and blockquote tags, we need to leave those alone. + blocksRegex = re.compile(r'(\s*\s*)*\s*<(pre|p|blockquote)([^>]*)>(.+?)\s*(\s*\s*)*', re.DOTALL) + body = blocksRegex.sub(r'\n<\2\3>\4\n', body) + + blocks = blocksRegex.finditer(body) + # For our replacements to work, we need to work backwards, so we reverse the iterator. + blocksList = [] + for match in blocks: + blocksList.insert(0, match) + + for match in blocksList: + group4 = match.group(4).replace(u'
', u'{br /}') + body = body[:match.start(4)] + group4 + body[match.end(4):] + + # change surrounding div to a p and remove attrs Top surrounding + # tag in all cases now should be div, to just strip the first and + # last tags. + body = u'

' + body + u'

' # Nuke div tags surrounding a HR tag. body = re.sub(r']+>\s*]+>\s*', r'\n
\n', body) @@ -46,9 +68,6 @@ def replace_br_with_p(body): # This nukes the hr tag attributes. body = re.sub(r'\s*]+>\s*', r'\n
\n', body) - # Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (
). - body = re.sub(r'\s*]*>\s*', r'
', body) - # Remove leading and trailing breaks from HR tags body = re.sub(r'\s*()*\s*\s*()*\s*', r'\n
\n', body) # Nuking breaks leading paragraps that may be in the body. They are eventually treated as


@@ -117,7 +136,6 @@ def replace_br_with_p(body): breaksMaxIndex = 0 breaksMax = breaksCount[0] - logger.debug(u'---') logger.debug(u'breaks 1: ' + str(breaksCount[0])) logger.debug(u'breaks 2: ' + str(breaksCount[1])) @@ -162,6 +180,9 @@ def replace_br_with_p(body): # change empty p tags to include a br to force spacing. body = re.sub(r'

\s*

', r'


', body) + # Clean up hr tags, and add inverted p tag pairs + body = re.sub(r'(]+>)*\s*\s*()*', r'\n
\n', body) + # Clean up hr tags, and add inverted p tag pairs body = re.sub(r'\s*\s*', r'

\n
\n

', body) @@ -180,6 +201,8 @@ def replace_br_with_p(body): # Remove empty tag pairs body = re.sub(r'\s*<(\S+)[^>]*>\s*', r'', body) + body = body.replace(u'{br /}', u'
') + # re-wrap in div tag. body = u'

\n' + body + u'\n
'