From 3cd6019ce3e2f2b02ccc7a2717bb8e36ade0d548 Mon Sep 17 00:00:00 2001 From: asbjorn grandt Date: Fri, 23 Jan 2015 22:57:36 +0100 Subject: [PATCH] Initial investigation into why some stories on AO3 aren't reformatted by Br_to_p The cause is that the function shields breaks inside existing paragraphs; these are seen as preformatted text blocks. Sadly AO3 encapsulates story bodies into a p tag... --- fanficdownloader/htmlheuristics.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py index 52a78df4..7087c393 100644 --- a/fanficdownloader/htmlheuristics.py +++ b/fanficdownloader/htmlheuristics.py @@ -33,9 +33,12 @@ def replace_br_with_p(body): if body.find('>') == -1 or body.rfind('<') == -1: return body + # logger.debug(u'---') # logger.debug(u'BODY start.: ' + body[:250]) + # logger.debug(u'--') # logger.debug(u'BODY end...: ' + body[-250:]) # logger.debug(u'BODY.......: ' + body) + # logger.debug(u'---') # clean breaks (
), removing whitespaces between them. body = re.sub(r'\s*]*>\s*', r'
', body) @@ -43,14 +46,14 @@ def replace_br_with_p(body): # change surrounding div to a p and remove attrs Top surrounding # tag in all cases now should be div, to just strip the first and # last tags. - if is_valid_block(body) and body.find('')+1:body.rindex('<')] + while is_valid_block(body) and body.find('')+1:body.rindex('<')].strip() body = soup_up_div(u'
' + body + u'
') body = body[body.index('>')+1:body.rindex('<')] - # Find all bexisting blocks with p, pre and blockquote tags, we need to shields break tags inside those. + # Find all existing blocks with p, pre and blockquote tags, we need to shields break tags inside those. # This is for "lenient" mode, however it is also used to clear break tags before and after the block elements. blocksRegex = re.compile(r'(\s*\s*)*\s*<(pre|p|blockquote|table)([^>]*)>(.+?)\s*(\s*\s*)*', re.DOTALL) body = blocksRegex.sub(r'\n<\2\3>\4\n', body) @@ -90,6 +93,13 @@ def replace_br_with_p(body): # Nuking breaks trailing paragraps that may be in the body. They are eventually treated as


body = re.sub(r'

\s*()+\s*', r'

\n

\n', body) + # logger.debug(u'--- 2 ---') + # logger.debug(u'BODY start.: ' + body[:250]) + # logger.debug(u'--') + # logger.debug(u'BODY end...: ' + body[-250:]) + # logger.debug(u'BODY.......: ' + body) + # logger.debug(u'--- 2 ---') + # Because a leading or trailing non break tag will break the following code, we have to mess around rather badly for a few lines. body = body.replace(u'[',u'&squareBracketStart;') body = body.replace(u']',u'&squareBracketEnd;') @@ -149,6 +159,10 @@ def replace_br_with_p(body): logger.debug(u'contentLinesSum...: ' + unicode(contentLinesSum)) logger.debug(u'longestLineLength.: ' + unicode(longestLineLength)) logger.debug(u'averageLineLength.: ' + unicode(averageLineLength)) + logger.debug(u'---') + logger.debug(u'breaksMaxIndex....: ' + unicode(breaksMaxIndex)) + logger.debug(u'len(breaksCount)-1: ' + unicode(len(breaksCount)-1)) + logger.debug(u'breaksMax.........: ' + unicode(breaksMax)) if breaksMaxIndex == len(breaksCount)-1 and breaksMax < 2: breaksMaxIndex = 0