mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-09 05:21:13 +02:00
Initial investigation into why some stories on AO3 aren't reformatted by Br_to_p
The cause is that the function shields breaks inside existing paragraphs, because these are seen as preformatted text blocks. Sadly, AO3 encapsulates story bodies in a p tag...
This commit is contained in:
parent
15b246c21e
commit
3cd6019ce3
1 changed files with 17 additions and 3 deletions
|
|
@ -33,9 +33,12 @@ def replace_br_with_p(body):
|
|||
if body.find('>') == -1 or body.rfind('<') == -1:
|
||||
return body
|
||||
|
||||
# logger.debug(u'---')
|
||||
# logger.debug(u'BODY start.: ' + body[:250])
|
||||
# logger.debug(u'--')
|
||||
# logger.debug(u'BODY end...: ' + body[-250:])
|
||||
# logger.debug(u'BODY.......: ' + body)
|
||||
# logger.debug(u'---')
|
||||
|
||||
# clean breaks (<br />), removing whitespaces between them.
|
||||
body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
|
||||
|
|
@ -43,14 +46,14 @@ def replace_br_with_p(body):
|
|||
# change surrounding div to a p and remove attrs Top surrounding
|
||||
# tag in all cases now should be div, to just strip the first and
|
||||
# last tags.
|
||||
if is_valid_block(body) and body.find('<div') == 0:
|
||||
body = body[body.index('>')+1:body.rindex('<')]
|
||||
while is_valid_block(body) and body.find('<div') == 0:
|
||||
body = body[body.index('>')+1:body.rindex('<')].strip()
|
||||
|
||||
body = soup_up_div(u'<div>' + body + u'</div>')
|
||||
|
||||
body = body[body.index('>')+1:body.rindex('<')]
|
||||
|
||||
# Find all bexisting blocks with p, pre and blockquote tags, we need to shields break tags inside those.
|
||||
# Find all existing blocks with p, pre and blockquote tags, we need to shields break tags inside those.
|
||||
# This is for "lenient" mode, however it is also used to clear break tags before and after the block elements.
|
||||
blocksRegex = re.compile(r'(\s*<br\ />\s*)*\s*<(pre|p|blockquote|table)([^>]*)>(.+?)</\2>\s*(\s*<br\ />\s*)*', re.DOTALL)
|
||||
body = blocksRegex.sub(r'\n<\2\3>\4</\2>\n', body)
|
||||
|
|
@ -90,6 +93,13 @@ def replace_br_with_p(body):
|
|||
# Nuking breaks trailing paragraps that may be in the body. They are eventually treated as <p><br /></p>
|
||||
body = re.sub(r'</p>\s*(<br\ \/>)+\s*', r'</p>\n<p></p>\n', body)
|
||||
|
||||
# logger.debug(u'--- 2 ---')
|
||||
# logger.debug(u'BODY start.: ' + body[:250])
|
||||
# logger.debug(u'--')
|
||||
# logger.debug(u'BODY end...: ' + body[-250:])
|
||||
# logger.debug(u'BODY.......: ' + body)
|
||||
# logger.debug(u'--- 2 ---')
|
||||
|
||||
# Because a leading or trailing non break tag will break the following code, we have to mess around rather badly for a few lines.
|
||||
body = body.replace(u'[',u'&squareBracketStart;')
|
||||
body = body.replace(u']',u'&squareBracketEnd;')
|
||||
|
|
@ -149,6 +159,10 @@ def replace_br_with_p(body):
|
|||
logger.debug(u'contentLinesSum...: ' + unicode(contentLinesSum))
|
||||
logger.debug(u'longestLineLength.: ' + unicode(longestLineLength))
|
||||
logger.debug(u'averageLineLength.: ' + unicode(averageLineLength))
|
||||
logger.debug(u'---')
|
||||
logger.debug(u'breaksMaxIndex....: ' + unicode(breaksMaxIndex))
|
||||
logger.debug(u'len(breaksCount)-1: ' + unicode(len(breaksCount)-1))
|
||||
logger.debug(u'breaksMax.........: ' + unicode(breaksMax))
|
||||
|
||||
if breaksMaxIndex == len(breaksCount)-1 and breaksMax < 2:
|
||||
breaksMaxIndex = 0
|
||||
|
|
|
|||
Loading…
Reference in a new issue