mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-05 07:34:36 +01:00
Added "shielding" for break tags inside paragraphs, pre and blockquote tags. These should be left alone.
This commit is contained in:
parent
7d29b281b7
commit
e8e4180621
1 changed files with 29 additions and 6 deletions
|
|
@ -23,6 +23,7 @@ from . import exceptions as exceptions
|
|||
|
||||
def replace_br_with_p(body):
|
||||
|
||||
|
||||
# Character xA0 (code point 160 in Latin-1 and Unicode) is a non-breaking space; it is not part of 7-bit ASCII.
|
||||
# However, Python Regex does not recognize it as whitespace, so we'll be changing it to a regular space.
|
||||
body = body.replace(u'\xa0', u' ')
|
||||
|
|
@ -32,12 +33,33 @@ def replace_br_with_p(body):
|
|||
|
||||
# logger.debug(u'BODY start.: ' + body[:250])
|
||||
# logger.debug(u'BODY end...: ' + body[-250:])
|
||||
# logger.debug(u'BODY: ' + body)
|
||||
|
||||
# change surrounding div to a p and remove attrs Top surrounding
|
||||
# tag in all cases now should be div, to just strip the first and
|
||||
# last tags.
|
||||
body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'
|
||||
body = body[body.index('>')+1:body.rindex("<")]
|
||||
|
||||
# Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
|
||||
body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
|
||||
|
||||
# Find all existing blocks with p, pre and blockquote tags; we need to leave those alone.
|
||||
blocksRegex = re.compile(r'(\s*<br\ */*>\s*)*\s*<(pre|p|blockquote)([^>]*)>(.+?)</\2>\s*(\s*<br\ */*>\s*)*', re.DOTALL)
|
||||
body = blocksRegex.sub(r'\n<\2\3>\4</\2>\n', body)
|
||||
|
||||
blocks = blocksRegex.finditer(body)
|
||||
# For our replacements to work, we need to work backwards, so we reverse the iterator.
|
||||
blocksList = []
|
||||
for match in blocks:
|
||||
blocksList.insert(0, match)
|
||||
|
||||
for match in blocksList:
|
||||
group4 = match.group(4).replace(u'<br />', u'{br /}')
|
||||
body = body[:match.start(4)] + group4 + body[match.end(4):]
|
||||
|
||||
# change surrounding div to a p and remove attrs Top surrounding
|
||||
# tag in all cases now should be div, to just strip the first and
|
||||
# last tags.
|
||||
body = u'<p>' + body + u'</p>'
|
||||
|
||||
# Nuke div tags surrounding a HR tag.
|
||||
body = re.sub(r'<div[^>]+>\s*<hr[^>]+>\s*</div>', r'\n<hr />\n', body)
|
||||
|
|
@ -46,9 +68,6 @@ def replace_br_with_p(body):
|
|||
# This nukes the hr tag attributes.
|
||||
body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)
|
||||
|
||||
# Need to look at BeautifulSoup to see if it'll even return breaks that aren't properly formatted (<br />).
|
||||
body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)
|
||||
|
||||
# Remove leading and trailing breaks from HR tags
|
||||
body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)
|
||||
# Nuking breaks leading paragraphs that may be in the body. They are eventually treated as <p><br /></p>
|
||||
|
|
@ -117,7 +136,6 @@ def replace_br_with_p(body):
|
|||
breaksMaxIndex = 0
|
||||
breaksMax = breaksCount[0]
|
||||
|
||||
|
||||
logger.debug(u'---')
|
||||
logger.debug(u'breaks 1: ' + str(breaksCount[0]))
|
||||
logger.debug(u'breaks 2: ' + str(breaksCount[1]))
|
||||
|
|
@ -162,6 +180,9 @@ def replace_br_with_p(body):
|
|||
# change empty p tags to include a br to force spacing.
|
||||
body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)
|
||||
|
||||
# Clean up hr tags, and add inverted p tag pairs
|
||||
body = re.sub(r'(<div[^>]+>)*\s*<hr\ \/>\s*(</div>)*', r'\n<hr />\n', body)
|
||||
|
||||
# Clean up hr tags, and add inverted p tag pairs
|
||||
body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)
|
||||
|
||||
|
|
@ -180,6 +201,8 @@ def replace_br_with_p(body):
|
|||
# Remove empty tag pairs
|
||||
body = re.sub(r'\s*<(\S+)[^>]*>\s*</\1>', r'', body)
|
||||
|
||||
body = body.replace(u'{br /}', u'<br />')
|
||||
|
||||
# re-wrap in div tag.
|
||||
body = u'<div>\n' + body + u'\n</div>'
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue