mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-08 21:11:59 +02:00
htmlheuristics needed to check whether the parsed body actually had any tags in it,
and it now replaces the character xA0 (non-breaking space) with a regular space, as it interfered with the regexp. base_adapter called replace_br_with_p with two arguments, whereas the function only supports one.
This commit is contained in:
parent
df5a91daed
commit
9d8508ee6f
2 changed files with 12 additions and 1 deletions
|
|
@ -372,7 +372,7 @@ class BaseSiteAdapter(Configurable):
|
|||
if self.getConfig("replace_br_with_p"):
|
||||
# Apply heuristic processing to replace <br> paragraph
|
||||
# breaks with <p> tags.
|
||||
retval = replace_br_with_p(self,retval)
|
||||
retval = replace_br_with_p(retval)
|
||||
|
||||
if self.getConfig('replace_hr'):
|
||||
# replacing a self-closing tag with a container tag in the
|
||||
|
|
|
|||
|
|
@ -23,6 +23,17 @@ from . import exceptions as exceptions
|
|||
|
||||
def replace_br_with_p(body):
|
||||
|
||||
logger.debug('Body Length.: %d' % len(body))
|
||||
logger.debug('Body First >: %d' % body.find('>'))
|
||||
logger.debug('Body Last <.: %d' % body.rfind("<"))
|
||||
|
||||
# Character xA0 (Latin-1 and Unicode U+00A0) is a non-breaking space, code point 160.
|
||||
# However, Python regex does not recognize it as whitespace, so we change it to a regular space.
|
||||
body = body.replace(u'\xa0', u' ')
|
||||
|
||||
if body.find('>') == -1 or body.rfind("<") == -1:
|
||||
return body
|
||||
|
||||
# change surrounding div to a p and remove attrs Top surrounding
|
||||
# tag in all cases now should be div, to just strip the first and
|
||||
# last tags.
|
||||
|
|
|
|||
Loading…
Reference in a new issue