htmlheuristics needed to check if the parsed body actually had any tags in it,

and is now removing the character xA0 (non-breaking space) as it interfered with
  regexp.
base_adapter called replace_br_with_p with two arguments, where the function
  only supports one.
This commit is contained in:
asbjorn grandt 2013-10-27 23:01:09 +01:00
parent df5a91daed
commit 9d8508ee6f
2 changed files with 12 additions and 1 deletions

View file

@ -372,7 +372,7 @@ class BaseSiteAdapter(Configurable):
if self.getConfig("replace_br_with_p"):
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
retval = replace_br_with_p(self,retval)
retval = replace_br_with_p(retval)
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the

View file

@ -23,6 +23,17 @@ from . import exceptions as exceptions
def replace_br_with_p(body):
logger.debug('Body Length.: %d' % len(body))
logger.debug('Body First >: %d' % body.find('>'))
logger.debug('Body Last <.: %d' % body.rfind("<"))
# Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160.
# However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space.
body = body.replace(u'\xa0', u' ')
if body.find('>') == -1 or body.rfind("<") == -1:
return body
# change surrounding div to a p and remove attrs Top surrounding
# tag in all cases now should be div, to just strip the first and
# last tags.