From 9d8508ee6f9a3517c497544a089df2f840431be9 Mon Sep 17 00:00:00 2001 From: asbjorn grandt Date: Sun, 27 Oct 2013 23:01:09 +0100 Subject: [PATCH] htmlheuristics needed to check if the parsed body actually had any tags in it, and is now removing ASCII character xA0 (whitespace) as it interfered with regexp. base_adapter called replace_br_with_p with two arguments, where the function only supports one. --- fanficdownloader/adapters/base_adapter.py | 2 +- fanficdownloader/htmlheuristics.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py index 612af196..e2e87b8e 100644 --- a/fanficdownloader/adapters/base_adapter.py +++ b/fanficdownloader/adapters/base_adapter.py @@ -372,7 +372,7 @@ class BaseSiteAdapter(Configurable): if self.getConfig("replace_br_with_p"): # Apply heuristic processing to replace
<br> paragraph # breaks with <p>
tags. - retval = replace_br_with_p(self,retval) + retval = replace_br_with_p(retval) if self.getConfig('replace_hr'): # replacing a self-closing tag with a container tag in the diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py index fd9fe737..0d89c4e0 100644 --- a/fanficdownloader/htmlheuristics.py +++ b/fanficdownloader/htmlheuristics.py @@ -23,6 +23,17 @@ from . import exceptions as exceptions def replace_br_with_p(body): + logger.debug('Body Length.: %d' % len(body)) + logger.debug('Body First >: %d' % body.find('>')) + logger.debug('Body Last <.: %d' % body.rfind("<")) + + # Ascii character (and Unicode as well) xA0 is a non-breaking space, ascii code 160. + # However, Python Regex does not recognize it as a whitespace, so we'll be changing it to a regular space. + body = body.replace(u'\xa0', u' ') + + if body.find('>') == -1 or body.rfind("<") == -1: + return body + # change surrounding div to a p and remove attrs Top surrounding # tag in all cases now should be div, to just strip the first and # last tags.