htmlheuristics needed to check if the parsed body actually had any tags in it,

and is now replacing the character xA0 (non-breaking space) with a regular space, as it interfered with regexp. base_adapter called replace_br_with_p with two arguments, whereas the function only supports one.
2026-05-08 21:11:59 +02:00 · 2013-10-27 23:01:09 +01:00 · 2013-10-27 23:01:09 +01:00 · 9d8508ee6f
commit 9d8508ee6f
parent df5a91daed
2 changed files with 12 additions and 1 deletions
--- a/fanficdownloader/adapters/base_adapter.py
+++ b/fanficdownloader/adapters/base_adapter.py
@ -372,7 +372,7 @@ class BaseSiteAdapter(Configurable):
        if self.getConfig("replace_br_with_p"):
            # Apply heuristic processing to replace <br> paragraph
            # breaks with <p> tags.
-            retval = replace_br_with_p(self,retval)
+            retval = replace_br_with_p(retval)
            
        if self.getConfig('replace_hr'):
            # replacing a self-closing tag with a container tag in the
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@ -23,6 +23,17 @@ from . import exceptions as exceptions

 def replace_br_with_p(body):

+    logger.debug('Body Length.: %d' % len(body))
+    logger.debug('Body First >: %d' % body.find('>'))
+    logger.debug('Body Last <.: %d' % body.rfind("<"))
+
+    # Character xA0 (code point 160, Latin-1 and Unicode; outside 7-bit ASCII) is a non-breaking space.
+    # However, Python Regex does not recognize it as whitespace, so we'll be changing it to a regular space.
+    body = body.replace(u'\xa0', u' ')
+
+    if body.find('>') == -1 or body.rfind("<") == -1:
+        return body
+
    # change surrounding div to a p and remove attrs Top surrounding
    # tag in all cases now should be div, to just strip the first and
    # last tags.