From 9d8508ee6f9a3517c497544a089df2f840431be9 Mon Sep 17 00:00:00 2001
From: asbjorn grandt <asbjorn.grandt@gmail.com>
Date: Sun, 27 Oct 2013 23:01:09 +0100
Subject: [PATCH] htmlheuristics needed to check if the parsed body actually had
 any tags in it, and is now replacing character xA0 (non-breaking space) with
 a regular space, as it interfered with regexp. base_adapter called
 replace_br_with_p with two arguments, where the function only supports one.

---
 fanficdownloader/adapters/base_adapter.py |  2 +-
 fanficdownloader/htmlheuristics.py        | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py
index 612af196..e2e87b8e 100644
--- a/fanficdownloader/adapters/base_adapter.py
+++ b/fanficdownloader/adapters/base_adapter.py
@@ -372,7 +372,7 @@ class BaseSiteAdapter(Configurable):
         if self.getConfig("replace_br_with_p"):
             # Apply heuristic processing to replace <br> paragraph
             # breaks with <p> tags.
-            retval = replace_br_with_p(self,retval)
+            retval = replace_br_with_p(retval)
             
         if self.getConfig('replace_hr'):
             # replacing a self-closing tag with a container tag in the
diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py
index fd9fe737..0d89c4e0 100644
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@@ -23,6 +23,17 @@ from . import exceptions as exceptions
 
 def replace_br_with_p(body):
 
+    logger.debug('Body Length.: %d' % len(body))
+    logger.debug('Body First >: %d' % body.find('>'))
+    logger.debug('Body Last <.: %d' % body.rfind("<"))
+
+    # Character xA0 (code point 160 in Latin-1 and Unicode; not part of 7-bit ASCII) is a non-breaking space.
+    # However, Python Regex does not recognize it as whitespace, so we'll be changing it to a regular space.
+    body = body.replace(u'\xa0', u' ')
+
+    if body.find('>') == -1 or body.rfind("<") == -1:
+        return body
+
     # change surrounding div to a p and remove attrs Top surrounding
     # tag in all cases now should be div, to just strip the first and
     # last tags.