From 3b04b6ad6148a3e20720eae51ac54bce04cbc4ca Mon Sep 17 00:00:00 2001
From: asbjorn grandt
Date: Thu, 31 Oct 2013 22:07:07 +0100
Subject: [PATCH] TtH.org sometimes places a div tag around hr tags; this caused
the heuristics to generate faulty HTML. Also, TtH adds an empty div at the end;
when it is placed inside the last paragraph, the ePub will fail.
---
fanficdownloader/htmlheuristics.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/fanficdownloader/htmlheuristics.py b/fanficdownloader/htmlheuristics.py
index 44c68fa2..14dea96a 100644
--- a/fanficdownloader/htmlheuristics.py
+++ b/fanficdownloader/htmlheuristics.py
@@ -35,6 +35,9 @@ def replace_br_with_p(body):
# last tags.
body = u''+body[body.index('>')+1:body.rindex("<")]+u'
'
+ # Nuke div tags surrounding a HR tag.
+ body = re.sub(r']+>\s*
]+>\s*
', r'\n
\n', body)
+
# So many people add formatting to their HR tags, and ePub does not allow those, we are supposed to use css.
# This nukes the hr tag attributes.
body = re.sub(r'\s*
]+>\s*', r'\n
\n', body)
@@ -120,6 +123,9 @@ def replace_br_with_p(body):
# superflous cleaning, remove whitespaces leading closing p tags. These does not affect formatting.
body = re.sub(r'\s*', r'', body)
+ # Remove empty tag pairs
+ body = re.sub(r'\s*<(\S+)[^>]*>\s*\1>', r'', body)
+
# re-wrap in div tag.
body = u'\n' + body + u'\n
'