Incorporating heuristic br->p tag processing as an optional feature.

This commit is contained in:
Jim Miller 2013-10-26 15:52:40 -05:00
parent 2195ea5792
commit df5a91daed
9 changed files with 149 additions and 16 deletions

View file

@ -303,7 +303,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))

View file

@ -180,7 +180,7 @@ class FictionPadSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url))
soup = bs.BeautifulSoup("<div id='story'>"+self._fetchUrl(url)+"</div>")
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -166,10 +166,11 @@ class NickAndGregNetAdapter(BaseSiteAdapter):
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = soup.find('table', {'class' : 'tblborder6'})
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# wrap a div around it.
divsoup = bs.BeautifulStoneSoup('<div class="story"></div>',
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
div = divsoup.find('div')
div.append(soup.find('table', {'class' : 'tblborder6'}))
return self.utf8FromSoup(url,div)

View file

@ -214,5 +214,6 @@ class SimplyUndeniableComAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
div.name='div'
return self.utf8FromSoup(url,div)

View file

@ -241,5 +241,6 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if None == story:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
story.name='div'
return self.utf8FromSoup(url,story)

View file

@ -323,7 +323,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
</div>
'''
elif self.story.getMetadata('storyId') == '0':
text=u'''
text=u'''<div>
<h3>45. Pronglet Returns to Hogwarts: Chapter 7</h3>
<br />
eyes but Im not convinced we should automatically<br />
@ -332,6 +332,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
<br /><br />
Sure, invite her along. Does she have children?<br />
<br />
</div>
'''
else:
if self.story.getMetadata('storyId') == '667':

View file

@ -228,7 +228,8 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
span.name='div'
return self.utf8FromSoup(url,span)
def getClass():

View file

@ -26,6 +26,7 @@ from functools import partial
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from ..htmlheuristics import replace_br_with_p
logger = logging.getLogger(__name__)
@ -354,13 +355,8 @@ class BaseSiteAdapter(Configurable):
# removes paired, but empty tags.
if t.string != None and len(t.string.strip()) == 0 :
t.extract()
retval = soup.__str__('utf8').decode('utf-8')
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.
retval = retval.replace("<hr />","<div class='center'>* * *</div>")
retval = soup.__str__('utf8').decode('utf-8')
if self.getConfig('nook_img_fix'):
# if the <img> tag doesn't have a div or a p around it,
@ -371,7 +367,19 @@ class BaseSiteAdapter(Configurable):
# Don't want body tags in chapter html--writers add them.
# This is primarily for epub updates.
return re.sub(r"</?body>\r?\n?","",retval)
retval = re.sub(r"</?body>\r?\n?","",retval)
if self.getConfig("replace_br_with_p"):
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
retval = replace_br_with_p(self,retval)
if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.
retval = retval.replace("<hr />","<div class='center'>* * *</div>")
return retval
def cachedfetch(realfetch,cache,url):
if url in cache:

View file

@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
logger = logging.getLogger(__name__)
import re
from . import exceptions as exceptions
def replace_br_with_p(body):
    """Heuristically convert <br />-separated text into <p> paragraphs.

    ``body`` is an HTML fragment whose outermost tag is assumed to be a
    single container element (normally a div).  The most common
    run-length of consecutive <br /> tags is treated as a paragraph
    break; strictly longer runs are rendered as horizontal rules.
    Returns the reworked fragment wrapped in a fresh <div>.

    Raises ValueError (from str.index/str.rindex) if ``body`` does not
    start and end with a tag.
    """
    # Change the surrounding container tag to a p and drop its attrs.
    # The top surrounding tag in all cases now should be a div, so just
    # strip the first and last tags.
    body = u'<p>'+body[body.index('>')+1:body.rindex("<")]+u'</p>'

    # So many people add formatting to their HR tags, and ePub does not
    # allow those (css is supposed to be used instead).  This nukes the
    # hr tag attributes.
    body = re.sub(r'\s*<hr[^>]+>\s*', r'\n<hr />\n', body)

    # Normalize every br variant (BeautifulSoup may not return breaks
    # that aren't properly formatted as <br />) and strip surrounding
    # whitespace.
    body = re.sub(r'\s*<br[^>]*>\s*', r'<br />', body)

    # Remove leading and trailing breaks around hr tags.
    body = re.sub(r'\s*(<br\ \/>)*\s*<hr\ \/>\s*(<br\ \/>)*\s*', r'\n<hr />\n', body)

    # Nuke breaks leading paragraphs that may be in the body; they are
    # eventually treated as <p><br /></p>.
    body = re.sub(r'\s*(<br\ \/>)+\s*<p', r'\n<p></p>\n<p', body)
    # Nuke breaks trailing paragraphs likewise.
    body = re.sub(r'</p>\s*(<br\ \/>)+\s*', r'</p>\n<p></p>\n', body)

    # A leading or trailing non-break tag would confuse the run-counting
    # below, so temporarily rewrite <br /> as [br /] -- after protecting
    # any literal square brackets already present.
    body = body.replace(u'[',u'&squareBracketStart;')
    body = body.replace(u']',u'&squareBracketEnd;')
    body = body.replace(u'<br />',u'[br /]')

    # Patterns matching runs of exactly 1..8 consecutive breaks, plus a
    # catch-all for runs of 9 or more.  The lookaround-style groups
    # ([^\]] / [^\[]) pin the run length exactly.
    breaks_regexp = [re.compile(r'([^\]])(\[br\ \/\]){%d}([^\[])' % n)
                     for n in range(1, 9)]
    breaks_regexp.append(re.compile(r'(\[br\ \/\]){9,}'))

    # How often each run length (1..8) occurs in the body.
    breaks_count = [len(rx.findall(body)) for rx in breaks_regexp[:8]]
    # Index of the most frequent run length; ties and the all-zero case
    # resolve to the smallest index, matching a strictly-greater scan.
    breaks_max_index = max(range(len(breaks_count)),
                           key=breaks_count.__getitem__)

    # Replace runs no longer than the most frequent run with inverted
    # p tag pairs (paragraph breaks); longer runs become a horizontal
    # line between paragraphs.
    for i, rx in enumerate(breaks_regexp[:8]):
        if i <= breaks_max_index:
            body = rx.sub(r'\1</p>\n<p>\3', body)
        else:
            body = rx.sub(r'\1</p>\n<hr />\n<p>\3', body)
    body = breaks_regexp[8].sub(r'</p>\n<hr />\n<p>', body)

    # Revert the square-bracket protection.
    body = body.replace(u'[', u'<')
    body = body.replace(u']', u'>')
    body = body.replace(u'&squareBracketStart;', u'[')
    body = body.replace(u'&squareBracketEnd;', u']')

    # If for some reason a break made its way inside a paragraph,
    # replace it with an empty paragraph for the additional line
    # spacing.
    body = re.sub(r'<p>\s*(<br\ \/>)+', r'<p><br /></p>\n<p>', body)

    # Change empty p tags to include a br to force spacing.
    body = re.sub(r'<p>\s*</p>', r'<p><br/></p>', body)

    # Clean up hr tags, and add inverted p tag pairs around them.
    body = re.sub(r'\s*<hr\ \/>\s*', r'</p>\n<hr />\n<p>', body)

    # The previous regexp may cause trouble if the hr tag already had a
    # p tag pair around it, so repair that here.  Repeated opening p
    # tags are condensed to one; since the extra leading opening p tags
    # were just added, the last in such a chain must be the original --
    # keep its attributes if they are there.
    body = re.sub(r'\s*(<p[^>]*>\s*)+<p([^>]*)>\s*', r'\n<p\2>', body)
    # Repeated closing p tags are condensed to one.
    body = re.sub(r'\s*(<\/\s*p>\s*){2,}', r'</p>\n', body)

    # Superfluous cleaning: remove whitespace trailing opening p tags.
    body = re.sub(r'<p([^>]*)>\s*', r'<p\1>', body)
    # Superfluous cleaning: remove whitespace leading closing p tags;
    # this does not affect formatting.
    body = re.sub(r'\s*</p>', r'</p>', body)

    # Re-wrap in a div tag.
    body = u'<div>\n' + body + u'\n</div>'

    return body