Add make_soup() to base_adapter for bs4 soups.

This commit is contained in:
Jim Miller 2014-12-03 19:03:31 -06:00
parent 1f01481e47
commit 16213b6309
6 changed files with 30 additions and 37 deletions

View file

@ -21,9 +21,6 @@ logger = logging.getLogger(__name__)
import re
import urllib2
#from .. import BeautifulSoup as bs
import bs4 as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
@ -168,10 +165,10 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
meta = self._fetchUrl(metaurl,usecache=False)
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
soup = self.make_soup(data)
for tag in soup.findAll('div',id='admin-banner'):
tag.extract()
metasoup = bs.BeautifulSoup(meta)
metasoup = self.make_soup(meta)
for tag in metasoup.findAll('div',id='admin-banner'):
tag.extract()
@ -334,9 +331,9 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
chapter=self.make_soup('<div class="story"></div>').find('div')
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data)
soup = self.make_soup(data)
exclude_notes=self.getConfigList('exclude_notes')

View file

@ -24,8 +24,6 @@ import urllib2
from urllib import unquote_plus
import time
#from .. import BeautifulSoup as bs
import bs4 as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
@ -105,7 +103,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
try:
data = self._fetchUrl(url)
#logger.debug("\n===================\n%s\n===================\n"%data)
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -140,7 +138,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
if "not found. Please check to see you are not using an outdated url." \
not in newdata:
logger.debug('=======Found newer chapter: %s' % tryurl)
soup = bs.BeautifulSoup(newdata, "html5lib")
soup = self.make_soup(newdata)
except:
pass
@ -167,7 +165,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
self.story.addToList('category',stripHTML(categories[1]))
elif 'Crossover' in categories[0]['href']:
caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
catsoup = bs.BeautifulSoup(self._fetchUrl(caturl), "html5lib")
catsoup = self.make_soup(self._fetchUrl(caturl))
for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
self.story.addToList('category',stripHTML(a))
else:
@ -322,7 +320,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# data = data[data.index(divstr):]
# data = data.replace("<body","<notbody").replace("<BODY","<NOTBODY")
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
## Remove the 'share' button.
## No longer appears in the story text.

View file

@ -23,9 +23,6 @@ import urllib2
import time
import httplib, urllib
#from .. import BeautifulSoup as bs
import bs4 as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
@ -98,7 +95,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# non-existent/removed story urls get thrown to the front page.
if "<h2>Welcome to FicWad</h2>" in data:
raise exceptions.StoryDoesNotExist(self.url)
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
@ -120,7 +117,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
logger.debug("Normalizing to URL: "+url)
self._setURL(url)
try:
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
soup = self.make_soup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
@ -162,8 +159,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
## perhaps not the most efficient way to parse this, using
## regexps for each rather than something more complex, but
## IMO, it's more readable and amenable to change.
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ')
#print "metap: (%s)"%metastr
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ').replace(u'\u00a0',' ')
print "metastr: (%s)"%metastr
m = re.match(r".*?Rating: (.+?) -.*?",metastr)
if m:
@ -223,7 +220,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
soup = self.make_soup(self._fetchUrl(url))
span = soup.find('div', {'id' : 'storytext'})

View file

@ -20,7 +20,6 @@ import time
import logging
logger = logging.getLogger(__name__)
import bs4 as bs
from .. import exceptions
from base_adapter import BaseSiteAdapter, makeDate
@ -360,7 +359,7 @@ horizontal rules
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
'''
soup = bs.BeautifulSoup(text,'html5lib')
soup = self.make_soup(text)
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -22,10 +22,6 @@ import re
import urllib2
import time
from .. import BeautifulSoup as bs
import bs4 as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
@ -103,7 +99,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# <input type='text' id='urealname' name='urealname' value=''/>
# <input type='password' id='password' name='6bb3fcd148d148629223690bf19733b8'/>
# <input type='submit' value='Login' name='loginsubmit'/>
soup = bs.BeautifulSoup(self._fetchUrl(loginUrl), 'html5lib')
soup = self.make_soup(self._fetchUrl(loginUrl))
params['ctkn']=soup.find('input', {'name':'ctkn'})['value']
params[soup.find('input', {'id':'password'})['name']] = params['password']
@ -133,7 +129,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
try:
data = self._fetchUrl(url)
#print("data:%s"%data)
soup = bs.BeautifulSoup(data, 'html5lib')
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -156,7 +152,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# refetch story page.
## XXX - needs cache invalidate? Or at least check that it this needs doing...
data = self._fetchUrl(url,usecache=False)
soup = bs.BeautifulSoup(data, 'html5lib')
soup = self.make_soup(data)
if "NOTE: This story is rated FR21 which is above your chosen filter level." in data:
raise exceptions.AdultCheckRequired(self.url)
@ -174,7 +170,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
logger.debug("**AUTHOR** URL: "+authorurl)
authordata = self._fetchUrl(authorurl)
descurl=authorurl
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
authorsoup = self.make_soup(authordata)
# author can have several pages, scan until we find it.
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
nextarrow = authorsoup.find('a', {'class':'arrowf'})
@ -188,7 +184,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
logger.debug("**AUTHOR** nextpage URL: "+nextpage)
authordata = self._fetchUrl(nextpage)
descurl=nextpage
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
authorsoup = self.make_soup(authordata)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -207,7 +203,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
infourl = 'http://'+self.host+ainfo['href']
logger.debug("**StoryInfo** URL: "+infourl)
infodata = self._fetchUrl(infourl)
infosoup = bs.BeautifulSoup(infodata, 'html5lib')
infosoup = self.make_soup(infodata)
# for a in infosoup.findAll('a',href=re.compile(r"^/Author-\d+")):
# self.story.addToList('authorId',a['href'].split('/')[1].split('-')[1])
@ -295,7 +291,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url), 'html5lib')
soup = self.make_soup(self._fetchUrl(url))
div = soup.find('div', {'id' : 'storyinnerbody'})

View file

@ -26,8 +26,7 @@ import cookielib as cl
from functools import partial
import pickle
#from .. import BeautifulSoup as bs
import bs4 as bs
import bs4
from ..htmlcleanup import stripHTML
from ..htmlheuristics import replace_br_with_p
@ -470,7 +469,7 @@ class BaseSiteAdapter(Configurable):
if isinstance(svalue,basestring):
# bs4/html5lib add html, header and body tags, which
# we don't want.
svalue = bs.BeautifulSoup(svalue,"html5lib").body
svalue = bs4.BeautifulSoup(svalue,"html5lib").body
svalue.name='span'
self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
else:
@ -566,6 +565,13 @@ class BaseSiteAdapter(Configurable):
return retval
def make_soup(self,data):
    """Build a bs4 soup from *data* using the html5lib parser.

    Central convenience wrapper so updated adapters share one parser
    choice; older, non-updated adapters still call the bundled bs3
    library directly themselves.
    """
    # html5lib is the most lenient parser bs4 offers, which matters
    # for the frequently malformed HTML these sites serve.
    parser = 'html5lib'
    return bs4.BeautifulSoup(data, parser)
def cachedfetch(realfetch,cache,url):
if url in cache:
return cache[url]