From 16213b6309b0f79ef6cfb9b7b6be9630c7ec22c0 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 3 Dec 2014 19:03:31 -0600 Subject: [PATCH] Add make_soup() to base_adapter for bs4 soups. --- .../adapters/adapter_archiveofourownorg.py | 11 ++++------- .../adapters/adapter_fanfictionnet.py | 10 ++++------ fanficdownloader/adapters/adapter_ficwadcom.py | 13 +++++-------- fanficdownloader/adapters/adapter_test1.py | 3 +-- .../adapters/adapter_tthfanficorg.py | 18 +++++++----------- fanficdownloader/adapters/base_adapter.py | 12 +++++++++--- 6 files changed, 30 insertions(+), 37 deletions(-) diff --git a/fanficdownloader/adapters/adapter_archiveofourownorg.py b/fanficdownloader/adapters/adapter_archiveofourownorg.py index 3dfd12d9..4c93a43f 100644 --- a/fanficdownloader/adapters/adapter_archiveofourownorg.py +++ b/fanficdownloader/adapters/adapter_archiveofourownorg.py @@ -21,9 +21,6 @@ logger = logging.getLogger(__name__) import re import urllib2 -#from .. import BeautifulSoup as bs -import bs4 as bs - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions @@ -168,10 +165,10 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): meta = self._fetchUrl(metaurl,usecache=False) # use BeautifulSoup HTML parser to make everything easier to find. - soup = bs.BeautifulSoup(data) + soup = self.make_soup(data) for tag in soup.findAll('div',id='admin-banner'): tag.extract() - metasoup = bs.BeautifulSoup(meta) + metasoup = self.make_soup(meta) for tag in metasoup.findAll('div',id='admin-banner'): tag.extract() @@ -334,9 +331,9 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) - chapter=bs.BeautifulSoup('
').find('div') + chapter=self.make_soup('
').find('div') data = self._fetchUrl(url) - soup = bs.BeautifulSoup(data) + soup = self.make_soup(data) exclude_notes=self.getConfigList('exclude_notes') diff --git a/fanficdownloader/adapters/adapter_fanfictionnet.py b/fanficdownloader/adapters/adapter_fanfictionnet.py index 93208cd7..baac7c5f 100644 --- a/fanficdownloader/adapters/adapter_fanfictionnet.py +++ b/fanficdownloader/adapters/adapter_fanfictionnet.py @@ -24,8 +24,6 @@ import urllib2 from urllib import unquote_plus import time -#from .. import BeautifulSoup as bs -import bs4 as bs from .. import exceptions as exceptions from ..htmlcleanup import stripHTML @@ -105,7 +103,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) #logger.debug("\n===================\n%s\n===================\n"%data) - soup = bs.BeautifulSoup(data, "html5lib") + soup = self.make_soup(data) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) @@ -140,7 +138,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): if "not found. Please check to see you are not using an outdated url." \ not in newdata: logger.debug('=======Found newer chapter: %s' % tryurl) - soup = bs.BeautifulSoup(newdata, "html5lib") + soup = self.make_soup(newdata) except: pass @@ -167,7 +165,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): self.story.addToList('category',stripHTML(categories[1])) elif 'Crossover' in categories[0]['href']: caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href']) - catsoup = bs.BeautifulSoup(self._fetchUrl(caturl), "html5lib") + catsoup = self.make_soup(self._fetchUrl(caturl)) for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")): self.story.addToList('category',stripHTML(a)) else: @@ -322,7 +320,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): # data = data[data.index(divstr):] # data = data.replace("Welcome to FicWad" in data: raise exceptions.StoryDoesNotExist(self.url) - soup = bs.BeautifulSoup(data, "html5lib") + soup = self.make_soup(data) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) @@ -120,7 +117,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter): logger.debug("Normalizing to URL: "+url) self._setURL(url) try: - soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib") + soup = self.make_soup(self._fetchUrl(url)) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(self.url) @@ -162,8 +159,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter): ## perhaps not the most efficient way to parse this, using ## regexps for each rather than something more complex, but ## IMO, it's more readable and amenable to change. - metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ') - #print "metap: (%s)"%metastr + metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ').replace(u'\u00a0',' ') + print "metastr: (%s)"%metastr m = re.match(r".*?Rating: (.+?) -.*?",metastr) if m: @@ -223,7 +220,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter): def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) - soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib") + soup = self.make_soup(self._fetchUrl(url)) span = soup.find('div', {'id' : 'storytext'}) diff --git a/fanficdownloader/adapters/adapter_test1.py b/fanficdownloader/adapters/adapter_test1.py index c662d5c5..4bf8525a 100644 --- a/fanficdownloader/adapters/adapter_test1.py +++ b/fanficdownloader/adapters/adapter_test1.py @@ -20,7 +20,6 @@ import time import logging logger = logging.getLogger(__name__) -import bs4 as bs from .. import exceptions from base_adapter import BaseSiteAdapter, makeDate @@ -360,7 +359,7 @@ horizontal rules

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

''' - soup = bs.BeautifulSoup(text,'html5lib') + soup = self.make_soup(text) return self.utf8FromSoup(url,soup) def getClass(): diff --git a/fanficdownloader/adapters/adapter_tthfanficorg.py b/fanficdownloader/adapters/adapter_tthfanficorg.py index bfd6bddf..0e527178 100644 --- a/fanficdownloader/adapters/adapter_tthfanficorg.py +++ b/fanficdownloader/adapters/adapter_tthfanficorg.py @@ -22,10 +22,6 @@ import re import urllib2 import time -from .. import BeautifulSoup as bs -import bs4 as bs - - from ..htmlcleanup import stripHTML from .. import exceptions as exceptions @@ -103,7 +99,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): # # # - soup = bs.BeautifulSoup(self._fetchUrl(loginUrl), 'html5lib') + soup = self.make_soup(self._fetchUrl(loginUrl)) params['ctkn']=soup.find('input', {'name':'ctkn'})['value'] params[soup.find('input', {'id':'password'})['name']] = params['password'] @@ -133,7 +129,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): try: data = self._fetchUrl(url) #print("data:%s"%data) - soup = bs.BeautifulSoup(data, 'html5lib') + soup = self.make_soup(data) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) @@ -156,7 +152,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): # refetch story page. ## XXX - needs cache invalidate? Or at least check that it this needs doing... data = self._fetchUrl(url,usecache=False) - soup = bs.BeautifulSoup(data, 'html5lib') + soup = self.make_soup(data) if "NOTE: This story is rated FR21 which is above your chosen filter level." in data: raise exceptions.AdultCheckRequired(self.url) @@ -174,7 +170,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): logger.debug("**AUTHOR** URL: "+authorurl) authordata = self._fetchUrl(authorurl) descurl=authorurl - authorsoup = bs.BeautifulSoup(authordata, 'html5lib') + authorsoup = self.make_soup(authordata) # author can have several pages, scan until we find it. while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ): nextarrow = authorsoup.find('a', {'class':'arrowf'}) @@ -188,7 +184,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): logger.debug("**AUTHOR** nextpage URL: "+nextpage) authordata = self._fetchUrl(nextpage) descurl=nextpage - authorsoup = bs.BeautifulSoup(authordata, 'html5lib') + authorsoup = self.make_soup(authordata) except urllib2.HTTPError, e: if e.code == 404: raise exceptions.StoryDoesNotExist(url) @@ -207,7 +203,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): infourl = 'http://'+self.host+ainfo['href'] logger.debug("**StoryInfo** URL: "+infourl) infodata = self._fetchUrl(infourl) - infosoup = bs.BeautifulSoup(infodata, 'html5lib') + infosoup = self.make_soup(infodata) # for a in infosoup.findAll('a',href=re.compile(r"^/Author-\d+")): # self.story.addToList('authorId',a['href'].split('/')[1].split('-')[1]) @@ -295,7 +291,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) - soup = bs.BeautifulSoup(self._fetchUrl(url), 'html5lib') + soup = self.make_soup(self._fetchUrl(url)) div = soup.find('div', {'id' : 'storyinnerbody'}) diff --git a/fanficdownloader/adapters/base_adapter.py b/fanficdownloader/adapters/base_adapter.py index 4f21ebc9..91cd1a68 100644 --- a/fanficdownloader/adapters/base_adapter.py +++ b/fanficdownloader/adapters/base_adapter.py @@ -26,8 +26,7 @@ import cookielib as cl from functools import partial import pickle -#from .. import BeautifulSoup as bs -import bs4 as bs +import bs4 from ..htmlcleanup import stripHTML from ..htmlheuristics import replace_br_with_p @@ -470,7 +469,7 @@ class BaseSiteAdapter(Configurable): if isinstance(svalue,basestring): # bs4/html5lib add html, header and body tags, which # we don't want. - svalue = bs.BeautifulSoup(svalue,"html5lib").body + svalue = bs4.BeautifulSoup(svalue,"html5lib").body svalue.name='span' self.story.setMetadata('description',self.utf8FromSoup(url,svalue)) else: @@ -566,6 +565,13 @@ class BaseSiteAdapter(Configurable): return retval + def make_soup(self,data): + ''' + Convenience method for getting a bs4 soup. Older and + non-updated adapters call the included bs3 library themselves. + ''' + return bs4.BeautifulSoup(data,'html5lib') + def cachedfetch(realfetch,cache,url): if url in cache: return cache[url]