Add make_soup() to base_adapter for bs4 soups.

This commit is contained in:
Jim Miller 2014-12-03 19:03:31 -06:00
parent 1f01481e47
commit 16213b6309
6 changed files with 30 additions and 37 deletions

View file

@ -21,9 +21,6 @@ logger = logging.getLogger(__name__)
import re
import urllib2
#from .. import BeautifulSoup as bs
import bs4 as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
@ -168,10 +165,10 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
meta = self._fetchUrl(metaurl,usecache=False)
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
soup = self.make_soup(data)
for tag in soup.findAll('div',id='admin-banner'):
tag.extract()
metasoup = bs.BeautifulSoup(meta)
metasoup = self.make_soup(meta)
for tag in metasoup.findAll('div',id='admin-banner'):
tag.extract()
@ -334,9 +331,9 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
chapter=self.make_soup('<div class="story"></div>').find('div')
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data)
soup = self.make_soup(data)
exclude_notes=self.getConfigList('exclude_notes')

View file

@ -24,8 +24,6 @@ import urllib2
from urllib import unquote_plus
import time
#from .. import BeautifulSoup as bs
import bs4 as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
@ -105,7 +103,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
try:
data = self._fetchUrl(url)
#logger.debug("\n===================\n%s\n===================\n"%data)
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -140,7 +138,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
if "not found. Please check to see you are not using an outdated url." \
not in newdata:
logger.debug('=======Found newer chapter: %s' % tryurl)
soup = bs.BeautifulSoup(newdata, "html5lib")
soup = self.make_soup(newdata)
except:
pass
@ -167,7 +165,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
self.story.addToList('category',stripHTML(categories[1]))
elif 'Crossover' in categories[0]['href']:
caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
catsoup = bs.BeautifulSoup(self._fetchUrl(caturl), "html5lib")
catsoup = self.make_soup(self._fetchUrl(caturl))
for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
self.story.addToList('category',stripHTML(a))
else:
@ -322,7 +320,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
# data = data[data.index(divstr):]
# data = data.replace("<body","<notbody").replace("<BODY","<NOTBODY")
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
## Remove the 'share' button.
## No longer appears in the story text.

View file

@ -23,9 +23,6 @@ import urllib2
import time
import httplib, urllib
#from .. import BeautifulSoup as bs
import bs4 as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
@ -98,7 +95,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# non-existent/removed story urls get thrown to the front page.
if "<h2>Welcome to FicWad</h2>" in data:
raise exceptions.StoryDoesNotExist(self.url)
soup = bs.BeautifulSoup(data, "html5lib")
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
@ -120,7 +117,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
logger.debug("Normalizing to URL: "+url)
self._setURL(url)
try:
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
soup = self.make_soup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
@ -162,8 +159,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
## perhaps not the most efficient way to parse this, using
## regexps for each rather than something more complex, but
## IMO, it's more readable and amenable to change.
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ')
#print "metap: (%s)"%metastr
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ').replace(u'\u00a0',' ')
print "metastr: (%s)"%metastr
m = re.match(r".*?Rating: (.+?) -.*?",metastr)
if m:
@ -223,7 +220,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
soup = self.make_soup(self._fetchUrl(url))
span = soup.find('div', {'id' : 'storytext'})

View file

@ -20,7 +20,6 @@ import time
import logging
logger = logging.getLogger(__name__)
import bs4 as bs
from .. import exceptions
from base_adapter import BaseSiteAdapter, makeDate
@ -360,7 +359,7 @@ horizontal rules
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
</div>
'''
soup = bs.BeautifulSoup(text,'html5lib')
soup = self.make_soup(text)
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -22,10 +22,6 @@ import re
import urllib2
import time
from .. import BeautifulSoup as bs
import bs4 as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
@ -103,7 +99,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# <input type='text' id='urealname' name='urealname' value=''/>
# <input type='password' id='password' name='6bb3fcd148d148629223690bf19733b8'/>
# <input type='submit' value='Login' name='loginsubmit'/>
soup = bs.BeautifulSoup(self._fetchUrl(loginUrl), 'html5lib')
soup = self.make_soup(self._fetchUrl(loginUrl))
params['ctkn']=soup.find('input', {'name':'ctkn'})['value']
params[soup.find('input', {'id':'password'})['name']] = params['password']
@ -133,7 +129,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
try:
data = self._fetchUrl(url)
#print("data:%s"%data)
soup = bs.BeautifulSoup(data, 'html5lib')
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -156,7 +152,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# refetch story page.
## XXX - needs cache invalidate? Or at least check that it this needs doing...
data = self._fetchUrl(url,usecache=False)
soup = bs.BeautifulSoup(data, 'html5lib')
soup = self.make_soup(data)
if "NOTE: This story is rated FR21 which is above your chosen filter level." in data:
raise exceptions.AdultCheckRequired(self.url)
@ -174,7 +170,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
logger.debug("**AUTHOR** URL: "+authorurl)
authordata = self._fetchUrl(authorurl)
descurl=authorurl
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
authorsoup = self.make_soup(authordata)
# author can have several pages, scan until we find it.
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
nextarrow = authorsoup.find('a', {'class':'arrowf'})
@ -188,7 +184,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
logger.debug("**AUTHOR** nextpage URL: "+nextpage)
authordata = self._fetchUrl(nextpage)
descurl=nextpage
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
authorsoup = self.make_soup(authordata)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
@ -207,7 +203,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
infourl = 'http://'+self.host+ainfo['href']
logger.debug("**StoryInfo** URL: "+infourl)
infodata = self._fetchUrl(infourl)
infosoup = bs.BeautifulSoup(infodata, 'html5lib')
infosoup = self.make_soup(infodata)
# for a in infosoup.findAll('a',href=re.compile(r"^/Author-\d+")):
# self.story.addToList('authorId',a['href'].split('/')[1].split('-')[1])
@ -295,7 +291,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
soup = bs.BeautifulSoup(self._fetchUrl(url), 'html5lib')
soup = self.make_soup(self._fetchUrl(url))
div = soup.find('div', {'id' : 'storyinnerbody'})

View file

@ -26,8 +26,7 @@ import cookielib as cl
from functools import partial
import pickle
#from .. import BeautifulSoup as bs
import bs4 as bs
import bs4
from ..htmlcleanup import stripHTML
from ..htmlheuristics import replace_br_with_p
@ -470,7 +469,7 @@ class BaseSiteAdapter(Configurable):
if isinstance(svalue,basestring):
# bs4/html5lib add html, header and body tags, which
# we don't want.
svalue = bs.BeautifulSoup(svalue,"html5lib").body
svalue = bs4.BeautifulSoup(svalue,"html5lib").body
svalue.name='span'
self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
else:
@ -566,6 +565,13 @@ class BaseSiteAdapter(Configurable):
return retval
def make_soup(self,data):
    """Build a bs4 soup from *data* using the html5lib parser.

    Central convenience wrapper so updated adapters share one parser
    choice; older, non-updated adapters still call the bundled bs3
    library directly themselves.
    """
    # html5lib is the most lenient parser bs4 offers, which matters
    # for the frequently malformed HTML these sites serve.
    parser = 'html5lib'
    return bs4.BeautifulSoup(data, parser)
def cachedfetch(realfetch,cache,url):
if url in cache:
return cache[url]