mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-09 05:21:13 +02:00
Add make_soup() to base_adapter for bs4 soups.
This commit is contained in:
parent
1f01481e47
commit
16213b6309
6 changed files with 30 additions and 37 deletions
|
|
@ -21,9 +21,6 @@ logger = logging.getLogger(__name__)
|
|||
import re
|
||||
import urllib2
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
|
|
@ -168,10 +165,10 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
|||
meta = self._fetchUrl(metaurl,usecache=False)
|
||||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
soup = self.make_soup(data)
|
||||
for tag in soup.findAll('div',id='admin-banner'):
|
||||
tag.extract()
|
||||
metasoup = bs.BeautifulSoup(meta)
|
||||
metasoup = self.make_soup(meta)
|
||||
for tag in metasoup.findAll('div',id='admin-banner'):
|
||||
tag.extract()
|
||||
|
||||
|
|
@ -334,9 +331,9 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
|||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
chapter=bs.BeautifulSoup('<div class="story"></div>').find('div')
|
||||
chapter=self.make_soup('<div class="story"></div>').find('div')
|
||||
data = self._fetchUrl(url)
|
||||
soup = bs.BeautifulSoup(data)
|
||||
soup = self.make_soup(data)
|
||||
|
||||
exclude_notes=self.getConfigList('exclude_notes')
|
||||
|
||||
|
|
|
|||
|
|
@ -24,8 +24,6 @@ import urllib2
|
|||
from urllib import unquote_plus
|
||||
import time
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
|
|
@ -105,7 +103,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
try:
|
||||
data = self._fetchUrl(url)
|
||||
#logger.debug("\n===================\n%s\n===================\n"%data)
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
soup = self.make_soup(data)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
|
@ -140,7 +138,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
if "not found. Please check to see you are not using an outdated url." \
|
||||
not in newdata:
|
||||
logger.debug('=======Found newer chapter: %s' % tryurl)
|
||||
soup = bs.BeautifulSoup(newdata, "html5lib")
|
||||
soup = self.make_soup(newdata)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
@ -167,7 +165,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
self.story.addToList('category',stripHTML(categories[1]))
|
||||
elif 'Crossover' in categories[0]['href']:
|
||||
caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
|
||||
catsoup = bs.BeautifulSoup(self._fetchUrl(caturl), "html5lib")
|
||||
catsoup = self.make_soup(self._fetchUrl(caturl))
|
||||
for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
|
||||
self.story.addToList('category',stripHTML(a))
|
||||
else:
|
||||
|
|
@ -322,7 +320,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
# data = data[data.index(divstr):]
|
||||
# data = data.replace("<body","<notbody").replace("<BODY","<NOTBODY")
|
||||
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
soup = self.make_soup(data)
|
||||
|
||||
## Remove the 'share' button.
|
||||
## No longer appears in the story text.
|
||||
|
|
|
|||
|
|
@ -23,9 +23,6 @@ import urllib2
|
|||
import time
|
||||
import httplib, urllib
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
|
||||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
|
|
@ -98,7 +95,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
# non-existent/removed story urls get thrown to the front page.
|
||||
if "<h2>Welcome to FicWad</h2>" in data:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
soup = bs.BeautifulSoup(data, "html5lib")
|
||||
soup = self.make_soup(data)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
|
|
@ -120,7 +117,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
logger.debug("Normalizing to URL: "+url)
|
||||
self._setURL(url)
|
||||
try:
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
|
||||
soup = self.make_soup(self._fetchUrl(url))
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(self.url)
|
||||
|
|
@ -162,8 +159,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
## perhaps not the most efficient way to parse this, using
|
||||
## regexps for each rather than something more complex, but
|
||||
## IMO, it's more readable and amenable to change.
|
||||
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ')
|
||||
#print "metap: (%s)"%metastr
|
||||
metastr = stripHTML(str(metap)).replace('\n',' ').replace('\t',' ').replace(u'\u00a0',' ')
|
||||
print "metastr: (%s)"%metastr
|
||||
|
||||
m = re.match(r".*?Rating: (.+?) -.*?",metastr)
|
||||
if m:
|
||||
|
|
@ -223,7 +220,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url), "html5lib")
|
||||
soup = self.make_soup(self._fetchUrl(url))
|
||||
|
||||
span = soup.find('div', {'id' : 'storytext'})
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,6 @@ import time
|
|||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import bs4 as bs
|
||||
from .. import exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
|
@ -360,7 +359,7 @@ horizontal rules
|
|||
<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
|
||||
</div>
|
||||
'''
|
||||
soup = bs.BeautifulSoup(text,'html5lib')
|
||||
soup = self.make_soup(text)
|
||||
return self.utf8FromSoup(url,soup)
|
||||
|
||||
def getClass():
|
||||
|
|
|
|||
|
|
@ -22,10 +22,6 @@ import re
|
|||
import urllib2
|
||||
import time
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
|
||||
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
|
|
@ -103,7 +99,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
# <input type='text' id='urealname' name='urealname' value=''/>
|
||||
# <input type='password' id='password' name='6bb3fcd148d148629223690bf19733b8'/>
|
||||
# <input type='submit' value='Login' name='loginsubmit'/>
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(loginUrl), 'html5lib')
|
||||
soup = self.make_soup(self._fetchUrl(loginUrl))
|
||||
params['ctkn']=soup.find('input', {'name':'ctkn'})['value']
|
||||
params[soup.find('input', {'id':'password'})['name']] = params['password']
|
||||
|
||||
|
|
@ -133,7 +129,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
try:
|
||||
data = self._fetchUrl(url)
|
||||
#print("data:%s"%data)
|
||||
soup = bs.BeautifulSoup(data, 'html5lib')
|
||||
soup = self.make_soup(data)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
|
@ -156,7 +152,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
# refetch story page.
|
||||
## XXX - needs cache invalidate? Or at least check that this needs doing...
|
||||
data = self._fetchUrl(url,usecache=False)
|
||||
soup = bs.BeautifulSoup(data, 'html5lib')
|
||||
soup = self.make_soup(data)
|
||||
|
||||
if "NOTE: This story is rated FR21 which is above your chosen filter level." in data:
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
|
@ -174,7 +170,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
logger.debug("**AUTHOR** URL: "+authorurl)
|
||||
authordata = self._fetchUrl(authorurl)
|
||||
descurl=authorurl
|
||||
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
|
||||
authorsoup = self.make_soup(authordata)
|
||||
# author can have several pages, scan until we find it.
|
||||
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
|
||||
nextarrow = authorsoup.find('a', {'class':'arrowf'})
|
||||
|
|
@ -188,7 +184,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
logger.debug("**AUTHOR** nextpage URL: "+nextpage)
|
||||
authordata = self._fetchUrl(nextpage)
|
||||
descurl=nextpage
|
||||
authorsoup = bs.BeautifulSoup(authordata, 'html5lib')
|
||||
authorsoup = self.make_soup(authordata)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
|
@ -207,7 +203,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
infourl = 'http://'+self.host+ainfo['href']
|
||||
logger.debug("**StoryInfo** URL: "+infourl)
|
||||
infodata = self._fetchUrl(infourl)
|
||||
infosoup = bs.BeautifulSoup(infodata, 'html5lib')
|
||||
infosoup = self.make_soup(infodata)
|
||||
|
||||
# for a in infosoup.findAll('a',href=re.compile(r"^/Author-\d+")):
|
||||
# self.story.addToList('authorId',a['href'].split('/')[1].split('-')[1])
|
||||
|
|
@ -295,7 +291,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url), 'html5lib')
|
||||
soup = self.make_soup(self._fetchUrl(url))
|
||||
|
||||
div = soup.find('div', {'id' : 'storyinnerbody'})
|
||||
|
||||
|
|
|
|||
|
|
@ -26,8 +26,7 @@ import cookielib as cl
|
|||
from functools import partial
|
||||
import pickle
|
||||
|
||||
#from .. import BeautifulSoup as bs
|
||||
import bs4 as bs
|
||||
import bs4
|
||||
|
||||
from ..htmlcleanup import stripHTML
|
||||
from ..htmlheuristics import replace_br_with_p
|
||||
|
|
@ -470,7 +469,7 @@ class BaseSiteAdapter(Configurable):
|
|||
if isinstance(svalue,basestring):
|
||||
# bs4/html5lib add html, header and body tags, which
|
||||
# we don't want.
|
||||
svalue = bs.BeautifulSoup(svalue,"html5lib").body
|
||||
svalue = bs4.BeautifulSoup(svalue,"html5lib").body
|
||||
svalue.name='span'
|
||||
self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
|
||||
else:
|
||||
|
|
@ -566,6 +565,13 @@ class BaseSiteAdapter(Configurable):
|
|||
|
||||
return retval
|
||||
|
||||
def make_soup(self,data):
    '''
    Build and return a bs4 soup for data, parsed with html5lib.

    Shared helper for adapters that use bs4.  Older and non-updated
    adapters call the included bs3 library themselves instead.
    '''
    soup = bs4.BeautifulSoup(data,'html5lib')
    return soup
|
||||
|
||||
def cachedfetch(realfetch,cache,url):
|
||||
if url in cache:
|
||||
return cache[url]
|
||||
|
|
|
|||
Loading…
Reference in a new issue