mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Annotate adapter_fanficcastletvnet.py for ease of copying for new sites.
This commit is contained in:
parent
e4ad7edb48
commit
75fb648ace
1 changed files with 88 additions and 23 deletions
|
|
@ -18,7 +18,6 @@
|
|||
import time
|
||||
import logging
|
||||
import re
|
||||
import urllib
|
||||
import urllib2
|
||||
|
||||
import fanficdownloader.BeautifulSoup as bs
|
||||
|
|
@ -27,17 +26,45 @@ import fanficdownloader.exceptions as exceptions
|
|||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
|
||||
class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
||||
# By virtue of being recent and requiring both is_adult and user/pass,
|
||||
# adapter_fanficcastletvnet.py is the best choice for learning to
|
||||
# write adapters--especially for sites that use the eFiction system.
|
||||
# Most sites that have ".../viewstory.php?sid=123" in the story URL
|
||||
# are eFiction.
|
||||
|
||||
# For non-eFiction sites, it can be considerably more complex, but
|
||||
# this is still a good starting point.
|
||||
|
||||
# In general an 'adapter' needs to do these six things:
|
||||
|
||||
# - 'Register' correctly with the downloader
|
||||
# - Site Login (if needed)
|
||||
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
|
||||
# - Grab the chapter list
|
||||
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
|
||||
# - Grab the chapter texts
|
||||
|
||||
# Search for XXX comments--that's where things are most likely to need changing.
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
# updated to reflect the class below it. That, plus getSiteDomain()
|
||||
# take care of 'Registering'.
|
||||
def getClass():
|
||||
return FanficCastleTVNetAdapter # XXX
|
||||
|
||||
# Class name has to be unique.  Our convention is to camel-case the
|
||||
# sitename with Adapter at the end. www is skipped.
|
||||
class FanficCastleTVNetAdapter(BaseSiteAdapter): # XXX
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
self.story.setMetadata('siteabbrev','csltv')
|
||||
|
||||
self.decode = ["Windows-1252",
|
||||
"utf8"] # 1252 is a superset of iso-8859-1.
|
||||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
self.story.addToList("category","Castle")
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
self.is_adult=False
|
||||
|
|
@ -48,12 +75,23 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
|
||||
# normalized story URL.
|
||||
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
|
||||
self.dateformat = "%b %d, %Y"
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','csltv') # XXX
|
||||
|
||||
# If all stories from the site fall into the same category,
|
||||
# the site itself isn't likely to label them as such, so we
|
||||
# do.
|
||||
self.story.addToList("category","Castle") # XXX
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b %d, %Y" # XXX
|
||||
|
||||
@staticmethod
|
||||
@staticmethod # must be @staticmethod, don't remove it.
|
||||
def getSiteDomain():
|
||||
return 'fanfic.castletv.net'
|
||||
# The site domain.  Include 'www' here if the site uses it.
|
||||
return 'fanfic.castletv.net' # XXX
|
||||
|
||||
def getSiteExampleURLs(self):
|
||||
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
|
||||
|
|
@ -61,14 +99,15 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
def getSiteURLPattern(self):
|
||||
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
|
||||
|
||||
## Login seems to be reasonably standard across eFiction sites.
|
||||
def needToLoginCheck(self, data):
|
||||
if 'Registered Users Only' in data \
|
||||
or 'There is no such account on our website' in data \
|
||||
or "That password doesn't match the one in our database" in data:
|
||||
return True
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
return False
|
||||
|
||||
def performLogin(self, url):
|
||||
params = {}
|
||||
|
||||
|
|
@ -95,13 +134,20 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
else:
|
||||
return True
|
||||
|
||||
## Getting the chapter list and the meta data, plus 'is adult' checking.
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
if self.is_adult or self.getConfig("is_adult"):
|
||||
addurl = "&ageconsent=ok&warning=4"
|
||||
# Weirdly, different sites use different warning numbers.
|
||||
# If the title search below fails, there's a good chance
|
||||
# you need a different number.  Print data at that point
|
||||
# and see what the 'click here to continue' url says.
|
||||
addurl = "&ageconsent=ok&warning=4" # XXX
|
||||
else:
|
||||
addurl=""
|
||||
|
||||
|
||||
# index=1 makes sure we see the story chapter index. Some
|
||||
# sites skip that for one-chapter stories.
|
||||
url = self.url+'&index=1'+addurl
|
||||
logging.debug("URL: "+url)
|
||||
|
||||
|
|
@ -118,7 +164,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
self.performLogin(url)
|
||||
data = self._fetchUrl(url)
|
||||
|
||||
if "Age Consent Required" in data:
|
||||
# The actual text that is used to announce you need to be an
|
||||
# adult varies from site to site. Again, print data before
|
||||
# the title search to troubleshoot.
|
||||
if "Age Consent Required" in data: # XXX
|
||||
raise exceptions.AdultCheckRequired(self.url)
|
||||
|
||||
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
|
||||
|
|
@ -126,7 +175,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = bs.BeautifulSoup(data)
|
||||
# print data
|
||||
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
## Title
|
||||
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
|
||||
self.story.setMetadata('title',a.string)
|
||||
|
|
@ -144,12 +196,16 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
# eFiction sites don't help us out a lot with their meta data
|
||||
# formatting, so it's a little ugly.
|
||||
|
||||
# utility method
|
||||
def defaultGetattr(d,k):
|
||||
try:
|
||||
return d[k]
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# <span class="label">Rated:</span> NC-17<br /> etc
|
||||
labels = soup.findAll('span',{'class':'label'})
|
||||
for labelspan in labels:
|
||||
|
|
@ -176,13 +232,26 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
for cat in catstext:
|
||||
self.story.addToList('category',cat.string)
|
||||
|
||||
## Not all sites use Genre, but there's no harm in
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Genre' in label:
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
|
||||
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
genrestext = [genre.string for genre in genres]
|
||||
self.genre = ', '.join(genrestext)
|
||||
for genre in genrestext:
|
||||
self.story.addToList('genre',genre.string)
|
||||
|
||||
## Not all sites use Warnings, but there's no harm in
|
||||
## leaving it in. Check to make sure the type_id number
|
||||
## is correct, though--it's site specific.
|
||||
if 'Warnings' in label:
|
||||
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
|
||||
warningstext = [warning.string for warning in warnings]
|
||||
self.warning = ', '.join(warningstext)
|
||||
for warning in warningstext:
|
||||
self.story.addToList('warnings',warning.string)
|
||||
|
||||
if 'Completed' in label:
|
||||
if 'Yes' in value:
|
||||
self.story.setMetadata('status', 'Completed')
|
||||
|
|
@ -197,7 +266,7 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
#value = value[0:-1]
|
||||
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
||||
logging.debug('Getting chapter text from: %s' % url)
|
||||
|
|
@ -205,13 +274,9 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
|
|||
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
|
||||
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
|
||||
span = soup.find('div', {'id' : 'story'})
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == span:
|
||||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
|
||||
def getClass():
|
||||
return FanficCastleTVNetAdapter
|
||||
|
||||
return utf8FromSoup(div)
|
||||
|
|
|
|||
Loading…
Reference in a new issue