Annotate adapter_fanficcastletvnet.py for ease of copying for new sites.

This commit is contained in:
Jim Miller 2011-08-12 17:22:29 -05:00
parent e4ad7edb48
commit 75fb648ace

View file

@ -18,7 +18,6 @@
import time
import logging
import re
import urllib
import urllib2
import fanficdownloader.BeautifulSoup as bs
@ -27,17 +26,45 @@ import fanficdownloader.exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
class FanficCastleTVNetAdapter(BaseSiteAdapter):
# By virtue of being recent and requiring both is_adult and user/pass,
# adapter_fanficcastletvnet.py is the best choice for learning to
# write adapters--especially for sites that use the eFiction system.
# Most sites that have ".../viewstory.php?sid=123" in the story URL
# are eFiction.
# For non-eFiction sites, it can be considerably more complex, but
# this is still a good starting point.
# In general an 'adapter' needs to do these six things:
# - 'Register' correctly with the downloader
# - Site Login (if needed)
# - 'Are you adult?' check (if needed--some do one, some the other, some both)
# - Grab the chapter list
# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page)
# - Grab the chapter texts
# Search for XXX comments--that's where things are most likely to need changing.
# The downloader calls getClass() in every adapter_*.py file in this
# dir to register the adapter class, so it must be updated to return
# the class defined below it. Together with getSiteDomain(), this
# takes care of 'Registering'.
def getClass():
return FanficCastleTVNetAdapter # XXX
# Class name has to be unique. Our convention is to camel-case the
# site name and put Adapter at the end. A leading www is skipped.
class FanficCastleTVNetAdapter(BaseSiteAdapter): # XXX
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','csltv')
self.decode = ["Windows-1252",
"utf8"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.story.addToList("category","Castle")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
@ -48,12 +75,23 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self.dateformat = "%b %d, %Y"
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','csltv') # XXX
# If all stories from the site fall into the same category,
# the site itself isn't likely to label them as such, so we
# do.
self.story.addToList("category","Castle") # XXX
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%b %d, %Y" # XXX
@staticmethod
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
return 'fanfic.castletv.net'
# The site domain. Include the www prefix here if the site uses it.
return 'fanfic.castletv.net' # XXX
# Returns a single sample story URL string for this site, built from
# getSiteDomain() and a placeholder story id (1234).
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234"
@ -61,14 +99,15 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
# Returns a regex pattern string: the re.escape()'d literal prefix
# "http://<site domain>/viewstory.php?sid=" followed by one or more
# digits and an end-of-string anchor.
# NOTE(review): presumably the downloader uses this to recognize story
# URLs that belong to this adapter -- confirm against the caller.
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
if 'Registered Users Only' in data \
or 'There is no such account on our website' in data \
or "That password doesn't match the one in our database" in data:
return True
return True
else:
return False
return False
def performLogin(self, url):
params = {}
@ -95,13 +134,20 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
else:
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
if self.is_adult or self.getConfig("is_adult"):
addurl = "&ageconsent=ok&warning=4"
# Weirdly, different sites use different warning numbers.
# If the title search below fails, there's a good chance
# you need a different number. print data at that point
# and see what the 'click here to continue' url says.
addurl = "&ageconsent=ok&warning=4" # XXX
else:
addurl=""
# index=1 makes sure we see the story chapter index. Some
# sites skip that for one-chapter stories.
url = self.url+'&index=1'+addurl
logging.debug("URL: "+url)
@ -118,7 +164,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
self.performLogin(url)
data = self._fetchUrl(url)
if "Age Consent Required" in data:
# The actual text that is used to announce you need to be an
# adult varies from site to site. Again, print data before
# the title search to troubleshoot.
if "Age Consent Required" in data: # XXX
raise exceptions.AdultCheckRequired(self.url)
if "Access denied. This story has not been validated by the adminstrators of this site." in data:
@ -126,7 +175,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
# use BeautifulSoup HTML parser to make everything easier to find.
soup = bs.BeautifulSoup(data)
# print data
# Now go hunting for all the meta data and the chapter list.
## Title
a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
self.story.setMetadata('title',a.string)
@ -144,12 +196,16 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
self.story.setMetadata('numChapters',len(self.chapterUrls))
# eFiction sites don't help us out a lot with their meta data
# formatting, so it's a little ugly.
# Utility method: return d[k], or "" when the lookup fails for any
# ordinary reason (missing key, bad index, d is None, ...).
def defaultGetattr(d,k):
    try:
        return d[k]
    except Exception:
        # Deliberately broad best-effort fallback, but unlike a bare
        # 'except:' this lets KeyboardInterrupt and SystemExit propagate.
        return ""
# <span class="label">Rated:</span> NC-17<br /> etc
labels = soup.findAll('span',{'class':'label'})
for labelspan in labels:
@ -176,13 +232,26 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
for cat in catstext:
self.story.addToList('category',cat.string)
## Not all sites use Genre, but there's no harm in
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Genre' in label:
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class'))
genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
genrestext = [genre.string for genre in genres]
self.genre = ', '.join(genrestext)
for genre in genrestext:
self.story.addToList('genre',genre.string)
## Not all sites use Warnings, but there's no harm in
## leaving it in. Check to make sure the type_id number
## is correct, though--it's site specific.
if 'Warnings' in label:
warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
warningstext = [warning.string for warning in warnings]
self.warning = ', '.join(warningstext)
for warning in warningstext:
self.story.addToList('warnings',warning.string)
if 'Completed' in label:
if 'Yes' in value:
self.story.setMetadata('status', 'Completed')
@ -197,7 +266,7 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
#value = value[0:-1]
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
# grab the text for an individual chapter.
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
@ -205,13 +274,9 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter):
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'story'})
div = soup.find('div', {'id' : 'story'})
if None == span:
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
def getClass():
return FanficCastleTVNetAdapter
return utf8FromSoup(div)