From 75fb648ace60bb6d5623040f8ba9fbc679fa9a3a Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Fri, 12 Aug 2011 17:22:29 -0500 Subject: [PATCH] Annotate adapter_fanficcastletvnet.py for ease of copying for new sites. --- .../adapters/adapter_fanficcastletvnet.py | 111 ++++++++++++++---- 1 file changed, 88 insertions(+), 23 deletions(-) diff --git a/fanficdownloader/adapters/adapter_fanficcastletvnet.py b/fanficdownloader/adapters/adapter_fanficcastletvnet.py index 06e2fd30..96d5d1b2 100644 --- a/fanficdownloader/adapters/adapter_fanficcastletvnet.py +++ b/fanficdownloader/adapters/adapter_fanficcastletvnet.py @@ -18,7 +18,6 @@ import time import logging import re -import urllib import urllib2 import fanficdownloader.BeautifulSoup as bs @@ -27,17 +26,45 @@ import fanficdownloader.exceptions as exceptions from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate -class FanficCastleTVNetAdapter(BaseSiteAdapter): +# By virtue of being recent and requiring both is_adult and user/pass, +# adapter_fanficcastletvnet.py is the best choice for learning to +# write adapters--especially for sites that use the eFiction system. +# Most sites that have ".../viewstory.php?sid=123" in the story URL +# are eFiction. + +# For non-eFiction sites, it can be considerably more complex, but +# this is still a good starting point. + +# In general an 'adapter' needs to do these six things: + +# - 'Register' correctly with the downloader +# - Site Login (if needed) +# - 'Are you adult?' check (if needed--some do one, some the other, some both) +# - Grab the chapter list +# - Grab the story meta-data (some (non-eFiction) adapters have to get it from the author page) +# - Grab the chapter texts + +# Search for XXX comments--that's where things are most likely to need changing. + +# This function is called by the downloader in all adapter_*.py files +# in this dir to register the adapter class. So it needs to be +# updated to reflect the class below it. 
That, plus getSiteDomain() +# take care of 'Registering'. +def getClass(): + return FanficCastleTVNetAdapter # XXX + +# Class name has to be unique. Our convention is camel case the +# sitename with Adapter at the end. www is skipped. +class FanficCastleTVNetAdapter(BaseSiteAdapter): # XXX def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) - self.story.setMetadata('siteabbrev','csltv') + self.decode = ["Windows-1252", "utf8"] # 1252 is a superset of iso-8859-1. # Most sites that claim to be # iso-8859-1 (and some that claim to be # utf8) are really windows-1252. - self.story.addToList("category","Castle") self.username = "NoneGiven" # if left empty, site doesn't return any message at all. self.password = "" self.is_adult=False @@ -48,12 +75,23 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): # normalized story URL. self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId')) - self.dateformat = "%b %d, %Y" + + # Each adapter needs to have a unique site abbreviation. + self.story.setMetadata('siteabbrev','csltv') # XXX + # If all stories from the site fall into the same category, + # the site itself isn't likely to label them as such, so we + # do. + self.story.addToList("category","Castle") # XXX + + # The date format will vary from site to site. + # http://docs.python.org/library/datetime.html#strftime-strptime-behavior + self.dateformat = "%b %d, %Y" # XXX - @staticmethod + @staticmethod # must be @staticmethod, don't remove it. def getSiteDomain(): - return 'fanfic.castletv.net' + # The site domain. Does have www here, if it uses it. 
+ return 'fanfic.castletv.net' # XXX def getSiteExampleURLs(self): return "http://"+self.getSiteDomain()+"/viewstory.php?sid=1234" @@ -61,14 +99,15 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$" + ## Login seems to be reasonably standard across eFiction sites. def needToLoginCheck(self, data): if 'Registered Users Only' in data \ or 'There is no such account on our website' in data \ or "That password doesn't match the one in our database" in data: - return True + return True else: - return False - + return False + def performLogin(self, url): params = {} @@ -95,13 +134,20 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): else: return True + ## Getting the chapter list and the meta data, plus 'is adult' checking. def extractChapterUrlsAndMetadata(self): if self.is_adult or self.getConfig("is_adult"): - addurl = "&ageconsent=ok&warning=4" + # Weirdly, different sites use different warning numbers. + # If the title search below fails, there's a good chance + # you need a different number. print data at that point + # and see what the 'click here to continue' url says. + addurl = "&ageconsent=ok&warning=4" # XXX else: addurl="" - + + # index=1 makes sure we see the story chapter index. Some + # sites skip that for one-chapter stories. url = self.url+'&index=1'+addurl logging.debug("URL: "+url) @@ -118,7 +164,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): self.performLogin(url) data = self._fetchUrl(url) - if "Age Consent Required" in data: + # The actual text that is used to announce you need to be an + # adult varies from site to site. Again, print data before + # the title search to troubleshoot. + if "Age Consent Required" in data: # XXX raise exceptions.AdultCheckRequired(self.url) if "Access denied. This story has not been validated by the adminstrators of this site." 
in data: @@ -126,7 +175,10 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): # use BeautifulSoup HTML parser to make everything easier to find. soup = bs.BeautifulSoup(data) + # print data + # Now go hunting for all the meta data and the chapter list. + ## Title a = soup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$")) self.story.setMetadata('title',a.string) @@ -144,12 +196,16 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): self.story.setMetadata('numChapters',len(self.chapterUrls)) + # eFiction sites don't help us out a lot with their meta data + # formatting, so it's a little ugly. + + # utility method def defaultGetattr(d,k): try: return d[k] except: return "" - + # Rated: NC-17
etc labels = soup.findAll('span',{'class':'label'}) for labelspan in labels: @@ -176,13 +232,26 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): for cat in catstext: self.story.addToList('category',cat.string) + ## Not all sites use Genre, but there's no harm in + ## leaving it in. Check to make sure the type_id number + ## is correct, though--it's site specific. if 'Genre' in label: - genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class')) + genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX genrestext = [genre.string for genre in genres] self.genre = ', '.join(genrestext) for genre in genrestext: self.story.addToList('genre',genre.string) + ## Not all sites use Warnings, but there's no harm in + ## leaving it in. Check to make sure the type_id number + ## is correct, though--it's site specific. + if 'Warnings' in label: + warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX + warningstext = [warning.string for warning in warnings] + self.warning = ', '.join(warningstext) + for warning in warningstext: + self.story.addToList('warnings',warning.string) + if 'Completed' in label: if 'Yes' in value: self.story.setMetadata('status', 'Completed') @@ -197,7 +266,7 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): #value = value[0:-1] self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat)) - + # grab the text for an individual chapter. def getChapterText(self, url): logging.debug('Getting chapter text from: %s' % url) @@ -205,13 +274,9 @@ class FanficCastleTVNetAdapter(BaseSiteAdapter): soup = bs.BeautifulStoneSoup(self._fetchUrl(url), selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. - span = soup.find('div', {'id' : 'story'}) + div = soup.find('div', {'id' : 'story'}) - if None == span: + if None == div: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" 
% url) - return utf8FromSoup(span) - -def getClass(): - return FanficCastleTVNetAdapter - + return utf8FromSoup(div)