From aa629c2c39018848e8105282fda1974503a53920 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Tue, 7 Jun 2011 13:00:39 -0500 Subject: [PATCH] Merge ffnet/fpcom, make both use URL given to pull meta/chap list to avoid missing latest chapter due to out of date servers. --- .../adapters/adapter_fanfictionnet.py | 22 ++- .../adapters/adapter_fictionpresscom.py | 163 +----------------- 2 files changed, 22 insertions(+), 163 deletions(-) diff --git a/fanficdownloader/adapters/adapter_fanfictionnet.py b/fanficdownloader/adapters/adapter_fanfictionnet.py index 983a2483..3a8704b4 100644 --- a/fanficdownloader/adapters/adapter_fanfictionnet.py +++ b/fanficdownloader/adapters/adapter_fanfictionnet.py @@ -39,6 +39,13 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): self._setURL("http://"+self.getSiteDomain()\ +"/s/"+self.story.getMetadata('storyId')+"/1/") + # ffnet update emails have the latest chapter URL. + # Frequently, when they arrive, not all the servers have the + # latest chapter yet and going back to chapter 1 to pull the + # chapter list doesn't get the latest. So save and use the + # original URL given to pull chapter list & metadata. + self.origurl = url + @staticmethod def getSiteDomain(): return 'www.fanfiction.net' @@ -58,7 +65,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): # fetch the chapter. From that we will get almost all the # metadata and chapter list - url = self.url + url = self.origurl logging.debug("URL: "+url) # use BeautifulSoup HTML parser to make everything easier to find. @@ -67,12 +74,12 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): soup = bs.BeautifulSoup(data) except urllib2.HTTPError, e: if e.code == 404: - raise exceptions.StoryDoesNotExist(self.url) + raise exceptions.StoryDoesNotExist(url) else: raise e if "Unable to locate story with id of " in data: - raise exceptions.StoryDoesNotExist(self.url) + raise exceptions.StoryDoesNotExist(url) # Find authorid and URL from... author url. 
a = soup.find('a', href=re.compile(r"^/u/\d+")) @@ -99,6 +106,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): # var author = 'U n F a b u l o u s M e'; for script in soup.findAll('script', src=None): + if not script: + continue + if not script.string: + continue if 'var storyid' in script.string: for line in script.string.split('\n'): m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line) @@ -184,8 +195,9 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter): def getChapterText(self, url): logging.debug('Getting chapter text from: %s' % url) - time.sleep(0.5) ## ffnet tends to fail more if hit too fast. - ## This is in additional to what ever the + time.sleep(0.5) ## ffnet(and, I assume, fpcom) tends to fail + ## more if hit too fast. This is in + ## additional to what ever the ## slow_down_sleep_time setting is. soup = bs.BeautifulStoneSoup(self._fetchUrl(url), selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. diff --git a/fanficdownloader/adapters/adapter_fictionpresscom.py b/fanficdownloader/adapters/adapter_fictionpresscom.py index 45b1d0c4..51be29e0 100644 --- a/fanficdownloader/adapters/adapter_fictionpresscom.py +++ b/fanficdownloader/adapters/adapter_fictionpresscom.py @@ -26,18 +26,14 @@ import fanficdownloader.exceptions as exceptions from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate -class FictionPressComSiteAdapter(BaseSiteAdapter): +## They're from the same people and pretty much identical. +from adapter_fanfictionnet import FanFictionNetSiteAdapter + +class FictionPressComSiteAdapter(FanFictionNetSiteAdapter): def __init__(self, config, url): - BaseSiteAdapter.__init__(self, config, url) + FanFictionNetSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev','fpcom') - - # get storyId from url--url validation guarantees second part is storyId - self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) - - # normalized story URL. 
- self._setURL("http://"+self.getSiteDomain()\ - +"/s/"+self.story.getMetadata('storyId')+"/1/") @staticmethod def getSiteDomain(): @@ -53,155 +49,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return r"http://(www|m)?\.fictionpress\.com/s/\d+(/\d+)?(/|/[a-zA-Z0-9_-]+)?/?$" - def extractChapterUrlsAndMetadata(self): - - # fetch the chapter. From that we will get almost all the - # metadata and chapter list - - url = self.url - logging.debug("URL: "+url) - - # use BeautifulSoup HTML parser to make everything easier to find. - try: - data = self._fetchUrl(url) - soup = bs.BeautifulSoup(data) - except urllib2.HTTPError, e: - if e.code == 404: - raise exceptions.StoryDoesNotExist(self.url) - else: - raise e - - if "Unable to locate story with id of " in data: - raise exceptions.StoryDoesNotExist(self.url) - - # Find authorid and URL from... author url. - a = soup.find('a', href=re.compile(r"^/u/\d+")) - self.story.setMetadata('authorId',a['href'].split('/')[2]) - self.story.setMetadata('authorUrl','http://'+self.host+a['href']) - self.story.setMetadata('author',a.string) - - - # start by finding a script towards the bottom that has a - # bunch of useful stuff in it. - - # var storyid = 6577076; - # var chapter = 1; - # var chapters = 17; - # var words = 42787; - # var userid = 2645830; - # var title = 'The+Invitation'; - # var title_t = 'The Invitation'; - # var summary = 'Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter\'s wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? 
Harry/Ginny story.'; - # var categoryid = 224; - # var cat_title = 'Harry Potter'; - # var datep = '12-21-10'; - # var dateu = '04-06-11'; - # var author = 'U n F a b u l o u s M e'; - - for script in soup.findAll('script', src=None): - if not script: - continue - if not script.string: - continue - if 'var storyid' in script.string: - for line in script.string.split('\n'): - m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line) - if m == None : continue - var,value = m.groups() - # remove javascript escaping from values. - value = re.sub(r'\\(.)',r'\1',value) - #print var,value - if 'words' in var: - self.story.setMetadata('numWords', value) - if 'title_t' in var: - self.story.setMetadata('title', value) - if 'summary' in var: - self.story.setMetadata('description', value) - if 'datep' in var: - self.story.setMetadata('datePublished',makeDate(value, '%m-%d-%y')) - if 'dateu' in var: - self.story.setMetadata('dateUpdated',makeDate(value, '%m-%d-%y')) - if 'cat_title' in var: - if "Crossover" in value: - value = re.sub(r' Crossover$','',value) - for c in value.split(' and '): - self.story.addToList('category',c) - # Screws up when the category itself - # contains ' and '. But that's rare - # and the only alternative is to find - # the 'Crossover' category URL and - # parse that page to search for - # with href /crossovers/(name)/(num)/ - # Harry Potter - # Naruto - else: - self.story.addToList('category',value) - break # for script in soup.findAll('script', src=None): - - # Find the chapter selector - select = soup.find('select', { 'name' : 'chapter' } ) - - if select is None: - # no selector found, so it's a one-chapter story. - self.chapterUrls.append((self.story.getMetadata('title'),url)) - else: - allOptions = select.findAll('option') - for o in allOptions: - url = u'http://%s/s/%s/%s/' % ( self.getSiteDomain(), - self.story.getMetadata('storyId'), - o['value']) - # just in case there's tags, like in chapter titles. 
- title = u"%s" % o - title = re.sub(r'<[^>]+>','',title) - self.chapterUrls.append((title,url)) - - self.story.setMetadata('numChapters',len(self.chapterUrls)) - - ## Pull some additional data from html. Find Rating and look around it. - - a = soup.find('a', href='http://www.fictionratings.com/') - self.story.setMetadata('rating',a.string) - - # after Rating, the same bit of text containing id:123456 contains - # Complete--if completed. - if 'Complete' in a.findNext(text=re.compile(r'id:'+self.story.getMetadata('storyId'))): - self.story.setMetadata('status', 'Completed') - else: - self.story.setMetadata('status', 'In-Progress') - - # Parse genre(s) from - # - # - # genre is after first -, but before first 'fanfiction'. - m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?) (?:- (?P<genres>.*?)) (?:crossover )?fanfiction", - soup.find('meta',{'name':'description'})['content']) - if m != None: - genres=m.group('genres') - # Hurt/Comfort is one genre. - genres=re.sub('Hurt/Comfort','Hurt-Comfort',genres) - for g in genres.split('/'): - self.story.addToList('genre',g) - - return - - - def getChapterText(self, url): - logging.debug('Getting chapter text from: %s' % url) - time.sleep(0.5) ## ffnet(and, I assume, fpcom) tends to fail - ## more if hit too fast. This is in - ## additional to what ever the - ## slow_down_sleep_time setting is. - soup = bs.BeautifulStoneSoup(self._fetchUrl(url), - selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags. - - div = soup.find('div', {'id' : 'storytext'}) - - if None == div: - raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) - - return utf8FromSoup(div) - def getClass(): return FictionPressComSiteAdapter