Merge ffnet/fpcom adapters; make both use the URL given to pull the metadata/chapter list

This avoids missing the latest chapter when the servers are out of date.
2026-05-09 05:21:13 +02:00 · 2011-06-07 13:00:39 -05:00 · 2011-06-07 13:00:39 -05:00 · aa629c2c39
commit aa629c2c39
parent 520a295f88
2 changed files with 22 additions and 163 deletions
--- a/fanficdownloader/adapters/adapter_fanfictionnet.py
+++ b/fanficdownloader/adapters/adapter_fanfictionnet.py
@ -39,6 +39,13 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        self._setURL("http://"+self.getSiteDomain()\
                         +"/s/"+self.story.getMetadata('storyId')+"/1/")

+        # ffnet update emails have the latest chapter URL.
+        # Frequently, when they arrive, not all the servers have the
+        # latest chapter yet and going back to chapter 1 to pull the
+        # chapter list doesn't get the latest.  So save and use the
+        # original URL given to pull chapter list & metadata.
+        self.origurl = url
+
    @staticmethod
    def getSiteDomain():
        return 'www.fanfiction.net'
@ -58,7 +65,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        # fetch the chapter.  From that we will get almost all the
        # metadata and chapter list

-        url = self.url
+        url = self.origurl
        logging.debug("URL: "+url)
        
        # use BeautifulSoup HTML parser to make everything easier to find.
@ -67,12 +74,12 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
            soup = bs.BeautifulSoup(data)
        except urllib2.HTTPError, e:
            if e.code == 404:
-                raise exceptions.StoryDoesNotExist(self.url)
+                raise exceptions.StoryDoesNotExist(url)
            else:
                raise e
            
        if "Unable to locate story with id of " in data:
-            raise exceptions.StoryDoesNotExist(self.url)
+            raise exceptions.StoryDoesNotExist(url)
            
        # Find authorid and URL from... author url.
        a = soup.find('a', href=re.compile(r"^/u/\d+"))
@ -99,6 +106,10 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
        # var author = 'U n F a b u l o u s M e';

        for script in soup.findAll('script', src=None):
+            if not script:
+                continue
+            if not script.string:
+                continue
            if 'var storyid' in script.string:
                for line in script.string.split('\n'):
                    m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line)
@ -184,8 +195,9 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):

    def getChapterText(self, url):
        logging.debug('Getting chapter text from: %s' % url)
-        time.sleep(0.5) ## ffnet tends to fail more if hit too fast.
-                        ## This is in additional to what ever the
+        time.sleep(0.5) ## ffnet(and, I assume, fpcom) tends to fail
+                        ## more if hit too fast.  This is in
+                        ## addition to whatever the
                        ## slow_down_sleep_time setting is.
        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
--- a/fanficdownloader/adapters/adapter_fictionpresscom.py
+++ b/fanficdownloader/adapters/adapter_fictionpresscom.py
@ -26,18 +26,14 @@ import fanficdownloader.exceptions as exceptions

 from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate

-class FictionPressComSiteAdapter(BaseSiteAdapter):
+## They're from the same people and pretty much identical.
+from adapter_fanfictionnet import FanFictionNetSiteAdapter
+
+class FictionPressComSiteAdapter(FanFictionNetSiteAdapter):

    def __init__(self, config, url):
-        BaseSiteAdapter.__init__(self, config, url)
+        FanFictionNetSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fpcom')
-        
-        # get storyId from url--url validation guarantees second part is storyId
-        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
-
-        # normalized story URL.
-        self._setURL("http://"+self.getSiteDomain()\
-                         +"/s/"+self.story.getMetadata('storyId')+"/1/")

    @staticmethod
    def getSiteDomain():
@ -53,155 +49,6 @@ class FictionPressComSiteAdapter(BaseSiteAdapter):
    def getSiteURLPattern(self):
        return r"http://(www|m)?\.fictionpress\.com/s/\d+(/\d+)?(/|/[a-zA-Z0-9_-]+)?/?$"

-    def extractChapterUrlsAndMetadata(self):
-
-        # fetch the chapter.  From that we will get almost all the
-        # metadata and chapter list
-
-        url = self.url
-        logging.debug("URL: "+url)
-        
-        # use BeautifulSoup HTML parser to make everything easier to find.
-        try:
-            data = self._fetchUrl(url)
-            soup = bs.BeautifulSoup(data)
-        except urllib2.HTTPError, e:
-            if e.code == 404:
-                raise exceptions.StoryDoesNotExist(self.url)
-            else:
-                raise e
-            
-        if "Unable to locate story with id of " in data:
-            raise exceptions.StoryDoesNotExist(self.url)
-            
-        # Find authorid and URL from... author url.
-        a = soup.find('a', href=re.compile(r"^/u/\d+"))
-        self.story.setMetadata('authorId',a['href'].split('/')[2])
-        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
-        self.story.setMetadata('author',a.string)
-
-            
-        # start by finding a script towards the bottom that has a
-        # bunch of useful stuff in it.
-            
-        # var storyid = 6577076;
-        # var chapter = 1;
-        # var chapters = 17;
-        # var words = 42787;
-        # var userid = 2645830;
-        # var title = 'The+Invitation';
-        # var title_t = 'The Invitation';
-        # var summary = 'Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter\'s wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story.';
-        # var categoryid = 224;
-        # var cat_title = 'Harry Potter';
-        # var datep = '12-21-10';
-        # var dateu = '04-06-11';
-        # var author = 'U n F a b u l o u s M e';
-
-        for script in soup.findAll('script', src=None):
-            if not script:
-                continue
-            if not script.string:
-                continue
-            if 'var storyid' in script.string:
-                for line in script.string.split('\n'):
-                    m = re.match(r"^ +var ([^ ]+) = '?(.*?)'?;$",line)
-                    if m == None : continue
-                    var,value = m.groups()
-                    # remove javascript escaping from values.
-                    value = re.sub(r'\\(.)',r'\1',value)
-                    #print var,value
-                    if 'words' in var:
-                        self.story.setMetadata('numWords', value)
-                    if 'title_t' in var:
-                        self.story.setMetadata('title', value)
-                    if 'summary' in var:
-                        self.story.setMetadata('description', value)
-                    if 'datep' in var:
-                        self.story.setMetadata('datePublished',makeDate(value, '%m-%d-%y'))
-                    if 'dateu' in var:
-                        self.story.setMetadata('dateUpdated',makeDate(value, '%m-%d-%y'))
-                    if 'cat_title' in var:
-                        if "Crossover" in value:
-                            value = re.sub(r' Crossover$','',value)
-                            for c in value.split(' and '):
-                                self.story.addToList('category',c)
-                                # Screws up when the category itself
-                                # contains ' and '.  But that's rare
-                                # and the only alternative is to find
-                                # the 'Crossover' category URL and
-                                # parse that page to search for <a>
-                                # with href /crossovers/(name)/(num)/
-				# <a href="/crossovers/Harry_Potter/224/">Harry Potter</a>
-				# <a href="/crossovers/Naruto/1402/">Naruto</a>
-                        else:
-                            self.story.addToList('category',value)
-                break # for script in soup.findAll('script', src=None):
-            
-        # Find the chapter selector 
-        select = soup.find('select', { 'name' : 'chapter' } )
-    	 
-        if select is None:
-    	   # no selector found, so it's a one-chapter story.
-    	   self.chapterUrls.append((self.story.getMetadata('title'),url))
-        else:
-            allOptions = select.findAll('option')
-            for o in allOptions:
-                url = u'http://%s/s/%s/%s/' % ( self.getSiteDomain(),
-                                            self.story.getMetadata('storyId'),
-                                            o['value'])
-                # just in case there's tags, like <i> in chapter titles.
-                title = u"%s" % o
-                title = re.sub(r'<[^>]+>','',title)
-                self.chapterUrls.append((title,url))
-
-        self.story.setMetadata('numChapters',len(self.chapterUrls))
-
-        ## Pull some additional data from html.  Find Rating and look around it.
-
-        a = soup.find('a', href='http://www.fictionratings.com/')
-        self.story.setMetadata('rating',a.string)
-
-        # after Rating, the same bit of text containing id:123456 contains
-        # Complete--if completed.
-        if 'Complete' in a.findNext(text=re.compile(r'id:'+self.story.getMetadata('storyId'))):
-            self.story.setMetadata('status', 'Completed')
-        else:
-            self.story.setMetadata('status', 'In-Progress')
-
-        # Parse genre(s) from <meta name="description" content="..."
-        # <meta name="description" content="Chapter 1 of a Harry Potter  - Family/Friendship fanfiction. Dudley Dursley would be the first to say he lived a very normal life. But what happens when he gets invited to his cousin Harry Potter's wedding? Will Dudley get the courage to apologize for the torture he caused all those years ago? Harry/Ginny story..">
-        # <meta name="description" content="A Gundam Wing/AC and Gundam Seed  - Romance/Sci-Fi crossover fanfiction  with characters:  & Kira Y.. Story summary: One-Shoot dividido en dos partes. Kira va en camino a rescatar a Lacus, pero él no es el unico. Dos personajes de diferentes universos Gundams. SEED vs ZERO.">
-        # <meta name="description" content="Chapter 1 of a Alvin and the chipmunks and Alpha and Omega  crossover fanfiction  with characters: Alvin S. & Humphrey. You'll just have to read to find out... No Flames Plesae... and tell me what you want to see by PM'ing me....">
-        # genre is after first -, but before first 'fanfiction'.
-        m = re.match(r"^(?:Chapter \d+ of a|A) (?:.*?)  (?:- (?P<genres>.*?)) (?:crossover )?fanfiction",
-                     soup.find('meta',{'name':'description'})['content'])
-        if m != None:
-            genres=m.group('genres')
-            # Hurt/Comfort is one genre.
-            genres=re.sub('Hurt/Comfort','Hurt-Comfort',genres)
-            for g in genres.split('/'):
-                self.story.addToList('genre',g)
-        
-        return
-
-
-    def getChapterText(self, url):
-        logging.debug('Getting chapter text from: %s' % url)
-        time.sleep(0.5) ## ffnet(and, I assume, fpcom) tends to fail
-                        ## more if hit too fast.  This is in
-                        ## additional to what ever the
-                        ## slow_down_sleep_time setting is.
-        soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
-                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
-
-        div = soup.find('div', {'id' : 'storytext'})
-
-        if None == div:
-            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
-
-        return utf8FromSoup(div)
-
 def getClass():
    return FictionPressComSiteAdapter