Add site storiesonline.net.

2026-05-09 05:21:13 +02:00 · 2013-10-29 23:20:05 -05:00 · 2013-10-29 23:20:05 -05:00 · cbc02b749b
commit cbc02b749b
parent 8761b766ca
4 changed files with 342 additions and 0 deletions
--- a/defaults.ini
+++ b/defaults.ini
@ -793,6 +793,24 @@ dislikes_label:Dislikes
 #username:YourName
 #password:yourpassword

+[storiesonline.net]
+## Some sites require login (or login for some rated stories).  The
+## program can prompt you, or you can save it in config.  In
+## commandline version, this should go in your personal.ini, not
+## defaults.ini.
+#username:YourName
+#password:yourpassword
+
+## Clear FanFiction from defaults, site is original fiction.
+extratags:
+
+extra_valid_entries:size,universe,codes
+#extra_titlepage_entries:size,universe,codes
+
+size_label:Size
+universe_label:Universe
+codes_label:Codes
+
 [grangerenchanted.com]
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter
--- a/fanficdownloader/adapters/init.py
+++ b/fanficdownloader/adapters/init.py
@ -120,6 +120,7 @@ import adapter_potterheadsanonymouscom
 import adapter_simplyundeniablecom
 import adapter_scarheadnet
 import adapter_fictionpadcom
+import adapter_storiesonlinenet

 ## This bit of complexity allows adapters to be added by just adding
 ## importing.  It eliminates the long if/else clauses we used to need
--- a/fanficdownloader/adapters/adapter_storiesonlinenet.py
+++ b/fanficdownloader/adapters/adapter_storiesonlinenet.py
@ -0,0 +1,305 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2013 Fanficdownloader team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import time
+import logging
+logger = logging.getLogger(__name__)
+import re
+import urllib2
+
+from .. import BeautifulSoup as bs
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+
+from base_adapter import BaseSiteAdapter,  makeDate
+
+def getClass():
+    """Return the adapter class defined in this module."""
+    return StoriesOnlineNetAdapter
+
+# Class name has to be unique.  Our convention is camel case the
+# sitename with Adapter at the end.  www is skipped.
+class StoriesOnlineNetAdapter(BaseSiteAdapter):
+
+    def __init__(self, config, url):
+        """Set up adapter state and derive the story id from ``url``.
+
+        The story id is the element at index 2 of the '/'-split URL
+        path, with anything from the first ':' onward dropped.  When
+        that value contains 'storyInfo', the id is instead the piece of
+        the query string that follows the first '='.  The story URL is
+        then normalized to http://<site domain>/s/<storyId>.
+        """
+        BaseSiteAdapter.__init__(self, config, url)
+
+        # Encodings to try when decoding pages.  1252 is a superset of
+        # iso-8859-1; most sites that claim to be iso-8859-1 (and some
+        # that claim to be utf8) are really windows-1252.
+        self.decode = ["Windows-1252", "utf8"]
+
+        # If the username is left empty, the site doesn't return any
+        # message at all, so start with a placeholder.
+        self.username = "NoneGiven"
+        self.password = ""
+        self.is_adult = False
+
+        # get storyId from url
+        path_pieces = self.parsedUrl.path.split('/')
+        self.story.setMetadata('storyId', path_pieces[2].split(':')[0])
+        if 'storyInfo' in self.story.getMetadata('storyId'):
+            query_pieces = self.parsedUrl.query.split('=')
+            self.story.setMetadata('storyId', query_pieces[1])
+
+        # normalized story URL.
+        story_id = self.story.getMetadata('storyId')
+        self._setURL('http://' + self.getSiteDomain() + '/s/' + story_id)
+
+        # Each adapter needs to have a unique site abbreviation.
+        self.story.setMetadata('siteabbrev', 'strol')
+
+        # Date format used when parsing this site's dates.
+        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+        self.dateformat = "%Y-%m-%d"
+            
+    @staticmethod # must be @staticmethod, don't remove it.
+    def getSiteDomain():
+        """Return the domain name of the site this adapter handles."""
+        # The site domain.  It would include www here if the site used it.
+        return 'storiesonline.net'
+
+    @classmethod
+    def getSiteExampleURLs(self):
+        """Return two sample story URLs separated by a single space.
+
+        The first has the plain /s/<id> form, the second the
+        /s/<id>:<number> form.
+        """
+        plain_example = "http://" + self.getSiteDomain() + "/s/1234"
+        return plain_example + " " + plain_example + ":4010"
+
+    def getSiteURLPattern(self):
+        """Return the regular expression source for story URLs of this site.
+
+        The pattern is the regex-escaped "http://<site domain>" followed
+        by /s/<digits>, an optional ":<digits>" part and an optional
+        ";<digits>" part, anchored at the end of the string.
+        """
+        escaped_site = re.escape("http://" + self.getSiteDomain())
+        story_path = r"/s/\d+(:\d+)?(;\d+)?$"
+        return escaped_site + story_path
+
+    ## Login seems to be reasonably standard across eFiction sites.
+    def needToLoginCheck(self, data):
+        """Return True if ``data`` contains any of the login-required markers.
+
+        data -- text of a fetched page.
+        """
+        login_markers = ('Free Registration',
+                         "Invalid Password!",
+                         "Invalid User Name!",
+                         "Access to unlinked chapters requires")
+        for marker in login_markers:
+            if marker in data:
+                return True
+        return False
+        
+    def performLogin(self, url):
+        """Submit the login form and raise if the site rejects it.
+
+        Credentials come from self.username/self.password when a
+        password has been set on the adapter; otherwise from the
+        "username" and "password" config entries.
+
+        url -- the URL that triggered the login; it is only passed on
+               to the FailedToLogin exception.
+
+        Returns True on success.  Raises exceptions.FailedToLogin when
+        the response from login.php does not contain "My Account".
+        """
+        params = {}
+
+        if self.password:
+            params['theusername'] = self.username
+            params['thepassword'] = self.password
+        else:
+            params['theusername'] = self.getConfig("username")
+            params['thepassword'] = self.getConfig("password")
+        params['rememberMe'] = '1'
+        params['page'] = 'http://'+self.getSiteDomain()+'/'
+        params['submit'] = 'Login'
+
+        loginUrl = 'http://' + self.getSiteDomain() + '/login.php'
+        logger.debug("Will now login to URL (%s) as (%s)",
+                     loginUrl, params['theusername'])
+
+        response = self._fetchUrl(loginUrl, params)
+
+        # "My Account" is the marker this adapter uses to recognize a
+        # logged-in page.
+        if "My Account" not in response:
+            logger.info("Failed to login to URL %s as %s",
+                        loginUrl, params['theusername'])
+            raise exceptions.FailedToLogin(url, params['theusername'])
+
+        return True
+
+    ## Getting the chapter list and the meta data.
+    def extractChapterUrlsAndMetadata(self):
+        """Fetch the story page and the author's story list; fill in metadata.
+
+        Populates self.chapterUrls and the story metadata: title,
+        author, numChapters, genre, size, series, universe,
+        description, rating, codes, dates and status.  Logs in first
+        when needToLoginCheck() says the fetched page requires it.
+
+        Raises exceptions.StoryDoesNotExist on an HTTP 404 for the
+        story page, and exceptions.FailedToDownload when the site
+        reports the story as not validated or when the story's entry
+        cannot be located on the author's page.
+        """
+        url = self.url
+        logger.debug("URL: "+url)
+
+        try:
+            data = self._fetchUrl(url)
+        except urllib2.HTTPError, e:
+            if e.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            # A bare raise re-raises with the original traceback intact.
+            raise
+
+        if self.needToLoginCheck(data):
+            # need to log in for this one.
+            self.performLogin(url)
+            data = self._fetchUrl(url)
+
+        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
+            raise exceptions.FailedToDownload(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
+
+        # use BeautifulSoup HTML parser to make everything easier to find.
+        soup = bs.BeautifulSoup(data)
+
+        # Now go hunting for all the meta data and the chapter list.
+
+        ## Title
+        self.story.setMetadata('title',stripHTML(soup.find('h1')))
+
+        # Find authorid and URL from... author url.
+        author_link = soup.find('a', href=re.compile(r"/a/\w+"))
+        self.story.setMetadata('authorId',author_link['href'].split('/')[2])
+        self.story.setMetadata('authorUrl','http://'+self.host+author_link['href'])
+        self.story.setMetadata('author',stripHTML(author_link).replace("'s Page",""))
+
+        # Find the chapters:
+        story_path = '/s/'+self.story.getMetadata('storyId')
+        chapters = soup.findAll('a', href=re.compile(story_path+r":\d+$"))
+        if chapters:
+            for chapter in chapters:
+                # just in case there's tags, like <i> in chapter titles.
+                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']))
+        else:
+            # No chapter links found: treat the story page itself as the
+            # only chapter.
+            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+story_path))
+
+        self.story.setMetadata('numChapters',len(self.chapterUrls))
+
+        # surprisingly, the detailed page does not give enough details, so go to author's page
+        lc2 = self._findAuthorPageEntry()
+
+        for cat in lc2.findAll('div', {'class' : 'typediv'}):
+            self.story.addToList('genre',cat.text)
+
+        # in lieu of word count.
+        self.story.setMetadata('size', lc2.findNext('td', {'class' : 'num'}).text)
+
+        lc4 = lc2.findNext('td', {'class' : 'lc4'})
+        if lc4 is None:
+            raise exceptions.FailedToDownload("Error downloading story: %s!  Missing required element!" % self.url)
+
+        # Series and universe are optional; only record them when the
+        # corresponding link is present.
+        series_link = lc4.find('a', href=re.compile(r"/library/show_series.php\?id=\d+"))
+        if series_link is not None:
+            try:
+                # The series index is the text between the first pair
+                # of parentheses in the link's parent element.
+                series_index = series_link.parent.text.split('(')[1].split(')')[0]
+                self.setSeries(stripHTML(series_link), series_index)
+                self.story.setMetadata('seriesUrl','http://'+self.host+series_link['href'])
+            except (AttributeError, IndexError, ValueError):
+                # Best effort: a malformed series entry must not stop
+                # the download.
+                logger.debug("Could not parse series information for %s", self.url)
+
+        universe_link = lc4.find('a', href=re.compile(r"/library/universe.php\?id=\d+"))
+        if universe_link is not None:
+            self.story.setMetadata("universe",stripHTML(universe_link))
+
+        # Drop help spans and line breaks before slicing out the
+        # description below.
+        for tag in lc4.findAll('span', {'class' : 'help'}):
+            tag.extract()
+        for tag in lc4.findAll('br'):
+            tag.extract()
+
+        # The description is the text between the end of the cell's
+        # opening tag and the start of the next tag.
+        desc = "%s"%lc4
+        desc = desc[desc.index(">")+1:]
+        desc = desc[:desc.index("<")]
+        self.setDescription('http://'+self.host+'/s/'+self.story.getMetadata('storyId'),desc)
+
+        for b in lc4.findAll('b'):
+            label = b.text
+            if label in ['Posted:', 'Concluded:', 'Updated:']:
+                # Date values are read from the <noscript> element that
+                # follows the label.
+                value = b.findNext('noscript').text
+            else:
+                value = b.nextSibling
+
+            if 'Sex' in label:
+                self.story.setMetadata('rating', value)
+
+            if 'Codes' in label:
+                for code in value.split(' '):
+                    self.story.addToList('codes',code)
+
+            if 'Posted' in label:
+                # Posted also seeds dateUpdated; a later Concluded or
+                # Updated label overwrites it.
+                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
+                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
+
+            if 'Concluded' in label:
+                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
+
+            if 'Updated' in label:
+                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
+
+        # A span with class 'ab' in the cell marks the story as still
+        # in progress; without it the story is recorded as completed.
+        status = lc4.find('span', {'class' : 'ab'})
+        if status is not None:
+            self.story.setMetadata('status', 'In-Progress')
+            if "Last Activity" in status.text:
+                self.story.setMetadata('dateUpdated', makeDate(status.text.split('Activity: ')[1].split(')')[0], self.dateformat))
+        else:
+            self.story.setMetadata('status', 'Completed')
+
+    def _findAuthorPageEntry(self):
+        """Return the 'lc2' table cell for this story from the author's page.
+
+        Pages through the author's story list by adding "&skip=<n>" to
+        the author URL, advancing n by 10 per page, until a cell whose
+        first link points at /s/<storyId> is found.
+
+        Raises exceptions.FailedToDownload when a page has no 'lc2'
+        cells at all, instead of requesting the same page forever.
+        """
+        story_href = '/s/'+self.story.getMetadata('storyId')
+        author_url = self.story.getMetadata('authorUrl')
+        skip = 0
+        while True:
+            asoup = bs.BeautifulSoup(self._fetchUrl(author_url+"&skip="+str(skip)))
+            cells = asoup.findAll('td', {'class' : 'lc2'})
+            if not cells:
+                # NOTE(review): assumes the site returns a page without
+                # story cells once skip runs past the end of the list --
+                # confirm against the live site.
+                raise exceptions.FailedToDownload("Error downloading story: %s!  Story not found on author page %s" % (self.url, author_url))
+            for cell in cells:
+                link = cell.find('a')
+                if link is not None and link.get('href') == story_href:
+                    return cell
+            skip += 10
+
+            
+    # grab the text for an individual chapter.
+    def getChapterText(self, url):
+        """Download one chapter and return its cleaned-up HTML.
+
+        url -- the chapter URL.
+
+        A chapter that is split over several pages is stitched
+        together: every page linked from the pager (except the last
+        pager link) is fetched and appended.  Spans, h1-h4 headings,
+        br tags, 'date' divs and the content following the first form
+        are then removed.
+
+        Returns the result of self.utf8FromSoup(url, div).  Raises
+        exceptions.FailedToDownload when a fetched page has no
+        <div id="story"> element.
+        """
+        logger.debug('Getting chapter text from: %s' % url)
+
+        soup = bs.BeautifulSoup(self._fetchUrl(url),
+                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
+
+        div = soup.find('div', {'id' : 'story'})
+        # This check has to come before div is used for anything else.
+        if div is None:
+            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
+
+        # some big chapters are split over several pages
+        pager = div.find('span', {'class' : 'pager'})
+        if pager is not None:
+            # The last pager link is skipped.
+            # NOTE(review): presumably it is a "next" link that repeats
+            # one of the page links -- confirm against the live site.
+            for page_link in pager.findAll('a')[:-1]:
+                page_soup = bs.BeautifulSoup(self._fetchUrl("http://"+self.getSiteDomain()+page_link['href']),
+                                     selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
+
+                page_div = page_soup.find('div', {'id' : 'story'})
+                if page_div is None:
+                    raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
+
+                # appending next section: the node right after the
+                # 'conTag' span is moved onto the end of the last
+                # paragraph collected so far, then the whole page is
+                # appended.
+                paragraphs = div.findAll('p')
+                con_tag = page_div.find('span', {'class' : 'conTag'})
+                if con_tag is not None and con_tag.nextSibling is not None and paragraphs:
+                    paragraphs[-1].append(con_tag.nextSibling)
+                div.append(page_div)
+
+        # removing all the left-over stuff
+        for tag_name in ('span', 'h1', 'h2', 'h3', 'h4', 'br'):
+            for tag in div.findAll(tag_name):
+                tag.extract()
+        for tag in div.findAll('div', {'class' : 'date'}):
+            tag.extract()
+
+        # Remove the first form and the siblings that follow it.
+        # NOTE(review): as in the original code, the final sibling (or
+        # the form itself when nothing follows it) is left in place --
+        # confirm whether that is intended.
+        node = div.find('form')
+        if node is not None:
+            following = node.nextSibling
+            while following is not None:
+                node.extract()
+                node = following
+                following = following.nextSibling
+
+        return self.utf8FromSoup(url,div)
--- a/plugin-defaults.ini
+++ b/plugin-defaults.ini
@ -778,6 +778,24 @@ dislikes_label:Dislikes
 #username:YourName
 #password:yourpassword

+[storiesonline.net]
+## Some sites require login (or login for some rated stories).  The
+## program can prompt you, or you can save it in config.  In
+## commandline version, this should go in your personal.ini, not
+## defaults.ini.
+#username:YourName
+#password:yourpassword
+
+## Clear FanFiction from defaults, site is original fiction.
+extratags:
+
+extra_valid_entries:size,universe,codes
+#extra_titlepage_entries:size,universe,codes
+
+size_label:Size
+universe_label:Universe
+codes_label:Codes
+
 [grangerenchanted.com]
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter