Fixes for finestories.com, now shares code with storiesonline.net, thanks davidfor for this.

2026-05-09 05:21:13 +02:00 · 2016-02-21 11:22:58 -06:00 · 2016-02-21 11:22:58 -06:00 · 44e12d1ef3
commit 44e12d1ef3
parent 83c8987b8b
4 changed files with 78 additions and 266 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@ -1193,14 +1193,41 @@ dislikes_label:Dislikes
 ## program can prompt you, or you can save it in config.  In
 ## commandline version, this should go in your personal.ini, not
 ## defaults.ini.
-#username:YourName
+## finestories.com has started requiring login by email rather than
+## pen name.
+#username:youremail@yourdomain.dom
 #password:yourpassword

-# shows size as "10 KB", not word count
-extra_valid_entries:size
+## Clear FanFiction from defaults, site is original fiction.
+extratags:

-# don't show twitter icon.
-cover_exclusion_regexp:/res/css/bir.png
+extra_valid_entries:size,universe,universeUrl,universeHTML,sitetags,notice,codes,score
+#extra_titlepage_entries:size,universeHTML,sitetags,notice,score
+include_in_codes:sitetags
+
+## adds to include_subject_tags instead of replacing it.
+#extra_subject_tags:sitetags
+
+size_label:Size
+universe_label:Universe
+universeUrl_label:Universe URL
+universeHTML_label:Universe
+sitetags_label:Site Tags
+notice_label:Notice
+score_label:Score
+
+## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
+## make entryHTML.
+make_linkhtml_entries:universe
+
+## storiesonline.net stories can be in a series or a universe, but not
+## both.  By default, universe will be populated in 'series' with
+## index=0
+universe_as_series: true
+
+## some sites include images that we don't ever want becoming the
+## cover image.  This lets you exclude them.
+cover_exclusion_regexp:/css/bir.png

 [forums.spacebattles.com]
 ## see [base_xenforoforum]
--- a/fanficfare/adapters/adapter_finestoriescom.py
+++ b/fanficfare/adapters/adapter_finestoriescom.py
@ -15,279 +15,32 @@
 # limitations under the License.
 #

-import time
 import logging
 logger = logging.getLogger(__name__)
-import re
-import urllib2
-
-
-from ..htmlcleanup import stripHTML
-from .. import exceptions as exceptions
-
-from base_adapter import BaseSiteAdapter,  makeDate
+from adapter_storiesonlinenet import StoriesOnlineNetAdapter

 def getClass():
    return FineStoriesComAdapter

 # Class name has to be unique.  Our convention is camel case the
 # sitename with Adapter at the end.  www is skipped.
-class FineStoriesComAdapter(BaseSiteAdapter):
+class FineStoriesComAdapter(StoriesOnlineNetAdapter):

-    def __init__(self, config, url):
-        BaseSiteAdapter.__init__(self, config, url)
-
-        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
-        self.password = ""
-        self.is_adult=False
-
-        # get storyId from url
-        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2].split(':')[0])
-        if 'storyInfo' in self.story.getMetadata('storyId'):
-            self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
-
-        # normalized story URL.
-        self._setURL('http://' + self.getSiteDomain() + '/s/storyInfo.php?id='+self.story.getMetadata('storyId'))
-
-        # Each adapter needs to have a unique site abbreviation.
-        self.story.setMetadata('siteabbrev','fnst')
-
-        # The date format will vary from site to site.
-        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
-        self.dateformat = "%Y-%m-%d"
+    @classmethod
+    def getSiteAbbrev(self):
+        return 'fnst'

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
        return 'finestories.com'

-    @classmethod
-    def getSiteExampleURLs(cls):
-        return "http://"+cls.getSiteDomain()+"/s/1234 http://"+cls.getSiteDomain()+"/s/1234:4010 http://"+cls.getSiteDomain()+"/library/storyInfo.php?id=1234"
-
-    def getSiteURLPattern(self):
-        return re.escape("http://"+self.getSiteDomain())+r"/(s|library)?/(storyInfo.php\?id=)?\d+(:\d+)?(;\d+)?$"
-
    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        if 'Free Registration' in data \
+                or "Log In" in data \
                or "Invalid Password!" in data \
                or "Invalid User Name!" in data:
            return True
        else:
            return False
-
-    def performLogin(self, url):
-        params = {}
-
-        if self.password:
-            params['theusername'] = self.username
-            params['thepassword'] = self.password
-        else:
-            params['theusername'] = self.getConfig("username")
-            params['thepassword'] = self.getConfig("password")
-        params['rememberMe'] = '1'
-        params['page'] = 'http://'+self.getSiteDomain()+'/'
-        params['submit'] = 'Login'
-
-        loginUrl = 'http://' + self.getSiteDomain() + '/login.php'
-        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
-                                                              params['theusername']))
-
-        d = self._fetchUrl(loginUrl, params)
-
-        if "My Account" not in d : #Member Account
-            logger.info("Failed to login to URL %s as %s" % (loginUrl,
-                                                              params['theusername']))
-            raise exceptions.FailedToLogin(url,params['theusername'])
-            return False
-        else:
-            return True
-
-    ## Getting the chapter list and the meta data, plus 'is adult' checking.
-    def extractChapterUrlsAndMetadata(self):
-
-        # index=1 makes sure we see the story chapter index.  Some
-        # sites skip that for one-chapter stories.
-        url = self.url
-        logger.debug("URL: "+url)
-
-        try:
-            data = self._fetchUrl(url)
-        except urllib2.HTTPError, e:
-            if e.code == 404:
-                raise exceptions.StoryDoesNotExist(self.url)
-            else:
-                raise e
-
-        if self.needToLoginCheck(data):
-            # need to log in for this one.
-            self.performLogin(url)
-            data = self._fetchUrl(url)
-
-        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
-            raise exceptions.AccessDenied(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
-
-        # use BeautifulSoup HTML parser to make everything easier to find.
-        soup = self.make_soup(data)
-        # print data
-
-        # Now go hunting for all the meta data and the chapter list.
-
-        ## Title
-        a = soup.find('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')))
-        self.story.setMetadata('title',stripHTML(a))
-
-        # Find authorid and URL from... author url.
-        a = soup.find('a', href=re.compile(r"/a/\w+"))
-        self.story.setMetadata('authorId',a['href'].split('/')[2])
-        self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
-        self.story.setMetadata('author',a.text)
-
-        # Find the chapters:
-        chapters = soup.findAll('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId')+":\d+$"))
-        if len(chapters) != 0:
-            for chapter in chapters:
-                # just in case there's tags, like <i> in chapter titles.
-                self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['href']))
-        else:
-            self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/s/'+self.story.getMetadata('storyId')))
-
-        self.story.setMetadata('numChapters',len(self.chapterUrls))
-
-        # surprisingly, the detailed page does not give enough details, so go to author's page
-
-        skip=0
-        i=0
-        while i == 0:
-            asoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl')+"&skip="+unicode(skip)))
-
-            tds = asoup.findAll('td', {'class' : 'lc2'})
-            for lc2 in tds:
-                if lc2.find('a', href=re.compile(r'/s/'+self.story.getMetadata('storyId'))):
-                    i=1
-                    break
-                if tds[len(tds)-1] == lc2:
-                    skip=skip+10
-
-        for cat in lc2.findAll('div', {'class' : 'typediv'}):
-            self.story.addToList('category',cat.text)
-
-        self.story.setMetadata('size', lc2.findNext('td', {'class' : 'num'}).text)
-
-        lc4 = lc2.findNext('td', {'class' : 'lc4'})
-
-        try:
-            a = lc4.find('a', href=re.compile(r"/library/show_series.php\?id=\d+"))
-            i = a.parent.text.split('(')[1].split(')')[0]
-            self.setSeries(a.text, i)
-            self.story.setMetadata('seriesUrl','http://'+self.host+a['href'])
-        except:
-            pass
-        try:
-            a = lc4.find('a', href=re.compile(r"/library/universe.php\?id=\d+"))
-            self.story.addToList("category",a.text)
-        except:
-            pass
-
-        for a in lc4.findAll('span', {'class' : 'help'}) + lc4.findAll('script'):
-            a.extract()
-
-        self.setDescription('http://'+self.host+'/s/'+self.story.getMetadata('storyId'),lc4.text.split('[More Info')[0])
-
-        for b in lc4.findAll('b'):
-            label = b.text
-            value = b.nextSibling
-
-            if 'For Age' in label:
-                self.story.setMetadata('rating', value)
-
-            if 'Tags' in label:
-                for genre in value.split(', '):
-                    self.story.addToList('genre',genre)
-
-            ## Site uses a <script> to inject timestamp in locale plus <noscript> general version.
-            if 'Posted' in label:
-                value = b.find_next_sibling('noscript')
-                if '(' in value:
-                    date = makeDate(stripHTML(value.split(' (')[0]), self.dateformat)
-                else:
-                    date = makeDate(stripHTML(value), self.dateformat)
-                self.story.setMetadata('datePublished', date)
-                self.story.setMetadata('dateUpdated', date)
-
-            if 'Concluded' in label or 'Updated' in label:
-                value = b.find_next_sibling('noscript')
-                if '(' in value:
-                    date = makeDate(stripHTML(value.split(' (')[0]), self.dateformat)
-                else:
-                    date = makeDate(stripHTML(value), self.dateformat)
-                self.story.setMetadata('dateUpdated', date)
-
-        status = lc4.find('span', {'class' : 'ab'})
-        if  status != None:
-            self.story.setMetadata('status', 'In-Progress')
-            if "Last Activity" in status.text:
-                self.story.setMetadata('dateUpdated', makeDate(status.text.split('Activity: ')[1].split(')')[0], self.dateformat))
-        else:
-            self.story.setMetadata('status', 'Completed')
-
-
-    # grab the text for an individual chapter.
-    def getChapterText(self, url):
-
-        logger.debug('Getting chapter text from: %s' % url)
-
-        soup = self.make_soup(self._fetchUrl(url))
-
-        div = soup.find('article')
-
-        if None == div:
-            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
-
-        # some big chapters are split over several pages
-        pager = div.find('span', {'class' : 'pager'})
-        if pager != None:
-            urls=pager.findAll('a')
-            urls=urls[:len(urls)-1]
-
-            for ur in urls:
-                soup = self.make_soup(self._fetchUrl("http://"+self.getSiteDomain()+ur['href']))
-
-                div1 = soup.find('article')
-
-                #print("div.contents:%s"%(div.contents,))
-                # appending next section
-                last=div.findAll('p')
-                next=div1.find('span', {'class' : 'conTag'}).nextSibling
-                last[len(last)-1]=last[len(last)-1].append(next)
-
-                self.clean_chapter(div1)
-                #print("div.contents:%s"%(div.contents,))
-                for t in div1.contents:
-                    div.append(t)
-
-        self.clean_chapter(div)
-
-        return self.utf8FromSoup(url,div)
-    def clean_chapter(self,art):
-        # discard included chapter heading.
-        # discard included date
-        # discard share block
-        # continued/continues
-        # discard next link
-        for tag in art.find_all('h2') + \
-                art.find_all('div', class_="date") + \
-                art.find_all('div', class_="vform") + \
-                art.find_all('span', class_="conTag") + \
-                art.find_all('h3', class_="end"):
-            tag.extract()
-
-        # remove pager blocks.
-        for pager in art.find_all('span', class_="pager"):
-            # remove br tags before and after pager.
-            #print("br list prev: %s"%len(pager.find_previous_siblings('br')))
-            #print("br list next: %s"%len(pager.find_next_siblings('br')))
-            for tag in pager.find_next_siblings('br')[:2] + pager.find_previous_siblings('br')[:2]:
-                tag.extract()
-            pager.extract()
--- a/fanficfare/adapters/adapter_storiesonlinenet.py
+++ b/fanficfare/adapters/adapter_storiesonlinenet.py
@ -36,6 +36,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
+        logger.debug("StoriesOnlineNetAdapter.__init__ - url='%s'" % url)

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
@ -50,12 +51,16 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
        self._setURL('http://' + self.getSiteDomain() + '/s/'+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
-        self.story.setMetadata('siteabbrev','strol')
+        self.story.setMetadata('siteabbrev',self.getSiteAbbrev())

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%Y-%m-%d"

+    @classmethod
+    def getSiteAbbrev(self):
+        return 'strol'
+
    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, if it uses it.
@ -66,7 +71,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):
        return "http://"+cls.getSiteDomain()+"/s/1234 http://"+cls.getSiteDomain()+"/s/1234:4010"

    def getSiteURLPattern(self):
-        return re.escape("http://"+self.getSiteDomain())+r"/s/\d+((:\d+)?(;\d+)?$|(:i)?$)?"
+        return re.escape("http://"+self.getSiteDomain())+r"/(s|library)/(storyInfo.php\?id=)?(?P<id>\d+)((:\d+)?(;\d+)?$|(:i)?$)?"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
@ -290,7 +295,7 @@ class StoriesOnlineNetAdapter(BaseSiteAdapter):

            if 'Tags' in label or 'Codes' in label:
                for code in re.split(r'\s*,\s*', value.strip()):
-                     self.story.addToList('sitetags',code)
+                    self.story.addToList('sitetags',code)

            if 'Posted' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -1181,14 +1181,41 @@ dislikes_label:Dislikes
 ## program can prompt you, or you can save it in config.  In
 ## commandline version, this should go in your personal.ini, not
 ## defaults.ini.
-#username:YourName
+## finestories.com has started requiring login by email rather than
+## pen name.
+#username:youremail@yourdomain.dom
 #password:yourpassword

-# shows size as "10 KB", not word count
-extra_valid_entries:size
+## Clear FanFiction from defaults, site is original fiction.
+extratags:

-# don't show twitter icon.
-cover_exclusion_regexp:/res/css/bir.png
+extra_valid_entries:size,universe,universeUrl,universeHTML,sitetags,notice,codes,score
+#extra_titlepage_entries:size,universeHTML,sitetags,notice,score
+include_in_codes:sitetags
+
+## adds to include_subject_tags instead of replacing it.
+#extra_subject_tags:sitetags
+
+size_label:Size
+universe_label:Universe
+universeUrl_label:Universe URL
+universeHTML_label:Universe
+sitetags_label:Site Tags
+notice_label:Notice
+score_label:Score
+
+## Assume entryUrl, apply to "<a class='%slink' href='%s'>%s</a>" to
+## make entryHTML.
+make_linkhtml_entries:universe
+
+## storiesonline.net stories can be in a series or a universe, but not
+## both.  By default, universe will be populated in 'series' with
+## index=0
+universe_as_series: true
+
+## some sites include images that we don't ever want becoming the
+## cover image.  This lets you exclude them.
+cover_exclusion_regexp:/css/bir.png

 [forums.spacebattles.com]
 ## see [base_xenforoforum]