Add to base_xenforoforum: fix for author using reply URL, continue_on_chapter_error feature, remove 'Story' in front of 'Thread' in title.

2026-05-06 03:20:24 +02:00 · 2015-09-28 13:32:24 -05:00 · 2015-09-28 13:32:24 -05:00 · 5040c44572
commit 5040c44572
parent e2d1a693dd
4 changed files with 96 additions and 46 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@ -406,7 +406,7 @@ add_to_replace_metadata:
 title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
 # remove 'Thread' and the next word, usually "Thread 2", "Thread
 # four", "Thread iv", etc
- title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
+ title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>

 add_to_extra_titlepage_entries:,tagsfromtitle,forumtags

@ -425,6 +425,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
 ## the description.
 description_limit:500

+## Because base_xenforoforum adapters can pull chapter URLs from human
+## posts, the odds of errors in the chapter URLs are vastly higher.
+## You can set continue_on_chapter_error:true to continue on after
+## failing to download a chapter and instead record an error message
+## in the ebook for that chapter.
+continue_on_chapter_error:false
+
 ## Each output format has a section that overrides [defaults]
 [html]

@ -1347,6 +1354,9 @@ extracategories:My Little Pony: Friendship is Magic
 ## Site dedicated to these categories/characters/ships
 extracategories:The Pretender

+[questionablequesting.com]
+## see [base_xenforoforum]
+
 [samandjack.net]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@ -17,6 +17,7 @@

 import time
 import logging
+import traceback
 logger = logging.getLogger(__name__)
 import re
 import urllib2
@ -38,11 +39,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                               # Most sites that claim to be
                               # iso-8859-1 (and some that claim to be
                               # utf8) are really windows-1252.
-							   
-							   
+
+
        # get storyId from url--url validation guarantees query is only sid=1234
-        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])        
-        
+        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
+
        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
@ -54,23 +55,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())
-        
+
        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','fsb')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%b %d, %Y at %I:%M %p"
-            
+
    @classmethod
    def getConfigSections(cls):
        "Only needs to be overriden if has additional ini sections."
        return ['base_xenforoforum',cls.getConfigSection()]
-    
+
    @classmethod
    def getURLPrefix(cls):
        # The site domain.  Does have www here, if it uses it.
-        return 'https://' + cls.getSiteDomain() 
+        return 'https://' + cls.getSiteDomain()

    @classmethod
    def getSiteExampleURLs(cls):
@ -78,7 +79,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

    def getSiteURLPattern(self):
        return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?"
-        
+
    def use_pagecache(self):
        '''
        adapters that will work with the page cache need to implement
@ -112,7 +113,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

        h1 = soup.find('div',{'class':'titleBar'}).h1
        self.story.setMetadata('title',stripHTML(h1))
-        
+
        if '#' in useurl:
            anchorid = useurl.split('#')[1]
            soup = soup.find('li',id=anchorid)
@ -129,7 +130,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                            self.story.setMetadata('datePublished', date)
                        if not self.story.getMetadataRaw('dateUpdated') or date > self.story.getMetadataRaw('dateUpdated'):
                            self.story.setMetadata('dateUpdated', date)
-                            
+
                        self.chapterUrls.append((name,self.getURLPrefix()+'/'+url))

                    ## only use tags if threadmarks for chapters.
@ -138,10 +139,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                        self.story.addToList('forumtags',stripHTML(tag))

            soup = soup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
-                
+
        # Now go hunting for the 'chapter list'.
        bq = soup.find('blockquote') # assume first posting contains TOC urls.
-        
+
        bq.name='div'

        for iframe in bq.find_all('iframe'):
@ -149,7 +150,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):

        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
            qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
-            
+
        self.setDescription(useurl,bq)

        # otherwise, use first post links--include first post since
@ -160,31 +161,34 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                logger.debug("found chapurl:%s"%url)
                if not url.startswith('http'):
                    url = self.getURLPrefix()+'/'+url
-    
+
                if ( url.startswith(self.getURLPrefix()) or
                     url.startswith('http://'+self.getSiteDomain()) or
                     url.startswith('https://'+self.getSiteDomain()) ) and ('/posts/' in url or '/threads/' in url):
+
                    # brute force way to deal with SB's http->https change when hardcoded http urls.
                    url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())
+
                    url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
-                    logger.debug("used chapurl:%s"%(url))
+
+                    logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
                    self.chapterUrls.append((name,url))
                    if url == useurl and 'First Post' == self.chapterUrls[0][0]:
                        # remove "First Post" if included in list.
                        logger.debug("delete dup 'First Post' chapter: %s %s"%self.chapterUrls[0])
                        del self.chapterUrls[0]
-                        
+
            # Didn't use threadmarks, so take created/updated dates
            # from the 'first' posting created and updated.
            date = self.make_date(soup.find('a',{'class':'datePermalink'}))
            if date:
                self.story.setMetadata('datePublished', date)
                self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
-        
+
            date = self.make_date(soup.find('div',{'class':'editDate'}))
            if date:
-                self.story.setMetadata('dateUpdated', date) 
-            
+                self.story.setMetadata('dateUpdated', date)
+
        self.story.setMetadata('numChapters',len(self.chapterUrls))

    def make_date(self,parenttag): # forums use a BS thing where dates
@ -205,7 +209,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        except:
            logger.debug('No date found in %s'%parenttag)
            return None
-        
+
    # grab the text for an individual chapter.
    def getChapterText(self, url):
        logger.debug('Getting chapter text from: %s' % url)
@ -218,28 +222,48 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
        # https://forums.sufficientvelocity.com/posts/39915/
        if '#post-' in url:
            url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'
-        
-        origurl = url
-        (data,opened) = self._fetchUrlOpened(url)
-        url = opened.geturl()
-        if '#' in origurl and '#' not in url:
-            url = url + origurl[origurl.index('#'):]
-        logger.debug("chapter URL redirected to: %s"%url)

-        soup = self.make_soup(data)
-
-        if '#' in url:
-            anchorid = url.split('#')[1]
-            soup = soup.find('li',id=anchorid)
-                
-        bq = soup.find('blockquote')
-
-        bq.name='div'
-
-        for iframe in bq.find_all('iframe'):
-            iframe.extract() # calibre book reader & editor don't like iframes to youtube.
-
-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
-            qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
+        ## Same as above, except for the case where the author mistakenly
+        ## used the reply link instead of the normal link to the post.
+        # "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
+        # https://forums.spacebattles.com/posts/15962513/
+        if 'reply?quote=' in url:
+            url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'

+        try:
+            origurl = url
+            (data,opened) = self._fetchUrlOpened(url)
+            url = opened.geturl()
+            if '#' in origurl and '#' not in url:
+                url = url + origurl[origurl.index('#'):]
+            logger.debug("chapter URL redirected to: %s"%url)
+            
+            soup = self.make_soup(data)
+    
+            if '#' in url:
+                anchorid = url.split('#')[1]
+                soup = soup.find('li',id=anchorid)
+    
+            bq = soup.find('blockquote')
+    
+            bq.name='div'
+    
+            for iframe in bq.find_all('iframe'):
+                iframe.extract() # calibre book reader & editor don't like iframes to youtube.
+    
+            for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
+                qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
+    
+        except Exception as e:
+            if self.getConfig('continue_on_chapter_error'):
+                bq = self.make_soup("""<div>
+<p><b>Error</b></p>
+<p>FanFicFare failed to download this chapter.  Because you have
+<b>continue_on_chapter_error</b> set to <b>true</b> in your personal.ini, the download continued.</p>
+<p>Chapter URL:<br>%s</p>
+<p>Error:<br><pre>%s</pre></p>
+</div>"""%(url,traceback.format_exc()))
+            else:
+                raise
+            
        return self.utf8FromSoup(url,bq)
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@ -82,7 +82,7 @@ def get_valid_sections():
    sitesections = list(othersections)
    for section in sites:
        sitesections.append(section)
-        # also allows [www.base_efiction] and [www.base_forum]. Not
+        # also allows [www.base_efiction] and [www.base_xenforoforum]. Not
        # likely to matter.
        if section.startswith('www.'):
            # add w/o www if has www
@ -166,6 +166,12 @@ def get_valid_set_options():
               'include_images':(None,['epub','html'],boollist),
               'grayscale_images':(None,['epub','html'],boollist),
               'no_image_processing':(None,['epub','html'],boollist),
+
+               'continue_on_chapter_error':(['base_xenforoforum',
+                                             'forums.spacebattles.com',
+                                             'forums.sufficientvelocity.com',
+                                             'questionablequesting.com',
+                                             ],None,boollist),
               }

    return dict(valdict)
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -405,7 +405,7 @@ add_to_replace_metadata:
 title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
 # remove 'Thread' and the next word, usually "Thread 2", "Thread
 # four", "Thread iv", etc
- title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
+ title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>

 add_to_extra_titlepage_entries:,tagsfromtitle,forumtags

@ -424,6 +424,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
 ## the description.
 description_limit:500

+## Because base_xenforoforum adapters can pull chapter URLs from human
+## posts, the odds of errors in the chapter URLs are vastly higher.
+## You can set continue_on_chapter_error:true to continue on after
+## failing to download a chapter and instead record an error message
+## in the ebook for that chapter.
+continue_on_chapter_error:false
+
 ## Each output format has a section that overrides [defaults]
 [html]

@ -1335,6 +1342,9 @@ extracategories:My Little Pony: Friendship is Magic
 ## Site dedicated to these categories/characters/ships
 extracategories:The Pretender

+[questionablequesting.com]
+## see [base_xenforoforum]
+
 [samandjack.net]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In