From 5040c4457225d3266d592c9dccf9bfa716f85857 Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Mon, 28 Sep 2015 13:32:24 -0500
Subject: [PATCH] Add to base_xenforoforum: fix for author using reply URL,
 continue_on_chapter_error feature, remove 'Story' in front of 'Thread' in
 title.

---
 calibre-plugin/plugin-defaults.ini            |  12 +-
 .../adapters/base_xenforoforum_adapter.py     | 110 +++++++++++-------
 fanficfare/configurable.py                    |   8 +-
 fanficfare/defaults.ini                       |  12 +-
 4 files changed, 96 insertions(+), 46 deletions(-)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index 7d374b05..cf6792e2 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -406,7 +406,7 @@ add_to_replace_metadata:
  title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
 # remove 'Thread' and the next word, usually "Thread 2", "Thread
 # four", "Thread iv", etc
- title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
+ title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
 
 add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
 
@@ -425,6 +425,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
 ## the description.
 description_limit:500
 
+## Because base_xenforoforum adapters can pull chapter URLs from links
+## in human-written posts, errors in chapter URLs are far more likely
+## than on other sites.  Set continue_on_chapter_error:true to keep
+## going after a chapter fails to download; an error message is
+## recorded in the ebook in place of that chapter.
+continue_on_chapter_error:false
+
 ## Each output format has a section that overrides [defaults]
 [html]
 
@@ -1347,6 +1354,9 @@ extracategories:My Little Pony: Friendship is Magic
 ## Site dedicated to these categories/characters/ships
 extracategories:The Pretender
 
+[questionablequesting.com]
+## see [base_xenforoforum]
+
 [samandjack.net]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In
diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py
index f6192743..4084872b 100644
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@@ -17,6 +17,7 @@
 
 import time
 import logging
+import traceback
 logger = logging.getLogger(__name__)
 import re
 import urllib2
@@ -38,11 +39,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                                # Most sites that claim to be
                                # iso-8859-1 (and some that claim to be
                                # utf8) are really windows-1252.
-							   
-							   
+
+
         # get storyId from url--url validation guarantees query is only sid=1234
-        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])        
-        
+        self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
+
         # get storyId from url--url validation guarantees query correct
         m = re.match(self.getSiteURLPattern(),url)
         if m:
@@ -54,23 +55,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
             raise exceptions.InvalidStoryURL(url,
                                              self.getSiteDomain(),
                                              self.getSiteExampleURLs())
-        
+
         # Each adapter needs to have a unique site abbreviation.
         self.story.setMetadata('siteabbrev','fsb')
 
         # The date format will vary from site to site.
         # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
         self.dateformat = "%b %d, %Y at %I:%M %p"
-            
+
     @classmethod
     def getConfigSections(cls):
         "Only needs to be overriden if has additional ini sections."
         return ['base_xenforoforum',cls.getConfigSection()]
-    
+
     @classmethod
     def getURLPrefix(cls):
         # The site domain.  Does have www here, if it uses it.
-        return 'https://' + cls.getSiteDomain() 
+        return 'https://' + cls.getSiteDomain()
 
     @classmethod
     def getSiteExampleURLs(cls):
@@ -78,7 +79,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
     def getSiteURLPattern(self):
         return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?"
-        
+
     def use_pagecache(self):
         '''
         adapters that will work with the page cache need to implement
@@ -112,7 +113,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
         h1 = soup.find('div',{'class':'titleBar'}).h1
         self.story.setMetadata('title',stripHTML(h1))
-        
+
         if '#' in useurl:
             anchorid = useurl.split('#')[1]
             soup = soup.find('li',id=anchorid)
@@ -129,7 +130,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                             self.story.setMetadata('datePublished', date)
                         if not self.story.getMetadataRaw('dateUpdated') or date > self.story.getMetadataRaw('dateUpdated'):
                             self.story.setMetadata('dateUpdated', date)
-                            
+
                         self.chapterUrls.append((name,self.getURLPrefix()+'/'+url))
 
                     ## only use tags if threadmarks for chapters.
@@ -138,10 +139,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                         self.story.addToList('forumtags',stripHTML(tag))
 
             soup = soup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
-                
+
         # Now go hunting for the 'chapter list'.
         bq = soup.find('blockquote') # assume first posting contains TOC urls.
-        
+
         bq.name='div'
 
         for iframe in bq.find_all('iframe'):
@@ -149,7 +150,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
 
         for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
             qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
-            
+
         self.setDescription(useurl,bq)
 
         # otherwise, use first post links--include first post since
@@ -160,31 +161,34 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                 logger.debug("found chapurl:%s"%url)
                 if not url.startswith('http'):
                     url = self.getURLPrefix()+'/'+url
-    
+
                 if ( url.startswith(self.getURLPrefix()) or
                      url.startswith('http://'+self.getSiteDomain()) or
                      url.startswith('https://'+self.getSiteDomain()) ) and ('/posts/' in url or '/threads/' in url):
+
                     # brute force way to deal with SB's http->https change when hardcoded http urls.
                     url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())
+
                     url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
-                    logger.debug("used chapurl:%s"%(url))
+
+                    logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
                     self.chapterUrls.append((name,url))
                     if url == useurl and 'First Post' == self.chapterUrls[0][0]:
                         # remove "First Post" if included in list.
                         logger.debug("delete dup 'First Post' chapter: %s %s"%self.chapterUrls[0])
                         del self.chapterUrls[0]
-                        
+
             # Didn't use threadmarks, so take created/updated dates
             # from the 'first' posting created and updated.
             date = self.make_date(soup.find('a',{'class':'datePermalink'}))
             if date:
                 self.story.setMetadata('datePublished', date)
                 self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
-        
+
             date = self.make_date(soup.find('div',{'class':'editDate'}))
             if date:
-                self.story.setMetadata('dateUpdated', date) 
-            
+                self.story.setMetadata('dateUpdated', date)
+
         self.story.setMetadata('numChapters',len(self.chapterUrls))
 
     def make_date(self,parenttag): # forums use a BS thing where dates
@@ -205,7 +209,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         except:
             logger.debug('No date found in %s'%parenttag)
             return None
-        
+
     # grab the text for an individual chapter.
     def getChapterText(self, url):
         logger.debug('Getting chapter text from: %s' % url)
@@ -218,28 +222,48 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
         # https://forums.sufficientvelocity.com/posts/39915/
         if '#post-' in url:
             url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'
-        
-        origurl = url
-        (data,opened) = self._fetchUrlOpened(url)
-        url = opened.geturl()
-        if '#' in origurl and '#' not in url:
-            url = url + origurl[origurl.index('#'):]
-        logger.debug("chapter URL redirected to: %s"%url)
 
-        soup = self.make_soup(data)
-
-        if '#' in url:
-            anchorid = url.split('#')[1]
-            soup = soup.find('li',id=anchorid)
-                
-        bq = soup.find('blockquote')
-
-        bq.name='div'
-
-        for iframe in bq.find_all('iframe'):
-            iframe.extract() # calibre book reader & editor don't like iframes to youtube.
-
-        for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
-            qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
+        ## Same as above, except for the case where the author mistakenly
+        ## used the reply link instead of the normal link to the post.
+        # "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
+        # becomes https://forums.spacebattles.com/posts/15962513/
+        if 'reply?quote=' in url:
+            url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'
 
+        try:
+            origurl = url
+            (data,opened) = self._fetchUrlOpened(url)
+            url = opened.geturl()
+            if '#' in origurl and '#' not in url:
+                url = url + origurl[origurl.index('#'):]
+            logger.debug("chapter URL redirected to: %s"%url)
+            
+            soup = self.make_soup(data)
+    
+            if '#' in url:
+                anchorid = url.split('#')[1]
+                soup = soup.find('li',id=anchorid)
+    
+            bq = soup.find('blockquote')
+    
+            bq.name='div'
+    
+            for iframe in bq.find_all('iframe'):
+                iframe.extract() # calibre book reader & editor don't like iframes to youtube.
+    
+            for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
+                qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
+    
+        except Exception as e:
+            if self.getConfig('continue_on_chapter_error'):
+                bq = self.make_soup("""<div>
+<p><b>Error</b></p>
+<p>FanFicFare failed to download this chapter.  Because you have
+<b>continue_on_chapter_error</b> set to <b>true</b> in your personal.ini, the download continued.</p>
+<p>Chapter URL:<br>%s</p>
+<p>Error:<br><pre>%s</pre></p>
+</div>"""%(url,traceback.format_exc()))
+            else:
+                raise
+            
         return self.utf8FromSoup(url,bq)
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 1eca838f..c516a9ed 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -82,7 +82,7 @@ def get_valid_sections():
     sitesections = list(othersections)
     for section in sites:
         sitesections.append(section)
-        # also allows [www.base_efiction] and [www.base_forum]. Not
+        # also allows [www.base_efiction] and [www.base_xenforoforum]. Not
         # likely to matter.
         if section.startswith('www.'):
             # add w/o www if has www
@@ -166,6 +166,12 @@ def get_valid_set_options():
                'include_images':(None,['epub','html'],boollist),
                'grayscale_images':(None,['epub','html'],boollist),
                'no_image_processing':(None,['epub','html'],boollist),
+
+               'continue_on_chapter_error':(['base_xenforoforum',
+                                             'forums.spacebattles.com',
+                                             'forums.sufficientvelocity.com',
+                                             'questionablequesting.com',
+                                             ],None,boollist),
                }
 
     return dict(valdict)
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index eeeba758..76eef052 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -405,7 +405,7 @@ add_to_replace_metadata:
  title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
 # remove 'Thread' and the next word, usually "Thread 2", "Thread
 # four", "Thread iv", etc
- title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
+ title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
 
 add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
 
@@ -424,6 +424,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
 ## the description.
 description_limit:500
 
+## Because base_xenforoforum adapters can pull chapter URLs from links
+## in human-written posts, errors in chapter URLs are far more likely
+## than on other sites.  Set continue_on_chapter_error:true to keep
+## going after a chapter fails to download; an error message is
+## recorded in the ebook in place of that chapter.
+continue_on_chapter_error:false
+
 ## Each output format has a section that overrides [defaults]
 [html]
 
@@ -1335,6 +1342,9 @@ extracategories:My Little Pony: Friendship is Magic
 ## Site dedicated to these categories/characters/ships
 extracategories:The Pretender
 
+[questionablequesting.com]
+## see [base_xenforoforum]
+
 [samandjack.net]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In