From 5040c4457225d3266d592c9dccf9bfa716f85857 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Mon, 28 Sep 2015 13:32:24 -0500 Subject: [PATCH] Add to base_xenforoforum: fix for author using reply URL, continue_on_chapter_error feature, remove 'Story' in front of 'Thread' in title. --- calibre-plugin/plugin-defaults.ini | 12 +- .../adapters/base_xenforoforum_adapter.py | 110 +++++++++++------- fanficfare/configurable.py | 8 +- fanficfare/defaults.ini | 12 +- 4 files changed, 96 insertions(+), 46 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index 7d374b05..cf6792e2 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -406,7 +406,7 @@ add_to_replace_metadata: title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=> # remove 'Thread' and the next word, usually "Thread 2", "Thread # four", "Thread iv", etc - title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=> + title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=> add_to_extra_titlepage_entries:,tagsfromtitle,forumtags @@ -425,6 +425,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S ## the description. description_limit:500 +## Because base_xenforoforum adapters can pull chapter URLs from human +## posts, the odds of errors in the chapter URLs are vastly higher. +## You can set continue_on_chapter_error:true to continue on after +## failing to download a chapter and instead record an error message +## in the ebook for that chapter. +continue_on_chapter_error:false + ## Each output format has a section that overrides [defaults] [html] @@ -1347,6 +1354,9 @@ extracategories:My Little Pony: Friendship is Magic ## Site dedicated to these categories/characters/ships extracategories:The Pretender +[questionablequesting.com] +## see [base_xenforoforum] + [samandjack.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. 
In diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py index f6192743..4084872b 100644 --- a/fanficfare/adapters/base_xenforoforum_adapter.py +++ b/fanficfare/adapters/base_xenforoforum_adapter.py @@ -17,6 +17,7 @@ import time import logging +import traceback logger = logging.getLogger(__name__) import re import urllib2 @@ -38,11 +39,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): # Most sites that claim to be # iso-8859-1 (and some that claim to be # utf8) are really windows-1252. - - + + # get storyId from url--url validation guarantees query is only sid=1234 - self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) - + self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2]) + # get storyId from url--url validation guarantees query correct m = re.match(self.getSiteURLPattern(),url) if m: @@ -54,23 +55,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs()) - + # Each adapter needs to have a unique site abbreviation. self.story.setMetadata('siteabbrev','fsb') # The date format will vary from site to site. # http://docs.python.org/library/datetime.html#strftime-strptime-behavior self.dateformat = "%b %d, %Y at %I:%M %p" - + @classmethod def getConfigSections(cls): "Only needs to be overriden if has additional ini sections." return ['base_xenforoforum',cls.getConfigSection()] - + @classmethod def getURLPrefix(cls): # The site domain. Does have www here, if it uses it. - return 'https://' + cls.getSiteDomain() + return 'https://' + cls.getSiteDomain() @classmethod def getSiteExampleURLs(cls): @@ -78,7 +79,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): def getSiteURLPattern(self): return r"https?://"+re.escape(self.getSiteDomain())+r"/(?Pthreads|posts)/(.+\.)?(?P\d+)/?" 
- + def use_pagecache(self): ''' adapters that will work with the page cache need to implement @@ -112,7 +113,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): h1 = soup.find('div',{'class':'titleBar'}).h1 self.story.setMetadata('title',stripHTML(h1)) - + if '#' in useurl: anchorid = useurl.split('#')[1] soup = soup.find('li',id=anchorid) @@ -129,7 +130,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): self.story.setMetadata('datePublished', date) if not self.story.getMetadataRaw('dateUpdated') or date > self.story.getMetadataRaw('dateUpdated'): self.story.setMetadata('dateUpdated', date) - + self.chapterUrls.append((name,self.getURLPrefix()+'/'+url)) ## only use tags if threadmarks for chapters. @@ -138,10 +139,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): self.story.addToList('forumtags',stripHTML(tag)) soup = soup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above) - + # Now go hunting for the 'chapter list'. bq = soup.find('blockquote') # assume first posting contains TOC urls. - + bq.name='div' for iframe in bq.find_all('iframe'): @@ -149,7 +150,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): for qdiv in bq.find_all('div',{'class':'quoteExpand'}): qdiv.extract() # Remove
click to expand
- + self.setDescription(useurl,bq) # otherwise, use first post links--include first post since @@ -160,31 +161,34 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): logger.debug("found chapurl:%s"%url) if not url.startswith('http'): url = self.getURLPrefix()+'/'+url - + if ( url.startswith(self.getURLPrefix()) or url.startswith('http://'+self.getSiteDomain()) or url.startswith('https://'+self.getSiteDomain()) ) and ('/posts/' in url or '/threads/' in url): + # brute force way to deal with SB's http->https change when hardcoded http urls. url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix()) + url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting. - logger.debug("used chapurl:%s"%(url)) + + logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url)) self.chapterUrls.append((name,url)) if url == useurl and 'First Post' == self.chapterUrls[0][0]: # remove "First Post" if included in list. logger.debug("delete dup 'First Post' chapter: %s %s"%self.chapterUrls[0]) del self.chapterUrls[0] - + # Didn't use threadmarks, so take created/updated dates # from the 'first' posting created and updated. date = self.make_date(soup.find('a',{'class':'datePermalink'})) if date: self.story.setMetadata('datePublished', date) self.story.setMetadata('dateUpdated', date) # updated overwritten below if found. - + date = self.make_date(soup.find('div',{'class':'editDate'})) if date: - self.story.setMetadata('dateUpdated', date) - + self.story.setMetadata('dateUpdated', date) + self.story.setMetadata('numChapters',len(self.chapterUrls)) def make_date(self,parenttag): # forums use a BS thing where dates @@ -205,7 +209,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): except: logger.debug('No date found in %s'%parenttag) return None - + # grab the text for an individual chapter. 
def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) @@ -218,28 +222,48 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): # https://forums.sufficientvelocity.com/posts/39915/ if '#post-' in url: url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/' - - origurl = url - (data,opened) = self._fetchUrlOpened(url) - url = opened.geturl() - if '#' in origurl and '#' not in url: - url = url + origurl[origurl.index('#'):] - logger.debug("chapter URL redirected to: %s"%url) - soup = self.make_soup(data) - - if '#' in url: - anchorid = url.split('#')[1] - soup = soup.find('li',id=anchorid) - - bq = soup.find('blockquote') - - bq.name='div' - - for iframe in bq.find_all('iframe'): - iframe.extract() # calibre book reader & editor don't like iframes to youtube. - - for qdiv in bq.find_all('div',{'class':'quoteExpand'}): - qdiv.extract() # Remove
click to expand
+ ## Same as above except for the case where author mistakenly + ## used the reply link instead of normal link to post. + # "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513" + # https://forums.spacebattles.com/posts/ + if 'reply?quote=' in url: + url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/' + try: + origurl = url + (data,opened) = self._fetchUrlOpened(url) + url = opened.geturl() + if '#' in origurl and '#' not in url: + url = url + origurl[origurl.index('#'):] + logger.debug("chapter URL redirected to: %s"%url) + + soup = self.make_soup(data) + + if '#' in url: + anchorid = url.split('#')[1] + soup = soup.find('li',id=anchorid) + + bq = soup.find('blockquote') + + bq.name='div' + + for iframe in bq.find_all('iframe'): + iframe.extract() # calibre book reader & editor don't like iframes to youtube. + + for qdiv in bq.find_all('div',{'class':'quoteExpand'}): + qdiv.extract() # Remove
click to expand
+ + except Exception as e: + if self.getConfig('continue_on_chapter_error'): + bq = self.make_soup("""
+

Error

+

FanFicFare failed to download this chapter. Because you have +continue_on_chapter_error set to true in your personal.ini, the download continued.

+

Chapter URL:
%s

+

Error:

%s

+
"""%(url,traceback.format_exc())) + else: + raise + return self.utf8FromSoup(url,bq) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 1eca838f..c516a9ed 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -82,7 +82,7 @@ def get_valid_sections(): sitesections = list(othersections) for section in sites: sitesections.append(section) - # also allows [www.base_efiction] and [www.base_forum]. Not + # also allows [www.base_efiction] and [www.base_xenforoforum]. Not # likely to matter. if section.startswith('www.'): # add w/o www if has www @@ -166,6 +166,12 @@ def get_valid_set_options(): 'include_images':(None,['epub','html'],boollist), 'grayscale_images':(None,['epub','html'],boollist), 'no_image_processing':(None,['epub','html'],boollist), + + 'continue_on_chapter_error':(['base_xenforoforum', + 'forums.spacebattles.com', + 'forums.sufficientvelocity.com', + 'questionablequesting.com', + ],None,boollist), } return dict(valdict) diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index eeeba758..76eef052 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -405,7 +405,7 @@ add_to_replace_metadata: title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=> # remove 'Thread' and the next word, usually "Thread 2", "Thread # four", "Thread iv", etc - title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=> + title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=> add_to_extra_titlepage_entries:,tagsfromtitle,forumtags @@ -424,6 +424,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S ## the description. description_limit:500 +## Because base_xenforoforum adapters can pull chapter URLs from human +## posts, the odds of errors in the chapter URLs are vastly higher. +## You can set continue_on_chapter_error:true to continue on after +## failing to download a chapter and instead record an error message +## in the ebook for that chapter. 
+continue_on_chapter_error:false + ## Each output format has a section that overrides [defaults] [html] @@ -1335,6 +1342,9 @@ extracategories:My Little Pony: Friendship is Magic ## Site dedicated to these categories/characters/ships extracategories:The Pretender +[questionablequesting.com] +## see [base_xenforoforum] + [samandjack.net] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In