Add to base_xenforoforum: fix for author using reply URL, continue_on_chapter_error feature, remove 'Story' in front of 'Thread' in title.

This commit is contained in:
Jim Miller 2015-09-28 13:32:24 -05:00
parent e2d1a693dd
commit 5040c44572
4 changed files with 96 additions and 46 deletions

View file

@ -406,7 +406,7 @@ add_to_replace_metadata:
title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
# remove 'Thread' (optionally preceded by 'Story') and the next
# word, usually "Thread 2", "Thread four", "Thread iv", etc
title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
@ -425,6 +425,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
## the description.
description_limit:500
## Because base_xenforoforum adapters can pull chapter URLs from human
## posts, the odds of errors in the chapter URLs are vastly higher.
## You can set continue_on_chapter_error:true to continue on after
## failing to download a chapter and instead record an error message
## in the ebook for that chapter.
continue_on_chapter_error:false
## Each output format has a section that overrides [defaults]
[html]
@ -1347,6 +1354,9 @@ extracategories:My Little Pony: Friendship is Magic
## Site dedicated to these categories/characters/ships
extracategories:The Pretender
[questionablequesting.com]
## see [base_xenforoforum]
[samandjack.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In

View file

@ -17,6 +17,7 @@
import time
import logging
import traceback
logger = logging.getLogger(__name__)
import re
import urllib2
@ -38,11 +39,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
# get storyId from url--url validation guarantees query is only sid=1234
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
# get storyId from url--url validation guarantees query correct
m = re.match(self.getSiteURLPattern(),url)
if m:
@ -54,23 +55,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
self.getSiteExampleURLs())
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','fsb')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%b %d, %Y at %I:%M %p"
@classmethod
def getConfigSections(cls):
"Only needs to be overridden if the adapter has additional ini sections."
# The shared 'base_xenforoforum' section is listed first, followed by
# the section name returned by this adapter's own getConfigSection().
return ['base_xenforoforum',cls.getConfigSection()]
@classmethod
def getURLPrefix(cls):
# The site domain. Does have www here, if it uses it.
return 'https://' + cls.getSiteDomain()
return 'https://' + cls.getSiteDomain()
@classmethod
def getSiteExampleURLs(cls):
@ -78,7 +79,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
# Regex for URLs on this site: http or https, the (escaped) site
# domain, then /threads/ or /posts/ (named group 'tp'), an optional
# prefix ending in '.', and a numeric id (named group 'id'), with an
# optional trailing slash.
return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
@ -112,7 +113,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
h1 = soup.find('div',{'class':'titleBar'}).h1
self.story.setMetadata('title',stripHTML(h1))
if '#' in useurl:
anchorid = useurl.split('#')[1]
soup = soup.find('li',id=anchorid)
@ -129,7 +130,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
self.story.setMetadata('datePublished', date)
if not self.story.getMetadataRaw('dateUpdated') or date > self.story.getMetadataRaw('dateUpdated'):
self.story.setMetadata('dateUpdated', date)
self.chapterUrls.append((name,self.getURLPrefix()+'/'+url))
## only use tags if threadmarks for chapters.
@ -138,10 +139,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
self.story.addToList('forumtags',stripHTML(tag))
soup = soup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
# Now go hunting for the 'chapter list'.
bq = soup.find('blockquote') # assume first posting contains TOC urls.
bq.name='div'
for iframe in bq.find_all('iframe'):
@ -149,7 +150,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
self.setDescription(useurl,bq)
# otherwise, use first post links--include first post since
@ -160,31 +161,34 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
logger.debug("found chapurl:%s"%url)
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url
if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and ('/posts/' in url or '/threads/' in url):
# brute force way to deal with SB's http->https change when hardcoded http urls.
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())
url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
logger.debug("used chapurl:%s"%(url))
logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
self.chapterUrls.append((name,url))
if url == useurl and 'First Post' == self.chapterUrls[0][0]:
# remove "First Post" if included in list.
logger.debug("delete dup 'First Post' chapter: %s %s"%self.chapterUrls[0])
del self.chapterUrls[0]
# Didn't use threadmarks, so take created/updated dates
# from the 'first' posting created and updated.
date = self.make_date(soup.find('a',{'class':'datePermalink'}))
if date:
self.story.setMetadata('datePublished', date)
self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
date = self.make_date(soup.find('div',{'class':'editDate'}))
if date:
self.story.setMetadata('dateUpdated', date)
self.story.setMetadata('dateUpdated', date)
self.story.setMetadata('numChapters',len(self.chapterUrls))
def make_date(self,parenttag): # forums use a BS thing where dates
@ -205,7 +209,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
except:
logger.debug('No date found in %s'%parenttag)
return None
# grab the text for an individual chapter.
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
@ -218,28 +222,48 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'
origurl = url
(data,opened) = self._fetchUrlOpened(url)
url = opened.geturl()
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)
soup = self.make_soup(data)
if '#' in url:
anchorid = url.split('#')[1]
soup = soup.find('li',id=anchorid)
bq = soup.find('blockquote')
bq.name='div'
for iframe in bq.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
## Same as above except for the case where the author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'
try:
origurl = url
(data,opened) = self._fetchUrlOpened(url)
url = opened.geturl()
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)
soup = self.make_soup(data)
if '#' in url:
anchorid = url.split('#')[1]
soup = soup.find('li',id=anchorid)
bq = soup.find('blockquote')
bq.name='div'
for iframe in bq.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
except Exception as e:
if self.getConfig('continue_on_chapter_error'):
bq = self.make_soup("""<div>
<p><b>Error</b></p>
<p>FanFicFare failed to download this chapter. Because you have
<b>continue_on_chapter_error</b> set to <b>true</b> in your personal.ini, the download continued.</p>
<p>Chapter URL:<br>%s</p>
<p>Error:<br><pre>%s</pre></p>
</div>"""%(url,traceback.format_exc()))
else:
raise
return self.utf8FromSoup(url,bq)

View file

@ -82,7 +82,7 @@ def get_valid_sections():
sitesections = list(othersections)
for section in sites:
sitesections.append(section)
# also allows [www.base_efiction] and [www.base_forum]. Not
# also allows [www.base_efiction] and [www.base_xenforoforum]. Not
# likely to matter.
if section.startswith('www.'):
# add w/o www if has www
@ -166,6 +166,12 @@ def get_valid_set_options():
'include_images':(None,['epub','html'],boollist),
'grayscale_images':(None,['epub','html'],boollist),
'no_image_processing':(None,['epub','html'],boollist),
'continue_on_chapter_error':(['base_xenforoforum',
'forums.spacebattles.com',
'forums.sufficientvelocity.com',
'questionablequesting.com',
],None,boollist),
}
return dict(valdict)

View file

@ -405,7 +405,7 @@ add_to_replace_metadata:
title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
# remove 'Thread' (optionally preceded by 'Story') and the next
# word, usually "Thread 2", "Thread four", "Thread iv", etc
title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
@ -424,6 +424,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
## the description.
description_limit:500
## Because base_xenforoforum adapters can pull chapter URLs from human
## posts, the odds of errors in the chapter URLs are vastly higher.
## You can set continue_on_chapter_error:true to continue on after
## failing to download a chapter and instead record an error message
## in the ebook for that chapter.
continue_on_chapter_error:false
## Each output format has a section that overrides [defaults]
[html]
@ -1335,6 +1342,9 @@ extracategories:My Little Pony: Friendship is Magic
## Site dedicated to these categories/characters/ships
extracategories:The Pretender
[questionablequesting.com]
## see [base_xenforoforum]
[samandjack.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In