mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-06 03:20:24 +02:00
Add to base_xenforoforum: fix for author using reply URL, continue_on_chapter_error feature, remove 'Story' in front of 'Thread' in title.
This commit is contained in:
parent
e2d1a693dd
commit
5040c44572
4 changed files with 96 additions and 46 deletions
|
|
@ -406,7 +406,7 @@ add_to_replace_metadata:
|
|||
title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
|
||||
# remove 'Thread' (or 'Story Thread') and the next word, usually "Thread 2", "Thread
|
||||
# four", "Thread iv", etc
|
||||
title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
|
||||
title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
|
||||
|
||||
add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
|
||||
|
||||
|
|
@ -425,6 +425,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
|
|||
## the description.
|
||||
description_limit:500
|
||||
|
||||
## Because base_xenforoforum adapters can pull chapter URLs from human
|
||||
## posts, the odds of errors in the chapter URLs are vastly higher.
|
||||
## You can set continue_on_chapter_error:true to continue on after
|
||||
## failing to download a chapter and instead record an error message
|
||||
## in the ebook for that chapter.
|
||||
continue_on_chapter_error:false
|
||||
|
||||
## Each output format has a section that overrides [defaults]
|
||||
[html]
|
||||
|
||||
|
|
@ -1347,6 +1354,9 @@ extracategories:My Little Pony: Friendship is Magic
|
|||
## Site dedicated to these categories/characters/ships
|
||||
extracategories:The Pretender
|
||||
|
||||
[questionablequesting.com]
|
||||
## see [base_xenforoforum]
|
||||
|
||||
[samandjack.net]
|
||||
## Some sites require login (or login for some rated stories). The
|
||||
## program can prompt you, or you can save it in config. In
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
import time
|
||||
import logging
|
||||
import traceback
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
import urllib2
|
||||
|
|
@ -38,11 +39,11 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
# Most sites that claim to be
|
||||
# iso-8859-1 (and some that claim to be
|
||||
# utf8) are really windows-1252.
|
||||
|
||||
|
||||
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
||||
|
||||
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(),url)
|
||||
if m:
|
||||
|
|
@ -54,23 +55,23 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
raise exceptions.InvalidStoryURL(url,
|
||||
self.getSiteDomain(),
|
||||
self.getSiteExampleURLs())
|
||||
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev','fsb')
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%b %d, %Y at %I:%M %p"
|
||||
|
||||
|
||||
@classmethod
|
||||
def getConfigSections(cls):
|
||||
"Only needs to be overridden if it has additional ini sections."
|
||||
return ['base_xenforoforum',cls.getConfigSection()]
|
||||
|
||||
|
||||
@classmethod
|
||||
def getURLPrefix(cls):
|
||||
# The site domain. Does have www here, if it uses it.
|
||||
return 'https://' + cls.getSiteDomain()
|
||||
return 'https://' + cls.getSiteDomain()
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
|
|
@ -78,7 +79,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
|
||||
def getSiteURLPattern(self):
|
||||
return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?"
|
||||
|
||||
|
||||
def use_pagecache(self):
|
||||
'''
|
||||
adapters that will work with the page cache need to implement
|
||||
|
|
@ -112,7 +113,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
|
||||
h1 = soup.find('div',{'class':'titleBar'}).h1
|
||||
self.story.setMetadata('title',stripHTML(h1))
|
||||
|
||||
|
||||
if '#' in useurl:
|
||||
anchorid = useurl.split('#')[1]
|
||||
soup = soup.find('li',id=anchorid)
|
||||
|
|
@ -129,7 +130,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
self.story.setMetadata('datePublished', date)
|
||||
if not self.story.getMetadataRaw('dateUpdated') or date > self.story.getMetadataRaw('dateUpdated'):
|
||||
self.story.setMetadata('dateUpdated', date)
|
||||
|
||||
|
||||
self.chapterUrls.append((name,self.getURLPrefix()+'/'+url))
|
||||
|
||||
## only use tags if threadmarks for chapters.
|
||||
|
|
@ -138,10 +139,10 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
self.story.addToList('forumtags',stripHTML(tag))
|
||||
|
||||
soup = soup.find('li',{'class':'message'}) # limit first post for date stuff below. ('#' posts above)
|
||||
|
||||
|
||||
# Now go hunting for the 'chapter list'.
|
||||
bq = soup.find('blockquote') # assume first posting contains TOC urls.
|
||||
|
||||
|
||||
bq.name='div'
|
||||
|
||||
for iframe in bq.find_all('iframe'):
|
||||
|
|
@ -149,7 +150,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
|
||||
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
|
||||
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
|
||||
|
||||
|
||||
self.setDescription(useurl,bq)
|
||||
|
||||
# otherwise, use first post links--include first post since
|
||||
|
|
@ -160,31 +161,34 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
logger.debug("found chapurl:%s"%url)
|
||||
if not url.startswith('http'):
|
||||
url = self.getURLPrefix()+'/'+url
|
||||
|
||||
|
||||
if ( url.startswith(self.getURLPrefix()) or
|
||||
url.startswith('http://'+self.getSiteDomain()) or
|
||||
url.startswith('https://'+self.getSiteDomain()) ) and ('/posts/' in url or '/threads/' in url):
|
||||
|
||||
# brute force way to deal with SB's http->https change when http urls are hardcoded.
|
||||
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())
|
||||
|
||||
url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
|
||||
logger.debug("used chapurl:%s"%(url))
|
||||
|
||||
logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
|
||||
self.chapterUrls.append((name,url))
|
||||
if url == useurl and 'First Post' == self.chapterUrls[0][0]:
|
||||
# remove "First Post" if included in list.
|
||||
logger.debug("delete dup 'First Post' chapter: %s %s"%self.chapterUrls[0])
|
||||
del self.chapterUrls[0]
|
||||
|
||||
|
||||
# Didn't use threadmarks, so take created/updated dates
|
||||
# from the 'first' posting created and updated.
|
||||
date = self.make_date(soup.find('a',{'class':'datePermalink'}))
|
||||
if date:
|
||||
self.story.setMetadata('datePublished', date)
|
||||
self.story.setMetadata('dateUpdated', date) # updated overwritten below if found.
|
||||
|
||||
|
||||
date = self.make_date(soup.find('div',{'class':'editDate'}))
|
||||
if date:
|
||||
self.story.setMetadata('dateUpdated', date)
|
||||
|
||||
self.story.setMetadata('dateUpdated', date)
|
||||
|
||||
self.story.setMetadata('numChapters',len(self.chapterUrls))
|
||||
|
||||
def make_date(self,parenttag): # forums use a BS thing where dates
|
||||
|
|
@ -205,7 +209,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
except:
|
||||
logger.debug('No date found in %s'%parenttag)
|
||||
return None
|
||||
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
|
@ -218,28 +222,48 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
# https://forums.sufficientvelocity.com/posts/39915/
|
||||
if '#post-' in url:
|
||||
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'
|
||||
|
||||
origurl = url
|
||||
(data,opened) = self._fetchUrlOpened(url)
|
||||
url = opened.geturl()
|
||||
if '#' in origurl and '#' not in url:
|
||||
url = url + origurl[origurl.index('#'):]
|
||||
logger.debug("chapter URL redirected to: %s"%url)
|
||||
|
||||
soup = self.make_soup(data)
|
||||
|
||||
if '#' in url:
|
||||
anchorid = url.split('#')[1]
|
||||
soup = soup.find('li',id=anchorid)
|
||||
|
||||
bq = soup.find('blockquote')
|
||||
|
||||
bq.name='div'
|
||||
|
||||
for iframe in bq.find_all('iframe'):
|
||||
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
|
||||
|
||||
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
|
||||
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
|
||||
## Same as above except for the case where the author mistakenly
|
||||
## used the reply link instead of normal link to post.
|
||||
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
|
||||
# https://forums.spacebattles.com/posts/
|
||||
if 'reply?quote=' in url:
|
||||
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'
|
||||
|
||||
try:
|
||||
origurl = url
|
||||
(data,opened) = self._fetchUrlOpened(url)
|
||||
url = opened.geturl()
|
||||
if '#' in origurl and '#' not in url:
|
||||
url = url + origurl[origurl.index('#'):]
|
||||
logger.debug("chapter URL redirected to: %s"%url)
|
||||
|
||||
soup = self.make_soup(data)
|
||||
|
||||
if '#' in url:
|
||||
anchorid = url.split('#')[1]
|
||||
soup = soup.find('li',id=anchorid)
|
||||
|
||||
bq = soup.find('blockquote')
|
||||
|
||||
bq.name='div'
|
||||
|
||||
for iframe in bq.find_all('iframe'):
|
||||
iframe.extract() # calibre book reader & editor don't like iframes to youtube.
|
||||
|
||||
for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
|
||||
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>
|
||||
|
||||
except Exception as e:
|
||||
if self.getConfig('continue_on_chapter_error'):
|
||||
bq = self.make_soup("""<div>
|
||||
<p><b>Error</b></p>
|
||||
<p>FanFicFare failed to download this chapter. Because you have
|
||||
<b>continue_on_chapter_error</b> set to <b>true</b> in your personal.ini, the download continued.</p>
|
||||
<p>Chapter URL:<br>%s</p>
|
||||
<p>Error:<br><pre>%s</pre></p>
|
||||
</div>"""%(url,traceback.format_exc()))
|
||||
else:
|
||||
raise
|
||||
|
||||
return self.utf8FromSoup(url,bq)
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ def get_valid_sections():
|
|||
sitesections = list(othersections)
|
||||
for section in sites:
|
||||
sitesections.append(section)
|
||||
# also allows [www.base_efiction] and [www.base_forum]. Not
|
||||
# also allows [www.base_efiction] and [www.base_xenforoforum]. Not
|
||||
# likely to matter.
|
||||
if section.startswith('www.'):
|
||||
# add w/o www if has www
|
||||
|
|
@ -166,6 +166,12 @@ def get_valid_set_options():
|
|||
'include_images':(None,['epub','html'],boollist),
|
||||
'grayscale_images':(None,['epub','html'],boollist),
|
||||
'no_image_processing':(None,['epub','html'],boollist),
|
||||
|
||||
'continue_on_chapter_error':(['base_xenforoforum',
|
||||
'forums.spacebattles.com',
|
||||
'forums.sufficientvelocity.com',
|
||||
'questionablequesting.com',
|
||||
],None,boollist),
|
||||
}
|
||||
|
||||
return dict(valdict)
|
||||
|
|
|
|||
|
|
@ -405,7 +405,7 @@ add_to_replace_metadata:
|
|||
title=>[-: ]*[\(\[]([^\]\)]+)[\)\]][-: ]*=>
|
||||
# remove 'Thread' (or 'Story Thread') and the next word, usually "Thread 2", "Thread
|
||||
# four", "Thread iv", etc
|
||||
title,tagsfromtitle=>[-: ]*[Tt]hread [^ ]+[-: ]*=>
|
||||
title,tagsfromtitle=>[-: ]*(Story *)?[Tt]hread [^ ]+[-: ]*=>
|
||||
|
||||
add_to_extra_titlepage_entries:,tagsfromtitle,forumtags
|
||||
|
||||
|
|
@ -424,6 +424,13 @@ dateUpdated_format:%%Y-%%m-%%d %%H:%%M:%%S
|
|||
## the description.
|
||||
description_limit:500
|
||||
|
||||
## Because base_xenforoforum adapters can pull chapter URLs from human
|
||||
## posts, the odds of errors in the chapter URLs are vastly higher.
|
||||
## You can set continue_on_chapter_error:true to continue on after
|
||||
## failing to download a chapter and instead record an error message
|
||||
## in the ebook for that chapter.
|
||||
continue_on_chapter_error:false
|
||||
|
||||
## Each output format has a section that overrides [defaults]
|
||||
[html]
|
||||
|
||||
|
|
@ -1335,6 +1342,9 @@ extracategories:My Little Pony: Friendship is Magic
|
|||
## Site dedicated to these categories/characters/ships
|
||||
extracategories:The Pretender
|
||||
|
||||
[questionablequesting.com]
|
||||
## see [base_xenforoforum]
|
||||
|
||||
[samandjack.net]
|
||||
## Some sites require login (or login for some rated stories). The
|
||||
## program can prompt you, or you can save it in config. In
|
||||
|
|
|
|||
Loading…
Reference in a new issue