Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2026-04-27 01:11:21 +02:00.

Adding normalize_chapterurl() for xenforoforum and normalize_text_links option.
Parent: c9205dd6bc
Commit: a40383bada
5 changed files with 166 additions and 109 deletions

@@ -720,6 +720,11 @@ remove_transparency: true
## true--replace_br_with_p also fixes the problem.
nook_img_fix:true

## Apply adapter's normalize_chapterurl() to all links in chapter
## texts, if they match chapter URLs. Currently only implemented by
## base_xenforoforum adapters.
#normalize_text_links:false

[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true

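For illustration only (not part of the commit): a user could try the new option by adding it to an ini section of their own, e.g. in personal.ini. A minimal sketch, assuming the [defaults] section is an acceptable place for it (the option appears to be registered for all sites and the epub/html outputs in the get_valid_set_options() change further down):

[defaults]
## rewrite chapter-text links through the adapter's normalize_chapterurl()
normalize_text_links:true
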
@@ -84,7 +84,7 @@ class BaseSiteAdapter(Configurable):

def __init__(self, configuration, url):
Configurable.__init__(self, configuration)

self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False

@@ -113,7 +113,7 @@ class BaseSiteAdapter(Configurable):
self.logfile = None

self.pagecache = self.get_empty_pagecache()

## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of

@@ -135,17 +135,17 @@ class BaseSiteAdapter(Configurable):
saveheaders = self.opener.addheaders
self.opener = u2.build_opener(u2.HTTPCookieProcessor(self.cookiejar),GZipProcessor())
self.opener.addheaders = saveheaders

def load_cookiejar(self,filename):
'''
Needs to be called after adapter create, but before any fetchs
are done. Takes file *name*.
'''
self.get_cookiejar().load(filename, ignore_discard=True, ignore_expires=True)

def get_pagecache(self):
return self.pagecache

def set_pagecache(self,d):
self.pagecache=d

@@ -159,7 +159,7 @@ class BaseSiteAdapter(Configurable):

def _has_cachekey(self,cachekey):
return self.use_pagecache() and cachekey in self.get_pagecache()

def _get_from_pagecache(self,cachekey):
if self.use_pagecache():
return self.get_pagecache().get(cachekey)

@@ -176,18 +176,18 @@ class BaseSiteAdapter(Configurable):
this and change it to True.
'''
return False

# def story_load(self,filename):
# d = pickle.load(self.story.metadata,filename)
# self.story.metadata = d['metadata']
# self.chapterUrls = d['chapterlist']
# self.story.metadataDone = True

def _setURL(self,url):
self.url = url
self.parsedUrl = up.urlparse(url)
self.host = self.parsedUrl.netloc
self.path = self.parsedUrl.path
self.path = self.parsedUrl.path
self.story.setMetadata('storyUrl',self.url,condremoveentities=False)

## website encoding(s)--in theory, each website reports the character

@@ -201,7 +201,7 @@ class BaseSiteAdapter(Configurable):
decode = self.getConfigList('website_encodings')
else:
decode = self.decode

for code in decode:
try:
#print code

@@ -230,7 +230,7 @@ class BaseSiteAdapter(Configurable):
usecache=True):
'''
When should cache be cleared or not used? logins...

extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.

@@ -240,7 +240,7 @@ class BaseSiteAdapter(Configurable):
logger.debug("#####################################\npagecache HIT: %s"%safe_url(cachekey))
data,redirecturl = self._get_from_pagecache(cachekey)
return data

logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)

@@ -261,19 +261,19 @@ class BaseSiteAdapter(Configurable):
parameters=None,
extrasleep=None,
usecache=True):

return self._fetchUrlRawOpened(url,
parameters,
extrasleep,
usecache)[0]

def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True):
'''
When should cache be cleared or not used? logins...

extrasleep is primarily for ffnet adapter which has extra
sleeps. Passed into fetchs so it can be bypassed when
cache hits.

@@ -289,7 +289,7 @@ class BaseSiteAdapter(Configurable):
def geturl(self): return self.url
def read(self): return self.data
return (data,FakeOpened(data,redirecturl))

logger.debug("#####################################\npagecache MISS: %s"%safe_url(cachekey))
self.do_sleep(extrasleep)
if parameters != None:

@@ -298,13 +298,13 @@ class BaseSiteAdapter(Configurable):
opened = self.opener.open(url.replace(' ','%20'),None,float(self.getConfig('connect_timeout',30.0)))
data = opened.read()
self._set_to_pagecache(cachekey,data,opened.url)

return (data,opened)

def set_sleep(self,val):
logger.debug("\n===========\n set sleep time %s\n==========="%val)
self.override_sleep = val

def do_sleep(self,extrasleep=None):
if extrasleep:
time.sleep(float(extrasleep))

@@ -312,7 +312,7 @@ class BaseSiteAdapter(Configurable):
time.sleep(float(self.override_sleep))
elif self.getConfig('slow_down_sleep_time'):
time.sleep(float(self.getConfig('slow_down_sleep_time')))

def _fetchUrl(self, url,
parameters=None,
usecache=True,

@@ -330,7 +330,7 @@ class BaseSiteAdapter(Configurable):

excpt=None
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
time.sleep(sleeptime)
try:
(data,opened)=self._fetchUrlRawOpened(url,
parameters=parameters,

@@ -345,7 +345,7 @@ class BaseSiteAdapter(Configurable):
except Exception, e:
excpt=e
logger.warn("Caught an exception reading URL: %s sleeptime(%s) Exception %s."%(unicode(safe_url(url)),sleeptime,unicode(e)))

logger.error("Giving up on %s" %safe_url(url))
logger.debug(excpt, exc_info=True)
raise(excpt)

@@ -357,12 +357,16 @@ class BaseSiteAdapter(Configurable):
if last:
self.chapterLast=int(last)-1
self.story.set_chapters_range(first,last)

# Does the download the first time it's called.
def getStory(self):
if not self.storyDone:
self.getStoryMetadataOnly(get_cover=True)

## one-off step to normalize old chapter URLs if present.
if self.oldchaptersmap:
self.oldchaptersmap = dict((self.normalize_chapterurl(key), value) for (key, value) in self.oldchaptersmap.items())

for index, (title,url) in enumerate(self.chapterUrls):
newchap = False
if (self.chapterFirst!=None and index < self.chapterFirst) or \

@@ -388,7 +392,7 @@ class BaseSiteAdapter(Configurable):
url in self.oldchaptersdata and (
self.oldchaptersdata[url]['chapterorigtitle'] !=
self.oldchaptersdata[url]['chaptertitle']) )

if not data:
data = self.getChapterText(url)
# if had to fetch and has existing chapters

@@ -400,13 +404,13 @@ class BaseSiteAdapter(Configurable):
# anyway--only if it's replaced during an
# update.
newchap = False

self.story.addChapter(url,
removeEntities(title),
removeEntities(data),
newchap)
self.storyDone = True

# include image, but no cover from story, add default_cover_image cover.
if self.getConfig('include_images') and \
not self.story.cover and \

@@ -423,26 +427,30 @@ class BaseSiteAdapter(Configurable):
if not self.story.cover and self.oldcover:
self.story.oldcover = self.oldcover
self.story.setMetadata('cover_image','old')

# cheesy way to carry calibre bookmark file forward across update.
if self.calibrebookmark:
self.story.calibrebookmark = self.calibrebookmark
if self.logfile:
self.story.logfile = self.logfile

return self.story

def getStoryMetadataOnly(self,get_cover=True):
if not self.metadataDone:
self.doExtractChapterUrlsAndMetadata(get_cover=get_cover)

if not self.story.getMetadataRaw('dateUpdated'):
if self.story.getMetadataRaw('datePublished'):
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))
else:
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

self.metadataDone = True
# normalize chapter urls.
for index, (title,url) in enumerate(self.chapterUrls):
self.chapterUrls[index] = (title,self.normalize_chapterurl(url))

return self.story

def setStoryMetadata(self,metahtml):

@@ -453,36 +461,36 @@ class BaseSiteAdapter(Configurable):
if self.story.getMetadataRaw('datePublished'):
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))
else:
self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('dateCreated'))

def hookForUpdates(self,chaptercount):
"Usually not needed."
return chaptercount

###############################

@staticmethod
def getSiteDomain():
"Needs to be overriden in each adapter class."
return 'no such domain'

@classmethod
def getConfigSection(cls):
"Only needs to be overriden if != site domain."
return cls.getSiteDomain()

@classmethod
def getConfigSections(cls):
"Only needs to be overriden if has additional ini sections."
return [cls.getConfigSection()]

@classmethod
def stripURLParameters(cls,url):
"Only needs to be overriden if URL contains more than one parameter"
## remove any trailing '&' parameters--?sid=999 will be left.
## that's all that any of the current adapters need or want.
return re.sub(r"&.*$","",url)

## URL pattern validation is done *after* picking an adaptor based
## on domain instead of *as* the adaptor selector so we can offer
## the user example(s) for that particular site.

@@ -490,7 +498,7 @@ class BaseSiteAdapter(Configurable):
def getSiteURLPattern(self):
"Used to validate URL. Should be override in each adapter class."
return '^http://'+re.escape(self.getSiteDomain())

@classmethod
def getSiteExampleURLs(cls):
"""

@@ -500,7 +508,7 @@ class BaseSiteAdapter(Configurable):
validateURL method.
"""
return 'no such example'

def doExtractChapterUrlsAndMetadata(self,get_cover=True):
'''
There are a handful of adapters that fetch a cover image while

@@ -509,7 +517,7 @@ class BaseSiteAdapter(Configurable):
this instead of extractChapterUrlsAndMetadata()
'''
return self.extractChapterUrlsAndMetadata()

def extractChapterUrlsAndMetadata(self):
"Needs to be overriden in each adapter class. Populates self.story metadata and self.chapterUrls"
pass

@@ -561,7 +569,7 @@ class BaseSiteAdapter(Configurable):
# bs4
return soup.attrs.keys()
return []

# This gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
# Now also does a bunch of other common processing for us.

@@ -570,12 +578,12 @@ class BaseSiteAdapter(Configurable):
fetch=self._fetchUrlRaw

acceptable_attributes = self.getConfigList('keep_html_attrs',['href','name','class','id'])

if self.getConfig("keep_style_attr"):
acceptable_attributes.append('style')
if self.getConfig("keep_title_attr"):
acceptable_attributes.append('title')

#print("include_images:"+self.getConfig('include_images'))
if self.getConfig('include_images'):
acceptable_attributes.extend(('src','alt','longdesc'))

@@ -592,6 +600,19 @@ class BaseSiteAdapter(Configurable):
if attr not in acceptable_attributes:
del soup[attr] ## strip all tag attributes except href and name

## apply adapter's normalize_chapterurls to all links in
## chapter texts, if they match chapter URLs. While this will
## be occasionally helpful by itself, it's really for the next
## feature: internal text links.
if self.getConfig('normalize_text_links'):
for alink in soup.find_all('a'):
# try:
if alink.has_attr('href'):
logger.debug("normalize_text_links %s -> %s"%(alink['href'],self.normalize_chapterurl(alink['href'])))
alink['href'] = self.normalize_chapterurl(alink['href'])
# except AttributeError as ae:
# logger.info("Parsing for normalize_text_links failed...")

try:
# as a generator, each tag will be returned even if there's a
# mismatch at the end.

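A standalone sketch (not part of the commit) of what the normalize_text_links block above does to chapter HTML. The adapter object is not available here, so a hypothetical normalize_chapterurl() stands in for the real adapter hook; the forum domain and thread URL are made up:

# sketch: rewrite <a href> values through a normalizer, as utf8FromSoup() now does
from bs4 import BeautifulSoup

def normalize_chapterurl(url):
    # hypothetical stand-in for an adapter override: turn a page#post-NNN
    # link into a /posts/NNN/ perma-link, leave everything else untouched
    if '#post-' in url:
        return 'https://forums.example.com/posts/' + url.split('#post-')[1] + '/'
    return url

html = '<div>Back to <a href="https://forums.example.com/threads/story.330/page-4#post-39915">chapter 2</a></div>'
soup = BeautifulSoup(html, 'html5lib')

# same shape as the loop added above
for alink in soup.find_all('a'):
    if alink.has_attr('href'):
        alink['href'] = normalize_chapterurl(alink['href'])

print(soup.a['href'])  # https://forums.example.com/posts/39915/
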
@@ -599,8 +620,8 @@ class BaseSiteAdapter(Configurable):
for attr in self.get_attr_keys(t):
if attr not in acceptable_attributes:
del t[attr] ## strip all tag attributes except acceptable_attributes

# these are not acceptable strict XHTML. But we do already have

# these are not acceptable strict XHTML. But we do already have
# CSS classes of the same names defined
if t and hasattr(t,'name') and t.name is not None:
if t.name in self.getConfigList('replace_tags_with_spans',['u']):

@@ -616,11 +637,11 @@ class BaseSiteAdapter(Configurable):
# remove script tags cross the board.
if t.name=='script':
t.extract()

except AttributeError, ae:
if "%s"%ae != "'NoneType' object has no attribute 'next_element'":
logger.error("Error parsing HTML, probably poor input HTML. %s"%ae)

retval = unicode(soup)

if self.getConfig('nook_img_fix') and not self.getConfig('replace_br_with_p'):

@@ -629,16 +650,16 @@ class BaseSiteAdapter(Configurable):
# that under the text for the rest of the chapter.
retval = re.sub(r"(?!<(div|p)>)\s*(?P<imgtag><img[^>]+>)\s*(?!</(div|p)>)",
"<div>\g<imgtag></div>",retval)

# Don't want html, head or body tags in chapter html--writers add them.
# This is primarily for epub updates.
retval = re.sub(r"</?(html|head|body)[^>]*>\r?\n?","",retval)

if self.getConfig("replace_br_with_p") and allow_replace_br_with_p:
# Apply heuristic processing to replace <br> paragraph
# breaks with <p> tags.
retval = replace_br_with_p(retval)

if self.getConfig('replace_hr'):
# replacing a self-closing tag with a container tag in the
# soup is more difficult than it first appears. So cheat.

@@ -648,31 +669,35 @@ class BaseSiteAdapter(Configurable):

def make_soup(self,data):
'''
Convenience method for getting a bs4 soup. Older and
non-updated adapters call the included bs3 library themselves.
Convenience method for getting a bs4 soup. bs3 has been removed.
'''

## html5lib handles <noscript> oddly. See:
## https://bugs.launchpad.net/beautifulsoup/+bug/1277464
## This should 'hide' and restore <noscript> tags.
data = data.replace("noscript>","fff_hide_noscript>")

## soup and re-soup because BS4/html5lib is more forgiving of
## incorrectly nested tags that way.
soup = bs4.BeautifulSoup(data,'html5lib')
soup = bs4.BeautifulSoup(unicode(soup),'html5lib')

for ns in soup.find_all('fff_hide_noscript'):
ns.name = 'noscript'

return soup

## For adapters, especially base_xenforoforum to override. Make
## sure to return unchanged URL if it's NOT a chapter URL...
def normalize_chapterurl(self,url):
return url

def cachedfetch(realfetch,cache,url):
if url in cache:
return cache[url]
else:
return realfetch(url)

fullmon = {u"January":u"01", u"February":u"02", u"March":u"03", u"April":u"04", u"May":u"05",
u"June":u"06","July":u"07", u"August":u"08", u"September":u"09", u"October":u"10",
u"November":u"11", u"December":u"12" }

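A small, self-contained illustration (not part of the commit) of the noscript hide-and-restore trick used by make_soup() above; the sample HTML is made up, and str() is used here so the sketch runs on either Python version (the real make_soup() uses unicode(), since the codebase is Python 2):

# sketch: hide <noscript> before html5lib parsing, then restore it
import bs4

data = '<p>text</p><noscript><img src="fallback.png"/></noscript>'
data = data.replace("noscript>", "fff_hide_noscript>")  # hide, as make_soup() does

# soup and re-soup, as make_soup() does, to smooth out badly nested tags
soup = bs4.BeautifulSoup(data, 'html5lib')
soup = bs4.BeautifulSoup(str(soup), 'html5lib')

for ns in soup.find_all('fff_hide_noscript'):
    ns.name = 'noscript'  # restore the real tag name

print(soup.find('noscript'))  # the <noscript> block survives the parse
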
@@ -687,7 +712,7 @@ def makeDate(string,dateform):
# lie. It has to do something even more complicated to get
# Russian month names correct everywhere.
do_abbrev = "%b" in dateform

if u"%B" in dateform or do_abbrev:
dateform = dateform.replace(u"%B",u"%m").replace(u"%b",u"%m")
for (name,num) in fullmon.items():

@@ -708,10 +733,10 @@ def makeDate(string,dateform):
string = string.replace(u"AM",u"").replace(u"PM",u"").replace(u"am",u"").replace(u"pm",u"")

date = datetime.strptime(string.encode('utf-8'),dateform.encode('utf-8'))

if add_hours:
date += timedelta(hours=12)

return date

# .? for AO3's ']' in param names.

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2015 FanFicFare team
# Copyright 2016 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

@@ -85,6 +85,62 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain())+r"/(?P<tp>threads|posts)/(.+\.)?(?P<id>\d+)/?[^#]*?(#post-(?P<anchorpost>\d+))?$"

## For adapters, especially base_xenforoforum to override. Make
## sure to return unchanged URL if it's NOT a chapter URL. This
## is most helpful for xenforoforum because threadmarks use
## thread-name URLs--which can change if the thread name changes.
def normalize_chapterurl(self,url):
(is_chapter_url,normalized_url) = self._is_normalize_chapterurl(url)
if is_chapter_url:
return normalized_url
else:
return url

## returns (is_chapter_url,normalized_url)
def _is_normalize_chapterurl(self,url):
is_chapter_url = False

## moved from extract metadata to share with normalize_chapterurl.
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url

if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and \
( '/posts/' in url or '/threads/' in url or 'showpost.php' in url or 'goto/post' in url):
# brute force way to deal with SB's http->https change when hardcoded http urls.
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())

# http://forums.spacebattles.com/showpost.php?p=4755532&postcount=9
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',r'/posts/\1/',url)

# http://forums.spacebattles.com/goto/post?id=15222406#post-15222406
url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?',r'/posts/\1/',url)

url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
url = re.sub(r'like$','',url) # strip 'like' if incorrect 'like' link instead of proper post URL.

#### moved from getChapterText()
## there's some history of stories with links to the wrong
## page. This changes page#post URLs to perma-link URLs.
## Which will be redirected back to page#posts, but the
## *correct* ones.
# http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'

## Same as above except for for case where author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'

is_chapter_url = True
return (is_chapter_url,url)

def use_pagecache(self):
'''
adapters that will work with the page cache need to implement

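A worked example (not part of the commit) of the rewrites _is_normalize_chapterurl() performs, using the sample URLs already quoted in its comments. This is a simplified standalone rendering: the adapter's getURLPrefix()/getSiteDomain() handling is replaced by an explicit urlprefix argument, and only a subset of the rewriting steps is reproduced:

# sketch: the post-URL normalizations, outside the adapter class
import re

def normalize(url, urlprefix):
    # simplified stand-in for the adapter's getURLPrefix()-based handling
    if not url.startswith('http'):
        url = urlprefix + '/' + url
    # goto/post redirector links -> /posts/NNN/ perma-links
    url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?', r'/posts/\1/', url)
    # page#post-NNN links -> /posts/NNN/ perma-links
    if '#post-' in url:
        url = urlprefix + '/posts/' + url.split('#post-')[1] + '/'
    # reply?quote=NNN links mistakenly used instead of post links
    if 'reply?quote=' in url:
        url = urlprefix + '/posts/' + url.split('reply?quote=')[1] + '/'
    return url

print(normalize('http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915',
                'https://forums.sufficientvelocity.com'))
# -> https://forums.sufficientvelocity.com/posts/39915/

print(normalize('http://forums.spacebattles.com/goto/post?id=15222406#post-15222406',
                'https://forums.spacebattles.com'))
# -> http://forums.spacebattles.com/posts/15222406/

Non-chapter URLs (anything without /posts/, /threads/, showpost.php or goto/post in the real method's guard) are meant to pass through unchanged, which is why normalize_chapterurl() only returns the rewritten URL when is_chapter_url is set.
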
@@ -119,7 +175,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
# params[soup.find('input', {'id':'password'})['name']] = params['password']

d = self._fetchUrl(loginUrl, params)

if "Log Out" not in d :
logger.info("Failed to login to URL %s as %s" % (loginUrl,
params['login']))

@@ -183,7 +239,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
threadmark_chaps = True
if self.getConfig('always_include_first_post'):
self.chapterUrls.append((first_post_title,useurl))

for (atag,url,name) in [ (x,x['href'],stripHTML(x)) for x in markas ]:
date = self.make_date(atag.find_next_sibling('div',{'class':'extra'}))
if not self.story.getMetadataRaw('datePublished') or date < self.story.getMetadataRaw('datePublished'):

@@ -202,7 +258,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if self.getConfig('capitalize_forumtags'):
tstr = tstr.title()
self.story.addToList('forumtags',tstr)

# Now go hunting for the 'chapter list'.
bq = soup.find('blockquote') # assume first posting contains TOC urls.

@@ -222,28 +278,9 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if not self.chapterUrls:
self.chapterUrls.append((first_post_title,useurl))
for (url,name) in [ (x['href'],stripHTML(x)) for x in bq.find_all('a') ]:
#logger.debug("found chapurl:%s"%url)
if not url.startswith('http'):
url = self.getURLPrefix()+'/'+url

if ( url.startswith(self.getURLPrefix()) or
url.startswith('http://'+self.getSiteDomain()) or
url.startswith('https://'+self.getSiteDomain()) ) and \
( '/posts/' in url or '/threads/' in url or 'showpost.php' in url or 'goto/post' in url):

# brute force way to deal with SB's http->https change when hardcoded http urls.
url = url.replace('http://'+self.getSiteDomain(),self.getURLPrefix())

# http://forums.spacebattles.com/showpost.php?p=4755532&postcount=9
url = re.sub(r'showpost\.php\?p=([0-9]+)(&postcount=[0-9]+)?',r'/posts/\1/',url)

# http://forums.spacebattles.com/goto/post?id=15222406#post-15222406
url = re.sub(r'/goto/post\?id=([0-9]+)(#post-[0-9]+)?',r'/posts/\1/',url)

url = re.sub(r'(^[\'"]+|[\'"]+$)','',url) # strip leading or trailing '" from incorrect quoting.
url = re.sub(r'like$','',url) # strip 'like' if incorrect 'like' link instead of proper post URL.

logger.debug("(ch:%s)used chapurl:%s"%(len(self.chapterUrls)+1,url))
(is_chapter_url,url) = self._is_normalize_chapterurl(url)
if is_chapter_url:
self.chapterUrls.append((name,url))
if url == useurl and first_post_title == self.chapterUrls[0][0] \
and not self.getConfig('always_include_first_post',False):

@@ -286,22 +323,6 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)

## there's some history of stories with links to the wrong
## page. This changes page#post URLs to perma-link URLs.
## Which will be redirected back to page#posts, but the
## *correct* ones.
# http://forums.sufficientvelocity.com/threads/harry-potter-and-the-not-fatal-at-all-cultural-exchange-program.330/page-4#post-39915
# https://forums.sufficientvelocity.com/posts/39915/
if '#post-' in url:
url = self.getURLPrefix()+'/posts/'+url.split('#post-')[1]+'/'

## Same as above except for for case where author mistakenly
## used the reply link instead of normal link to post.
# "http://forums.spacebattles.com/threads/manager-worm-story-thread-iv.301602/reply?quote=15962513"
# https://forums.spacebattles.com/posts/
if 'reply?quote=' in url:
url = self.getURLPrefix()+'/posts/'+url.split('reply?quote=')[1]+'/'

try:
origurl = url
(data,opened) = self._fetchUrlOpened(url)

@@ -309,20 +330,20 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if '#' in origurl and '#' not in url:
url = url + origurl[origurl.index('#'):]
logger.debug("chapter URL redirected to: %s"%url)

soup = self.make_soup(data)

if '#' in url:
anchorid = url.split('#')[1]
soup = soup.find('li',id=anchorid)

bq = soup.find('blockquote')

bq.name='div'

for iframe in bq.find_all('iframe'):
iframe.extract() # calibre book reader & editor don't like iframes to youtube.

for qdiv in bq.find_all('div',{'class':'quoteExpand'}):
qdiv.extract() # Remove <div class="quoteExpand">click to expand</div>

@@ -330,7 +351,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
## include lazy load images.
for img in bq.find_all('img',{'class':'lazyload'}):
img['src'] = img['data-src']

except Exception as e:
if self.getConfig('continue_on_chapter_error'):
bq = self.make_soup("""<div>

@@ -184,6 +184,7 @@ def get_valid_set_options():
'include_images':(None,['epub','html'],boollist),
'grayscale_images':(None,['epub','html'],boollist),
'no_image_processing':(None,['epub','html'],boollist),
'normalize_text_links':(None,['epub','html'],boollist),

'capitalize_forumtags':(base_xenforo_list,None,boollist),
'continue_on_chapter_error':(base_xenforo_list,None,boollist),

@@ -361,7 +362,7 @@ def get_valid_keywords():
'minimum_threadmarks',
'first_post_title',
'always_include_first_post',
'',
'normalize_text_links',
])

# *known* entry keywords -- or rather regexps for them.

@@ -760,6 +760,11 @@ remove_transparency: true
## true--replace_br_with_p also fixes the problem.
nook_img_fix:true

## Apply adapter's normalize_chapterurl() to all links in chapter
## texts, if they match chapter URLs. Currently only implemented by
## base_xenforoforum adapters.
#normalize_text_links:false

[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true