Change all sites that will work with https to use it all the time.

This commit is contained in:
Jim Miller 2018-06-15 10:29:16 -05:00
parent 99ea6d5064
commit f1b3bc021e
36 changed files with 203 additions and 161 deletions

View file

@ -47,7 +47,7 @@ class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/story.php?no='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/story.php?no='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ash')
@ -67,10 +67,10 @@ class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://archive.skyehawke.com/story.php?no=1234 http://www.skyehawke.com/archive/story.php?no=1234 http://skyehawke.com/archive/story.php?no=1234"
return "https://archive.skyehawke.com/story.php?no=1234 https://www.skyehawke.com/archive/story.php?no=1234 https://skyehawke.com/archive/story.php?no=1234"
def getSiteURLPattern(self):
return re.escape("http://")+r"(archive|www)\.skyehawke\.com/(archive/)?story\.php\?no=\d+$"
return r"https?://(archive|www)\.skyehawke\.com/(archive/)?story\.php\?no=\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -100,7 +100,7 @@ class ArchiveSkyeHawkeComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
author = a.find('a')
self.story.setMetadata('authorId',author['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+author['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+author['href'])
self.story.setMetadata('author',author.string)
authorSoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl')))

View file

@ -35,7 +35,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
self.story.setMetadata('storyId',m.group('id'))
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/story/view/'+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/story/view/'+self.story.getMetadata('storyId'))
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
@ -55,7 +55,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/story/view/123456 http://"+cls.getSiteDomain()+"/story/view/123456/story-title-here http://"+cls.getSiteDomain()+"/story/view/123456/1"
return "https://"+cls.getSiteDomain()+"/story/view/123456 https://"+cls.getSiteDomain()+"/story/view/123456/story-title-here https://"+cls.getSiteDomain()+"/story/view/123456/1"
def getSiteURLPattern(self):
return r"https?://"+re.escape(self.getSiteDomain())+r"/story/view/0*(?P<id>\d+)"
@ -89,14 +89,14 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
if self.is_adult or self.getConfig("is_adult"):
contentFilter = check.find('a',{'href':'/account/mark_over_18'}) #two different types of adult checks
if contentFilter:
loginUrl = 'http://' + self.getSiteDomain() + '/account/mark_over_18'
loginUrl = 'https://' + self.getSiteDomain() + '/account/mark_over_18'
self._fetchUrl(loginUrl)
else:
params = {}
params['csrf_aff_token'] = check.find('input',{'name':'csrf_aff_token'})['value']
params['is_of_age'] = '1'
params['current_url'] = '/story/view/' + self.story.getMetadata('storyId')
loginUrl = 'http://' + self.getSiteDomain() + '/account/toggle_age'
loginUrl = 'https://' + self.getSiteDomain() + '/account/toggle_age'
self._postUrl(loginUrl,params)
data = self._fetchUrl(url,usecache=False)
@ -157,7 +157,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
alist = alist.parent.findAll('a', href=re.compile(r"/profile/view/\d+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('/')[-1])
self.story.addToList('authorUrl','http://'+self.host+a['href'])
self.story.addToList('authorUrl','https://'+self.host+a['href'])
self.story.addToList('author',a.text)
newestChapter = None
@ -168,7 +168,7 @@ class AsianFanFicsComAdapter(BaseSiteAdapter):
self.story.setMetadata('numChapters',len(chapters))
for index, chapter in enumerate(chapters):
if chapter.text != 'Foreword': # skip the foreword
self.chapterUrls.append((stripHTML(chapter.text),'http://' + self.getSiteDomain() + chapter['value'])) # note: AFF cuts off chapter names in list. this gets kind of fixed later on
self.chapterUrls.append((stripHTML(chapter.text),'https://' + self.getSiteDomain() + chapter['value'])) # note: AFF cuts off chapter names in list. this gets kind of fixed later on
# find timestamp
a = soup.find('span', text='Updated')
if a == None:

View file

@ -75,7 +75,7 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
# get storyId from url--url validation guarantees query is only storyid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
self._setURL('http://{0}/stories/story.php?storyid={1}'.format(self.getSiteDomain(), self.story.getMetadata('storyId')))
self._setURL('https://{0}/stories/story.php?storyid={1}'.format(self.getSiteDomain(), self.story.getMetadata('storyId')))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','bdsmlib')
@ -91,10 +91,10 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/stories/story.php?storyid=1234"
return "https://"+cls.getSiteDomain()+"/stories/story.php?storyid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/stories/story.php?storyid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/stories/story.php?storyid=")+r"\d+$"
def use_pagecache(self):
'''
@ -155,7 +155,7 @@ class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
for chapter in soup.findAll('a', href=re.compile(r'/stories/chapter.php\?storyid='+self.story.getMetadata('storyId')+"&chapterid=\d+$")):
value = chapter.findNext('td').findNext('td').string.replace('(added on','').replace(')','').strip()
self.story.setMetadata('dateUpdated', makeDate(value, self.dateformat))
self.chapterUrls.append((stripHTML(chapter),'http://'+self.getSiteDomain()+chapter['href']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.getSiteDomain()+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))

View file

@ -27,7 +27,7 @@ class BloodshedverseComAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'bvc'
SITE_DOMAIN = 'bloodshedverse.com'
BASE_URL = 'http://' + SITE_DOMAIN + '/'
BASE_URL = 'https://' + SITE_DOMAIN + '/'
READ_URL_TEMPLATE = BASE_URL + 'stories.php?go=read&no=%s'
STARTED_DATETIME_FORMAT = '%m/%d/%Y'
@ -65,7 +65,7 @@ class BloodshedverseComAdapter(BaseSiteAdapter):
return cls.READ_URL_TEMPLATE % 1234
def getSiteURLPattern(self):
return re.escape(self.BASE_URL + 'stories.php?go=') + r'(read|chapters)\&(amp;)?no=\d+$'
return r'https?://' + re.escape(self.SITE_DOMAIN + '/stories.php?go=') + r'(read|chapters)\&(amp;)?no=\d+$'
# Override stripURLParameters so the "no" parameter won't get stripped
@classmethod

View file

@ -47,7 +47,7 @@ class CSIForensicsComAdapter(BaseSiteAdapter):
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','csiforensics')
@ -63,10 +63,10 @@ class CSIForensicsComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -117,7 +117,7 @@ class CSIForensicsComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Rating
@ -128,7 +128,7 @@ class CSIForensicsComAdapter(BaseSiteAdapter):
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -193,7 +193,7 @@ class CSIForensicsComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -22,6 +22,13 @@ from base_efiction_adapter import BaseEfictionAdapter
class DarkSolaceOrgAdapter(BaseEfictionAdapter):
@classmethod
def getProtocol(self):
"""
Some, but not all, sites now require https.
"""
return "https"
@staticmethod
def getSiteDomain():
return 'dark-solace.org'

View file

@ -48,7 +48,7 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX
# normalized story URL.
# XXX Most sites don't have the /fiction part. Replace all to remove it usually.
self._setURL('http://' + self.getSiteDomain() + '/fiction/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/fiction/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','dimsn') ## XXX
@ -64,10 +64,10 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/fiction/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/fiction/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/fiction/viewstory.php?sid=")+r"\d+$"
return "https?://"+re.escape(self.getSiteDomain()+"/fiction/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -90,7 +90,7 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX
params['cookiecheck'] = '1'
params['submit'] = 'Submit'
loginUrl = 'http://' + self.getSiteDomain() + '/fiction/user.php?action=login'
loginUrl = 'https://' + self.getSiteDomain() + '/fiction/user.php?action=login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
@ -183,13 +183,13 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX
# Find authorid and URL from... author url.
a = pagetitle.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fiction/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/fiction/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -262,7 +262,7 @@ class DeepInMySoulNetAdapter(BaseSiteAdapter): # XXX
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"fiction/viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -21,6 +21,13 @@ from base_efiction_adapter import BaseEfictionAdapter
class FHSArchiveComAdapter(BaseEfictionAdapter):
@classmethod
def getProtocol(self):
"""
Some, but not all, sites now require https.
"""
return "https"
@staticmethod
def getSiteDomain():
return 'fhsarchive.com'

View file

@ -19,7 +19,7 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'fmt'
SITE_DOMAIN = 'fictionmania.tv'
BASE_URL = 'http://' + SITE_DOMAIN + '/stories/'
BASE_URL = 'https://' + SITE_DOMAIN + '/stories/'
READ_TEXT_STORY_URL_TEMPLATE = BASE_URL + 'readtextstory.html?storyID=%s'
DETAILS_URL_TEMPLATE = BASE_URL + 'details.html?storyID=%s'
@ -62,7 +62,7 @@ class FictionManiaTVAdapter(BaseSiteAdapter):
return cls.READ_TEXT_STORY_URL_TEMPLATE % 1234
def getSiteURLPattern(self):
return 'https?' + re.escape(self.BASE_URL[len('http'):]) + '(readtextstory|readxstory|details)\.html\?storyID=\d+$'
return r'https?' + re.escape(self.BASE_URL[len('https'):]) + '(readtextstory|readxstory|details)\.html\?storyID=\d+$'
def extractChapterUrlsAndMetadata(self):
url = self.DETAILS_URL_TEMPLATE % self.story.getMetadata('storyId')

View file

@ -54,7 +54,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('storyId',m.group('id'))
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/archive/' +m.group('cat') +
self._setURL('https://' + self.getSiteDomain() + '/archive/' +m.group('cat') +
'/' + self.story.getMetadata('storyId') +'.shtml')
else:
raise exceptions.InvalidStoryURL(url,
@ -82,11 +82,11 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
################################################################################################
@classmethod
def getSiteExampleURLs(cls):
return "http://" + cls.getSiteDomain() + "/archive/999/astoryname.shtml"
return "https://" + cls.getSiteDomain() + "/archive/999/astoryname.shtml"
################################################################################################
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain())+r'/archive/(?P<cat>\d+)/(?P<id>\S+)\.shtml'
return r"https?://"+re.escape(self.getSiteDomain())+r'/archive/(?P<cat>\d+)/(?P<id>\S+)\.shtml'
################################################################################################
def get_page(self, page):
@ -159,7 +159,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
# Some stories list multiple authors, but the search engine only uses 1 author, and since
# we can't tell how many 'words' are in each name, I'm going to do a work around.
author_name = mdata.split(' ')[0].strip()
author_url = ('http://'+self.getSiteDomain()+'/cgi-bin/search.cgi?Author={}&SortBy=0'+
author_url = ('https://'+self.getSiteDomain()+'/cgi-bin/search.cgi?Author={}&SortBy=0'+
'&SortOrder=0&NumToList=0&FastSearch=0&ShortResults=0').format(author_name)
story_found = False
while not story_found:
@ -167,7 +167,7 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
adata = self.get_page(author_url)
if 'No stories found for your search choices.' in adata:
author_name = ' '.join(author_name.split()[:-1])
author_url = ('http://'+self.getSiteDomain(
author_url = ('https://'+self.getSiteDomain(
)+'/cgi-bin/search.cgi?Author={}&SortBy=0'+
'&SortOrder=0&NumToList=0&FastSearch=0' +
'&ShortResults=0').format(author_name)
@ -206,14 +206,14 @@ class FireflyPopulliOrgSiteAdapter(BaseSiteAdapter):
if label == 'Series Title:':
## there is no way to tell which number of the series the story is, so we won't
# put a number
series_url = 'http://'+self.getSiteDomain()+'/'+link['href']
series_url = 'https://'+self.getSiteDomain()+'/'+link['href']
self.story.setMetadata('series', link.get_text())
self.story.setMetadata('seriesUrl', series_url)
elif label == 'Prequel to:':
value = link.string + ' (' + 'http://'+self.getSiteDomain()+link['href'] + ')'
value = link.string + ' (' + 'https://'+self.getSiteDomain()+link['href'] + ')'
self.story.setMetadata('prequelto', value)
elif label == 'Sequel to:':
value = link.string + ' (' + 'http://'+self.getSiteDomain()+link['href'] + ')'
value = link.string + ' (' + 'https://'+self.getSiteDomain()+link['href'] + ')'
self.story.setMetadata('sequelto', value)
# Some stories have alot of text in the "summary", and I've tried to keep down on creating

View file

@ -47,7 +47,7 @@ class HLFictionNetAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','hlf')
@ -63,10 +63,10 @@ class HLFictionNetAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -98,7 +98,7 @@ class HLFictionNetAdapter(BaseSiteAdapter):
aut = a.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',aut['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+aut['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+aut['href'])
self.story.setMetadata('author',aut.string)
aut.extract()
@ -109,7 +109,7 @@ class HLFictionNetAdapter(BaseSiteAdapter):
if chapters != None:
for chapter in chapters.findAll('option'):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
else:
self.chapterUrls.append((self.story.getMetadata('title'),url))
@ -191,7 +191,7 @@ class HLFictionNetAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = list.find('a', href=re.compile(r"series.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -47,7 +47,7 @@ class HPFanficArchiveComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/stories/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/stories/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','hpffa')
@ -63,7 +63,7 @@ class HPFanficArchiveComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/stories/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/stories/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return r"https?:"+re.escape("//"+self.getSiteDomain()+"/stories/viewstory.php?sid=")+r"\d+$"
@ -103,13 +103,13 @@ class HPFanficArchiveComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('div', id="mainpage").find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/stories/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/stories/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/stories/'+chapter['href']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/stories/'+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -186,7 +186,7 @@ class HPFanficArchiveComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/stories/'+a['href']
series_url = 'https://'+self.host+'/stories/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -47,7 +47,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ime')
@ -63,10 +63,10 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -89,7 +89,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
params['cookiecheck'] = '1'
params['submit'] = 'Submit'
loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
loginUrl = 'https://' + self.getSiteDomain() + '/user.php?action=login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
@ -171,13 +171,13 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -249,7 +249,7 @@ class ImagineEFicComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -52,13 +52,13 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
# normalized story URL.
# XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ksa') # XXX
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
# https://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%b/%d/%Y" # XXX
@classmethod
@ -72,10 +72,10 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return "http://(www.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://(www.)?"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -160,13 +160,13 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
pagetitle = soup.find('div',id='pagetitle')
for a in pagetitle.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+")):
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','http://'+self.host+'/'+a['href'])
self.story.addToList('authorUrl','https://'+self.host+'/'+a['href'])
self.story.addToList('author',stripHTML(a))
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -286,7 +286,7 @@ class KSArchiveComAdapter(BaseSiteAdapter): # XXX
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = stripHTML(a)
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -67,7 +67,7 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
self.story.setMetadata('storyId', m.group('id'))
# normalized story URL.
self._setURL("http://"+self.getSiteDomain()
self._setURL("https://"+self.getSiteDomain()
+"/novel/"+self.story.getMetadata('storyId'))
else:
raise exceptions.InvalidStoryURL(url,
@ -80,11 +80,11 @@ class LightNovelGateSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://lightnovelgate.com/novel/astoryname"
return "https://lightnovelgate.com/novel/astoryname"
def getSiteURLPattern(self):
# http://lightnovelgate.com/novel/stellar_transformation
return r"http://lightnovelgate\.com/novel/(?P<id>[^/]+)"
return r"https?://lightnovelgate\.com/novel/(?P<id>[^/]+)"
def extractChapterUrlsAndMetadata(self):
# fetch the chapter. From that we will get almost all the

View file

@ -46,7 +46,7 @@ class LOTRgficComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
@staticmethod
@ -55,10 +55,10 @@ class LOTRgficComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
def use_pagecache(self):
'''
@ -107,13 +107,13 @@ class LOTRgficComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = div.find('a', href=re.compile(r"viewuser.php"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in div.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -213,7 +213,7 @@ class LOTRgficComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -157,7 +157,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
self.story.extendList('author', [authorName])
authorId = chapter.getAuthorId()
if authorId:
authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
authorUrl = 'https://%s/index/%s' % (self.getSiteDomain(), authorId)
else:
authorId = u''
authorUrl = u''
@ -280,7 +280,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
@classmethod
def _makeDocumentUrl(cls, documentId):
"""Make a chapter URL given a document ID."""
return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId)
return 'https://%s/publ/%s' % (cls.getSiteDomain(), documentId)
class Chapter(object):
@ -573,7 +573,7 @@ class Chapter(object):
else:
_logger.warning(u"No title found for rating label `%s'!" % label)
# TODO: conduct a research on such abnormal URLs.
elif source == 'http://www.masseffect2.in/_fr/10/1360399.png':
elif '/_fr/10/1360399.png' in source:
label = 'Nn'
return {
'label': 'Nn',

View file

@ -62,7 +62,7 @@ class MCStoriesComSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://mcstories.com/StoryTitle http://mcstories.com/StoryTitle/index.html http://mcstories.com/StoryTitle/StoryTitle1.html"
return "https://mcstories.com/StoryTitle https://mcstories.com/StoryTitle/index.html https://mcstories.com/StoryTitle/StoryTitle1.html"
def getSiteURLPattern(self):
return r"https?://(www\.)?mcstories\.com/([a-zA-Z0-9_-]+)/"

View file

@ -60,7 +60,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
self.getSiteExampleURLs())
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/fanfic/s/'+cattitle+'/'+urltitle+'/'+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/fanfic/s/'+cattitle+'/'+urltitle+'/'+self.story.getMetadata('storyId'))
else:
raise exceptions.InvalidStoryURL(url,
self.getSiteDomain(),
@ -76,20 +76,20 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/fanfic/s/category-name/story-title/123456 http://"+cls.getSiteDomain()+"/fanfic/c/category-name/story-title/123456/987612"
return "https://"+cls.getSiteDomain()+"/fanfic/s/category-name/story-title/123456 https://"+cls.getSiteDomain()+"/fanfic/c/category-name/story-title/123456/987612"
def getSiteURLPattern(self):
## old urls
## http://www.mediaminer.org/fanfic/view_st.php/76882
## https://www.mediaminer.org/fanfic/view_st.php/76882
## new urls
## http://www.mediaminer.org/fanfic/s/ghosts-from-the-past/72
## http://www.mediaminer.org/fanfic/c/ghosts-from-the-past/chapter-2/72/174
## http://www.mediaminer.org/fanfic/s/robtech-final-missions/61553
## http://www.mediaminer.org/fanfic/c/robtech-final-missions/robotech-final-missions-oneshot/61553/189830
## https://www.mediaminer.org/fanfic/s/ghosts-from-the-past/72
## https://www.mediaminer.org/fanfic/c/ghosts-from-the-past/chapter-2/72/174
## https://www.mediaminer.org/fanfic/s/robtech-final-missions/61553
## https://www.mediaminer.org/fanfic/c/robtech-final-missions/robotech-final-missions-oneshot/61553/189830
## even newer urls
## http://www.mediaminer.org/fanfic/s/gundam-wing-fan-fiction/the-preventer-operatives/171000
## http://www.mediaminer.org/fanfic/c/gundam-wing-fan-fiction/the-preventer-operatives/171000/608822
return re.escape("http://"+self.getSiteDomain())+r"/fanfic/"+\
## https://www.mediaminer.org/fanfic/s/gundam-wing-fan-fiction/the-preventer-operatives/171000
## https://www.mediaminer.org/fanfic/c/gundam-wing-fan-fiction/the-preventer-operatives/171000/608822
return r"https?://"+re.escape(self.getSiteDomain())+r"/fanfic/"+\
r"((s/(?P<cattitle4>[^/]+)/(?P<urltitle4>[^/]+)/(?P<id4>\d+))|"+\
r"((c/(?P<cattitle5>[^/]+)/(?P<urltitle5>[^/]+)/(?P<id5>\d+))/\d+)|"+\
r"(s/(?P<urltitle1>[^/]+)/(?P<id1>\d+))|"+\
@ -138,7 +138,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"/user_info.php/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[-1])
self.story.setMetadata('authorUrl','http://'+self.host+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+a['href'])
self.story.setMetadata('author',a.string)
# save date from first for later.
@ -147,7 +147,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Find the chapters - one-shot now have chapter list, too.
chap_p = soup.find('p',{'style':'margin-left:10px;'})
for (atag,aurl,name) in [ (x,x['href'],stripHTML(x)) for x in chap_p.find_all('a') ]:
self.chapterUrls.append((name,'http://'+self.host+aurl))
self.chapterUrls.append((name,'https://'+self.host+aurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))

View file

@ -20,6 +20,13 @@ from base_efiction_adapter import BaseEfictionAdapter
class NCISFictionComAdapter(BaseEfictionAdapter):
@classmethod
def getProtocol(cls):
    """
    Return the URL scheme used to reach this site.

    Some, but not all, sites now require https.
    """
    return "https"
@staticmethod
def getSiteDomain():
    """Return the bare hostname this adapter handles (scheme supplied separately)."""
    domain = 'ncisfiction.com'
    return domain

View file

@ -46,7 +46,7 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/fanfiction/story/' +self.story.getMetadata('storyId')+'/')
self._setURL('https://' + self.getSiteDomain() + '/fanfiction/story/' +self.story.getMetadata('storyId')+'/')
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','phs')
@ -62,10 +62,10 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/fanfiction/story/1234/"
return "https://"+cls.getSiteDomain()+"/fanfiction/story/1234/"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/fanfiction/story/")+r"\d+/?$"
return r"https?://"+re.escape(self.getSiteDomain()+"/fanfiction/story/")+r"\d+/?$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -86,7 +86,7 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
#params['remember'] = '1'
params['login'] = 'Login'
loginUrl = 'http://' + self.getSiteDomain() + '/users/processlogin.php'
loginUrl = 'https://' + self.getSiteDomain() + '/users/processlogin.php'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['txtusername']))
d = self._fetchUrl(loginUrl, params)
@ -136,7 +136,7 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url. /fanfiction/stories.php?psid=125
a = b.find('a', href=re.compile(r"/fanfiction/stories.php\?psid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
@ -152,17 +152,17 @@ class PhoenixSongNetAdapter(BaseSiteAdapter):
i = 0
chapters = chapters.findAll('option')
for chapter in chapters:
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+chapter['value']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+chapter['value']))
if i == 0:
self.story.setMetadata('storyId',chapter['value'].split('/')[3])
head = self.make_soup(self._fetchUrl('http://'+self.host+chapter['value'])).findAll('b')
head = self.make_soup(self._fetchUrl('https://'+self.host+chapter['value'])).findAll('b')
for b in head:
if b.text == "Updated":
date = b.nextSibling.string.split(': ')[1].split(',')
self.story.setMetadata('datePublished', makeDate(date[0]+date[1], self.dateformat))
if i == (len(chapters)-1):
head = self.make_soup(self._fetchUrl('http://'+self.host+chapter['value'])).findAll('b')
head = self.make_soup(self._fetchUrl('https://'+self.host+chapter['value'])).findAll('b')
for b in head:
if b.text == "Updated":
date = b.nextSibling.string.split(': ')[1].split(',')

View file

@ -51,7 +51,7 @@ class PotterFicsComAdapter(BaseSiteAdapter):
self.story.setMetadata('storyId',m.group('id'))
# normalized story URL. gets rid of chapter if there, left with chapter index URL
nurl = "http://"+self.getSiteDomain()+"/historias/"+self.story.getMetadata('storyId')
nurl = "https://"+self.getSiteDomain()+"/historias/"+self.story.getMetadata('storyId')
self._setURL(nurl)
else:
raise exceptions.InvalidStoryURL(url,
@ -69,15 +69,15 @@ class PotterFicsComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://www.potterfics.com/historias/12345 http://www.potterfics.com/historias/12345/capitulo-1 "
return "https://www.potterfics.com/historias/12345 https://www.potterfics.com/historias/12345/capitulo-1 "
def getSiteURLPattern(self):
#http://www.potterfics.com/historias/127583
#http://www.potterfics.com/historias/127583/capitulo-1
#http://www.potterfics.com/historias/127583/capitulo-4
#http://www.potterfics.com/historias/92810 -> Complete story
#http://www.potterfics.com/historias/111194 -> Complete, single chap
p = re.escape("http://"+self.getSiteDomain()+"/historias/")+\
#https://www.potterfics.com/historias/127583
#https://www.potterfics.com/historias/127583/capitulo-1
#https://www.potterfics.com/historias/127583/capitulo-4
#https://www.potterfics.com/historias/92810 -> Complete story
#https://www.potterfics.com/historias/111194 -> Complete, single chap
p = r"https?://"+re.escape(self.getSiteDomain()+"/historias/")+\
r"(?P<id>\d+)(/capitulo-(?P<ch>\d+))?/?$"
return p
@ -101,7 +101,7 @@ class PotterFicsComAdapter(BaseSiteAdapter):
params['login_password'] = self.getConfig("password")
params['login_ck'] = '1'
loginUrl = 'http://www.potterfics.com/secciones/usuarios/login.php'
loginUrl = 'https://www.potterfics.com/secciones/usuarios/login.php'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['login_usuario']))
d = self._postUrl(loginUrl,params)
@ -117,10 +117,10 @@ class PotterFicsComAdapter(BaseSiteAdapter):
def extractChapterUrlsAndMetadata(self):
#this converts '/historias/12345' to 'http://www.potterfics.com/historias/12345'
#this converts '/historias/12345' to 'https://www.potterfics.com/historias/12345'
def makeAbsoluteURL(url):
if url[0] == '/':
url = 'http://'+self.getSiteDomain()+url
url = 'https://'+self.getSiteDomain()+url
return url
#use this to get month numbers from Spanish months

View file

@ -47,7 +47,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/missingpieces/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/missingpieces/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','ptdc')
@ -67,10 +67,10 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/missingpieces/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/missingpieces/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://")+"(www\.)?"+re.escape(self.getSiteDomain()+"/missingpieces/viewstory.php?sid=")+r"\d+$"
return r"https?://(www\.)?"+re.escape(self.getSiteDomain()+"/missingpieces/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -135,13 +135,13 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/missingpieces/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/missingpieces/'+a['href'])
self.story.setMetadata('author',a.string)
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/missingpieces/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/missingpieces/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -213,7 +213,7 @@ class PretenderCenterComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/missingpieces/'+a['href']
series_url = 'https://'+self.host+'/missingpieces/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -47,7 +47,7 @@ class QafFicComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/atp/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/atp/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','atp')
@ -63,10 +63,10 @@ class QafFicComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/atp/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/atp/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/atp/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/atp/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
@ -130,7 +130,7 @@ class QafFicComAdapter(BaseSiteAdapter):
aut = a.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',aut['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/atp/'+aut['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/atp/'+aut['href'])
self.story.setMetadata('author',aut.string)
aut.extract()
@ -141,7 +141,7 @@ class QafFicComAdapter(BaseSiteAdapter):
if chapters != None:
for chapter in chapters.findAll('option'):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/atp/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/atp/viewstory.php?sid='+self.story.getMetadata('storyId')+'&chapter='+chapter['value']))
else:
self.chapterUrls.append((self.story.getMetadata('title'),url))
@ -223,7 +223,7 @@ class QafFicComAdapter(BaseSiteAdapter):
if list.find('a', href=re.compile(r"series.php")) != None:
for series in asoup.findAll('a', href=re.compile(r"series.php\?seriesid=\d+")):
# Find Series name from series URL.
series_url = 'http://'+self.host+'/atp/'+series['href']
series_url = 'https://'+self.host+'/atp/'+series['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))
storyas = seriessoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))

View file

@ -10,7 +10,7 @@ from base_adapter import BaseSiteAdapter
from ..htmlcleanup import stripHTML
SITE_DOMAIN = 'quotev.com'
STORY_URL_TEMPLATE = 'http://www.quotev.com/story/%s'
STORY_URL_TEMPLATE = 'https://www.quotev.com/story/%s'
def getClass():
@ -41,8 +41,8 @@ class QuotevComAdapter(BaseSiteAdapter):
def getSiteURLPattern(self):
pattern = re.escape(STORY_URL_TEMPLATE.rsplit('%', 1)[0]) + r'(.+?)($|&|/)'
pattern = pattern.replace(r'http\:', r'https?\:')
pattern = pattern.replace(r'https?\:\/\/www\.', r'https?\:\/\/(www\.)?')
pattern = pattern.replace(r'https', r'https?')
pattern = pattern.replace(r'www\.', r'(www\.)?')
return pattern
def use_pagecache(self):
@ -75,7 +75,7 @@ class QuotevComAdapter(BaseSiteAdapter):
self.story.addToList('authorUrl', urlparse.urljoin(self.url, a['href']))
if not self.story.getList('author'):
self.story.addToList('author','Anonymous')
self.story.addToList('authorUrl','http://www.quotev.com')
self.story.addToList('authorUrl','https://www.quotev.com')
self.story.addToList('authorId','0')
self.setDescription(self.url, soup.find('div', id='qdesct'))

View file

@ -66,7 +66,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/fiction/'+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/fiction/'+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','rylrdl')
@ -157,7 +157,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
if author_link:
authorId = author_link['href'].rsplit('/', 1)[1]
self.story.setMetadata('authorId', authorId)
self.story.setMetadata('authorUrl','http://'+self.host+'/user/profile/'+authorId)
self.story.setMetadata('authorUrl','https://'+self.host+'/user/profile/'+authorId)
self.story.setMetadata('author',soup.find(attrs=dict(property="books:author"))['content'])
@ -165,7 +165,7 @@ class RoyalRoadAdapter(BaseSiteAdapter):
chapters = soup.find('table',{'id':'chapters'}).find('tbody')
tds = [tr.findAll('td')[0] for tr in chapters.findAll('tr')]
for td in tds:
chapterUrl = 'http://' + self.getSiteDomain() + td.a['href']
chapterUrl = 'https://' + self.getSiteDomain() + td.a['href']
self.chapterUrls.append((stripHTML(td.text), chapterUrl))
self.story.setMetadata('numChapters',len(self.chapterUrls))

View file

@ -71,7 +71,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
# normalized story URL.
# XXX Most sites don't have the /fanfic part. Replace all to remove it usually.
self._setURL('http://' + self.getSiteDomain() + '/fanfics/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/fanfics/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','swf') # XXX
@ -87,10 +87,10 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
@classmethod
def getSiteExampleURLs(self):
return "http://"+self.getSiteDomain()+"/fanfics/viewstory.php?sid=1234"
return "https://"+self.getSiteDomain()+"/fanfics/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/fanfics/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/fanfics/viewstory.php?sid=")+r"\d+$"
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -113,7 +113,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
params['cookiecheck'] = '1'
params['submit'] = 'Submit'
loginUrl = 'http://' + self.getSiteDomain() + '/fanfics/user.php?action=login'
loginUrl = 'https://' + self.getSiteDomain() + '/fanfics/user.php?action=login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
@ -182,7 +182,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
alist = soup.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','http://'+self.host+'/fanfics/'+a['href'])
self.story.addToList('authorUrl','https://'+self.host+'/fanfics/'+a['href'])
self.story.addToList('author',a.string)
@ -195,7 +195,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/fanfics/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/fanfics/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -279,7 +279,7 @@ class SheppardWeirComAdapter(BaseSiteAdapter): # XXX
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/fanfics/'+a['href']
series_url = 'https://'+self.host+'/fanfics/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -28,7 +28,7 @@ class SpikeluverComAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'slc'
SITE_DOMAIN = 'spikeluver.com'
BASE_URL = 'http://' + SITE_DOMAIN + '/SpuffyRealm/'
BASE_URL = 'https://' + SITE_DOMAIN + '/SpuffyRealm/'
LOGIN_URL = BASE_URL + 'user.php?action=login'
VIEW_STORY_URL_TEMPLATE = BASE_URL + 'viewstory.php?sid=%d'
METADATA_URL_SUFFIX = '&index=1'
@ -69,7 +69,7 @@ class SpikeluverComAdapter(BaseSiteAdapter):
return cls.VIEW_STORY_URL_TEMPLATE % 1234
def getSiteURLPattern(self):
return re.escape(self.VIEW_STORY_URL_TEMPLATE[:-2]) + r'\d+$'
return re.escape(self.VIEW_STORY_URL_TEMPLATE[:-2]).replace('https','https?') + r'\d+$'
def extractChapterUrlsAndMetadata(self):
soup = self._customized_fetch_url(self.url + self.METADATA_URL_SUFFIX)

View file

@ -24,6 +24,13 @@ class TheDelphicExpanseComAdapter(BaseEfictionAdapter):
''' This adapter will download stories from the
'Taste of Poison, the Fanfiction of Arsenic Jade' site '''
@classmethod
def getProtocol(cls):
    """
    Return the URL scheme used to reach this site.

    Some, but not all, sites now require https.
    """
    return "https"
@staticmethod
def getSiteDomain():
    """Return the bare hostname this adapter handles (scheme supplied separately)."""
    domain = 'www.thedelphicexpanse.com'
    return domain

View file

@ -20,6 +20,13 @@ from base_efiction_adapter import BaseEfictionAdapter
class TheHookupZoneNetAdapter(BaseEfictionAdapter):
@classmethod
def getProtocol(cls):
    """
    Return the URL scheme used to reach this site.

    Some, but not all, sites now require https.
    """
    return "https"
@staticmethod
def getSiteDomain():
    """Return the bare hostname this adapter handles (scheme supplied separately)."""
    domain = 'thehookupzone.net'
    return domain

View file

@ -22,6 +22,13 @@ from base_efiction_adapter import BaseEfictionAdapter
class ThundercatsFansOrgSiteAdapter(BaseEfictionAdapter):
@classmethod
def getProtocol(cls):
    """
    Return the URL scheme used to reach this site.

    Some, but not all, sites now require https.
    """
    return "https"
@staticmethod
def getSiteDomain():
    """Return the bare hostname this adapter handles (scheme supplied separately)."""
    domain = 'www.thundercatsfans.org'
    return domain

View file

@ -45,7 +45,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
# normalized story URL.
self._setURL("http://"+self.getSiteDomain()\
self._setURL("https://"+self.getSiteDomain()\
+"/efiction/viewstory.php?sid="+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
@ -66,10 +66,10 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/efiction/viewstory.php?sid=1234 http://efiction."+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/efiction/viewstory.php?sid=1234 https://efiction."+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return r'(http://trekiverse\.org/efiction/viewstory\.php\?sid=\d+|http://efiction\.trekiverse\.org/viewstory\.php\?sid=\d+)'
return r'(https?://trekiverse\.org/efiction/viewstory\.php\?sid=\d+|https?://efiction\.trekiverse\.org/viewstory\.php\?sid=\d+)'
## Login seems to be reasonably standard across eFiction sites.
def needToLoginCheck(self, data):
@ -92,7 +92,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
params['cookiecheck'] = '1'
params['submit'] = 'Submit'
loginUrl = 'http://' + self.getSiteDomain() + '/efiction/user.php?action=login'
loginUrl = 'https://' + self.getSiteDomain() + '/efiction/user.php?action=login'
logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
params['penname']))
@ -171,7 +171,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
a = soup.find('div', {'id' : 'pagetitle'})
aut = a.find('a', href=re.compile(r"^viewuser\.php\?uid="))
self.story.setMetadata('authorId',aut['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/efiction/'+aut['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/efiction/'+aut['href'])
self.story.setMetadata('author',aut.string)
ttl = a.find('a', href=re.compile(r'^viewstory.php\?sid=%s$'%self.story.getMetadata('storyId')))
@ -185,11 +185,11 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
if len(chapters)==0:
raise exceptions.FailedToDownload(self.getSiteDomain() +" says: No php/html chapters found.")
if len(chapters)==1:
self.chapterUrls.append((self.story.getMetadata('title'),'http://'+self.host+'/efiction/'+chapters[0]['href']))
self.chapterUrls.append((self.story.getMetadata('title'),'https://'+self.host+'/efiction/'+chapters[0]['href']))
else:
for chapter in chapters:
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/efiction/'+chapter['href']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/efiction/'+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -282,7 +282,7 @@ class TrekiverseOrgAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/efiction/'+a['href']
series_url = 'https://'+self.host+'/efiction/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -27,7 +27,7 @@ class Voracity2EficComAdapter(BaseSiteAdapter):
SITE_ABBREVIATION = 'voe'
SITE_DOMAIN = 'voracity2.e-fic.com'
BASE_URL = 'http://' + SITE_DOMAIN + '/'
BASE_URL = 'https://' + SITE_DOMAIN + '/'
LOGIN_URL = BASE_URL + 'user.php?action=login'
VIEW_STORY_URL_TEMPLATE = BASE_URL + 'viewstory.php?sid=%d'
METADATA_URL_SUFFIX = '&index=1'
@ -98,7 +98,7 @@ class Voracity2EficComAdapter(BaseSiteAdapter):
return cls.VIEW_STORY_URL_TEMPLATE % 1234
def getSiteURLPattern(self):
return re.escape(self.VIEW_STORY_URL_TEMPLATE[:-2]) + r'\d+$'
return re.escape(self.VIEW_STORY_URL_TEMPLATE[:-2]).replace('https','https?') + r'\d+$'
def extractChapterUrlsAndMetadata(self):
soup = self._customized_fetch_url(self.url + self.METADATA_URL_SUFFIX)

View file

@ -46,7 +46,7 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter):
# normalized story URL.
self._setURL('http://' + self.getSiteDomain() + '/wrfa/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/wrfa/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','wrfa')
@ -62,10 +62,10 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/wrfa/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/wrfa/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/wrfa/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/wrfa/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
@ -101,7 +101,7 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter):
# Find authorid and URL from... author url.
a = pt.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
self.story.setMetadata('authorId',a['href'].split('=')[1])
self.story.setMetadata('authorUrl','http://'+self.host+'/wrfa/'+a['href'])
self.story.setMetadata('authorUrl','https://'+self.host+'/wrfa/'+a['href'])
self.story.setMetadata('author',a.string)
rating=pt.text.split('(')[1].split(')')[0]
@ -110,7 +110,7 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter):
# Find the chapters:
for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/wrfa/'+chapter['href']))
self.chapterUrls.append((stripHTML(chapter),'https://'+self.host+'/wrfa/'+chapter['href']))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -178,7 +178,7 @@ class WolverineAndRogueComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/wrfa/'+a['href']
series_url = 'https://'+self.host+'/wrfa/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -47,7 +47,7 @@ class WraithBaitComAdapter(BaseSiteAdapter):
self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
self._setURL('https://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','wb')
@ -63,10 +63,10 @@ class WraithBaitComAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
return "https://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
return r"https?://"+re.escape(self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"
## Getting the chapter list and the meta data, plus 'is adult' checking.
def extractChapterUrlsAndMetadata(self):
@ -114,7 +114,7 @@ class WraithBaitComAdapter(BaseSiteAdapter):
alist = pt.findAll('a', href=re.compile(r"viewuser.php\?uid=\d+"))
for a in alist:
self.story.addToList('authorId',a['href'].split('=')[1])
self.story.addToList('authorUrl','http://'+self.host+'/'+a['href'])
self.story.addToList('authorUrl','https://'+self.host+'/'+a['href'])
self.story.addToList('author',a.string)
rating=pt.text.split('[')[1].split(']')[0]
@ -133,7 +133,7 @@ class WraithBaitComAdapter(BaseSiteAdapter):
else:
add = ""
# just in case there's tags, like <i> in chapter titles.
self.chapterUrls.append((stripHTML(chapter)+add,'http://'+self.host+'/'+chapter['href']+addurl))
self.chapterUrls.append((stripHTML(chapter)+add,'https://'+self.host+'/'+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
@ -193,7 +193,7 @@ class WraithBaitComAdapter(BaseSiteAdapter):
# Find Series name from series URL.
a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
series_name = a.string
series_url = 'http://'+self.host+'/'+a['href']
series_url = 'https://'+self.host+'/'+a['href']
# use BeautifulSoup HTML parser to make everything easier to find.
seriessoup = self.make_soup(self._fetchUrl(series_url))

View file

@ -46,7 +46,7 @@ class WuxiaWorldComSiteAdapter(BaseSiteAdapter):
story_id = match.group('id')
self.story.setMetadata('storyId', story_id)
self._setURL('http://%s/novel/%s' % (self.getSiteDomain(), story_id))
self._setURL('https://%s/novel/%s' % (self.getSiteDomain(), story_id))
@staticmethod
def getSiteDomain():
@ -54,10 +54,10 @@ class WuxiaWorldComSiteAdapter(BaseSiteAdapter):
@classmethod
def getSiteExampleURLs(cls):
return 'http://%s/novel/story-name' % cls.getSiteDomain()
return 'https://%s/novel/story-name' % cls.getSiteDomain()
def getSiteURLPattern(self):
return r'http(s)?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())
return r'https?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())
def use_pagecache(self):
return True