fictionalley-archive.org: Convert adapter_fictionalleyorg to adapter_fictionalleyarchiveorg.

This commit is contained in:
Jim Miller 2021-10-27 13:33:42 -05:00
parent 913f8dc256
commit 9da07fd160
5 changed files with 289 additions and 257 deletions

View file

@ -74,7 +74,7 @@ rating_label:Rating
warnings_label:Warnings
numChapters_label:Chapters
numWords_label:Words
## www.fanfiction.net, fictionalley.com, etc.
## www.fanfiction.net, fictionalley-archive.org, etc.
site_label:Publisher
## ffnet, fpcom, etc.
siteabbrev_label:Site Abbrev
@ -2980,33 +2980,46 @@ use_basic_cache:true
## for examples of how to use them.
extra_valid_entries:native_status
[www.fictionalley.org]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
#is_adult:true
[www.fictionalley-archive.org]
## also accepts fictionalley.org story URLs
use_basic_cache:true
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
## fictionally.org storyIds are not unique. Combine with authorId.
## fictionalley-archive.org storyIds are not unique. Combine with authorId.
output_filename: ${title}-${siteabbrev}_${authorId}_${storyId}${formatext}
## fictionalley.org doesn't have a status metadatum. If uncommented,
## fictionalley-archive.org doesn't have a status metadatum. If uncommented,
## this will be used for status.
#default_value_status:Unknown
website_encodings:Windows-1252,utf8
slow_down_sleep_time:10
## Extra metadata that this adapter knows about. See [archiveofourown.org]
## for examples of how to use them.
extra_valid_entries:house,era,spoilers,hits
## sites are sensitive to too many hits. Users are sensitive to long
## waits during the initial metadata collection in the foreground.
## When used, these settings will speed up metadata downloads in the
## foreground linearly.
tweak_fg_sleep:true
min_fg_sleep:2.0
max_fg_sleep:5.0
max_fg_sleep_at_downloads:4
## fictionalley-archive chapters have 'date', 'words', 'hits' and
## 'summary' available for each chapter. These can be used with
## custom output (see
## https://github.com/JimmXinu/FanFicFare/wiki/CustomOutput ) or with
## chapter_title_*_pattern settings.
## Examples for html & epub:
#[html]
#tocpage_entry:
# <a href="#section${index04}">${chapter}</a> ${date} ${words}<br />
#[epub]
#tocpage_entry:
# <a href="file${index04}.xhtml">${chapter}</a> ${date} ${words}<br /><br />
## The 'date' value for chapters mentioned above can be formatted with
## datechapter_format. Otherwise it will default to
## datePublished_format
#datechapter_format:%%Y-%%m-%%d
## fictionalley-archive.org chapters can have author notes attached to
## them. Setting include_author_notes:true will include them with the
## chapter text. Includes both leading and trailing notes.
#include_author_notes:false
[www.fictionpress.com]
## Using cloudscraper can satisfy the first couple levels of

View file

@ -33,7 +33,7 @@ from .. import configurable as configurable
from . import base_efiction_adapter
from . import adapter_test1
from . import adapter_fanfictionnet
from . import adapter_fictionalleyorg
from . import adapter_fictionalleyarchiveorg
from . import adapter_fictionpresscom
from . import adapter_ficwadcom
from . import adapter_fimfictionnet

View file

@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from .base_adapter import BaseSiteAdapter, makeDate
class FictionAlleyArchiveOrgSiteAdapter(BaseSiteAdapter):
    """
    Adapter for www.fictionalley-archive.org, the archive mirror of the
    original fictionalley.org.  Also accepts fictionalley.org story URLs
    and normalizes them to https on the -archive domain.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            # normalized story URL -- always https on the -archive domain.
            url = "https://"+self.getSiteDomain()+"/authors/"+m.group('auth')+"/"+m.group('id')+".html"
            self._setURL(url)
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    def _setURL(self,url):
        # Keep authorId/storyId in sync every time the URL changes;
        # storyIds are not unique on this site, so both are needed.
        super(FictionAlleyArchiveOrgSiteAdapter, self)._setURL(url)
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('authorId',m.group('auth'))
            self.story.setMetadata('storyId',m.group('id'))

    @staticmethod
    def getSiteDomain():
        return 'www.fictionalley-archive.org'

    @classmethod
    def getAcceptDomains(cls):
        # also accepts old fictionalley.org story URLs.
        return ['www.fictionalley-archive.org',
                'www.fictionalley.org']

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://"+cls.getSiteDomain()+"/authors/drt/DA.html https://"+cls.getSiteDomain()+"/authors/drt/JOTP01a.html"

    @classmethod
    def getURLDomain(cls):
        return 'https://' + cls.getSiteDomain()

    def getSiteURLPattern(self):
        # http://www.fictionalley-archive.org/authors/drt/DA.html
        # http://www.fictionalley-archive.org/authors/drt/JOTP01a.html
        # Dots are escaped so '.' only matches a literal dot (the previous
        # pattern's bare dots matched any character).
        return r"https?://www\.fictionalley(?:-archive)?\.org/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"

    def extractChapterUrlsAndMetadata(self):
        ## could be either chapter list page or one-shot text page.
        logger.debug("URL: "+self.url)

        (data,rurl) = self.get_request_redirected(self.url)
        if rurl != self.url:
            self._setURL(rurl)
            logger.debug("set to redirected url:%s"%self.url)
        soup = self.make_soup(data)

        # If chapter list page, get the first chapter to look for adult check
        chapterlinklist = soup.select('h5.mb-1 > a')
        # logger.debug(chapterlinklist)
        if not chapterlinklist:
            # no chapter list, it's either a chapter URL or a single chapter story
            # <nav aria-label="Chapter Navigation">
            # <a class="page-link" href="/authors/mz_xxo/HPATOTFI.html">Index</a>
            storya = soup.select_one('nav[aria-label="Chapter Navigation"] a')
            # logger.debug(storya)
            if storya:
                ## multi chapter story -- normalize to the index page and
                ## re-fetch to get the full chapter list.
                self._setURL(self.getURLDomain()+storya['href'])
                logger.debug("Normalizing to URL: "+self.url)
                data = self.get_request(self.url)
                soup = self.make_soup(data)
                chapterlinklist = soup.select('h5.mb-1 > a')
            else:
                ## single chapter story.
                pass

        self.story.setMetadata('title',stripHTML(soup.select_one('h1')))

        ## authorid already set in _setURL.
        authora=soup.select_one('h1 + h3 > a')
        self.story.setMetadata('author',stripHTML(authora))
        self.story.setMetadata('authorUrl',self.getURLDomain()+authora['href'])

        if chapterlinklist:
            # Find the chapters:
            for chapter in chapterlinklist:
                # climb from the link up to the containing list item so the
                # per-chapter date/words/hits/summary can be pulled from it.
                listitem = chapter.parent.parent.parent
                # date
                date = stripHTML(listitem.select_one('small.text-nowrap'))
                chapterDate = makeDate(date,self.dateformat)
                wordshits = listitem.select('span.font-weight-normal')
                chap_data = {
                    'date':chapterDate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format","%Y-%m-%d"))),
                    'words':stripHTML(wordshits[0]),
                    'hits':stripHTML(wordshits[1]),
                    'summary':stripHTML(listitem.select_one('p.my-2')),
                    }
                self.add_chapter(chapter,self.getURLDomain()+chapter['href'], chap_data)
        else:
            self.add_chapter(self.story.getMetadata('title'),self.url)

        cardbody = soup.select_one('div.card-body')

        # Map the site's search-link categories onto FFF metadata entries.
        searchs_to_meta = (
            # sitetype, ffftype, islist
            ('Rating', 'rating', False),
            ('House', 'house', True),
            ('Character', 'characters', True),
            ('Genre', 'genre', True),
            ('Era', 'era', True),
            ('Spoiler', 'spoilers', True),
            ('Ship', 'ships', True),
            )
        for (sitetype,ffftype, islist) in searchs_to_meta:
            tags = cardbody.select('a[href^="/stories?Include.%s"]'%sitetype)
            if tags:
                if islist:
                    self.story.extendList(ffftype, [ stripHTML(a) for a in tags ])
                else:
                    self.story.setMetadata(ffftype, stripHTML(tags[0]))

        # Published: 09/26/2003 Updated: 04/13/2004 Words: 14,268 Chapters: 5 Hits: 743
        badgeinfos = cardbody.select('div.badge-info')
        for badge in badgeinfos:
            txt = stripHTML(badge)
            # split only on the first ':' in case a value ever contains one.
            (key,val)=txt.split(':',1)
            if key in ( 'Published', 'Updated'):
                date = makeDate(val,self.dateformat)
                self.story.setMetadata('date'+key,date)
            elif key == 'Hits':
                # was `key in ('Hits')` -- a substring test against the
                # *string* 'Hits', not tuple membership; equality intended.
                self.story.setMetadata(key.lower(),val)
            elif key == 'Words':
                self.story.setMetadata('numWords',val)

        summary = soup.find('dt',text='Story Summary:')
        if summary:
            summary = summary.find_next_sibling('dd')
            summary.name='div' ## change tag name else Calibre treats it oddly.
            self.setDescription(self.url,summary)
        return

    def getChapterText(self, url):
        logger.debug('Getting chapter text from: %s' % url)
        data = self.get_request(url)
        soup = self.make_soup(data)

        # this may be a brittle way to get the chapter text.
        # Site doesn't give a lot of hints.
        chaptext = soup.select_one('main#content div:not([class])')

        # not sure how, but we can get html, etc tags still in some
        # stories. That breaks later updates because it confuses
        # epubutils.py
        # Yes, this still applies to fictionalley-archive.
        for tag in chaptext.findAll('head') + chaptext.findAll('meta') + chaptext.findAll('script'):
            tag.extract()
        for tag in chaptext.findAll('body') + chaptext.findAll('html'):
            tag.name = 'div'

        if self.getConfig('include_author_notes'):
            # leading note: dt "Author's Note:" lives in the preceding row div.
            row = chaptext.find_previous_sibling('div',class_='row')
            logger.debug(row)
            andt = row.find('dt',text="Author's Note:")
            logger.debug(andt)
            if andt:
                chaptext.insert(0,andt.parent.extract())
            # post notes aren't as structured(?)
            for div in chaptext.find_next_siblings('div',class_='row'):
                chaptext.append(div.extract())
        return self.utf8FromSoup(url,chaptext)
def getClass():
    """Module-level hook: FanFicFare's adapter loader calls this to obtain
    the adapter class for this site module."""
    adapter_class = FictionAlleyArchiveOrgSiteAdapter
    return adapter_class

View file

@ -1,228 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
# py2 vs py3 transition
from .base_adapter import BaseSiteAdapter, makeDate
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
    # NOTE(review): adapter for the original www.fictionalley.org site,
    # removed by this commit in favor of FictionAlleyArchiveOrgSiteAdapter.

    def __init__(self, config, url):
        # Validate the story URL and record author/story ids; storyIds are
        # not unique on this site, so authorId is kept alongside.
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('authorId',m.group('auth'))
            self.story.setMetadata('storyId',m.group('id'))

            # normalized story URL.
            self._setURL(url)
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

    @staticmethod
    def getSiteDomain():
        return 'www.fictionalley.org'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://"+cls.getSiteDomain()+"/authors/drt/DA.html http://"+cls.getSiteDomain()+"/authors/drt/JOTP01a.html"

    def getSiteURLPattern(self):
        # http://www.fictionalley.org/authors/drt/DA.html
        # http://www.fictionalley.org/authors/drt/JOTP01a.html
        return re.escape("http://"+self.getSiteDomain())+r"/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"

    def _postFetchWithIAmOld(self,url):
        # Fetch url; when the user has declared themselves adult, POST the
        # site's age-confirmation form params to obtain the content.
        if self.is_adult or self.getConfig("is_adult"):
            params={'iamold':'Yes',
                    'action':'ageanswer'}
            logger.info("Attempting to get cookie for %s" % url)
            ## posting on list doesn't work, but doesn't hurt, either.
            data = self.post_request(url,params)
        else:
            data = self.get_request(url)
        return data

    def extractChapterUrlsAndMetadata(self):
        ## could be either chapter list page or one-shot text page.
        url = self.url
        logger.debug("URL: "+url)

        data = self._postFetchWithIAmOld(url)
        soup = self.make_soup(data)

        chapterdata = data
        # If chapter list page, get the first chapter to look for adult check
        chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        if chapterlinklist:
            chapterdata = self._postFetchWithIAmOld(chapterlinklist[0]['href'])

        if "Are you over seventeen years old" in chapterdata:
            raise exceptions.AdultCheckRequired(self.url)

        if not chapterlinklist:
            # no chapter list, chapter URL: change to list link.
            # second a tag inside div breadcrumbs
            storya = soup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self._setURL(storya['href'])
            url=self.url
            logger.debug("Normalizing to URL: "+url)

            ## title's right there...
            self.story.setMetadata('title',stripHTML(storya))

            data = self.get_request(url)
            soup = self.make_soup(data)
            chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        else:
            ## still need title from somewhere. If chapterlinklist,
            ## then chapterdata contains a chapter, find title the
            ## same way.
            chapsoup = self.make_soup(chapterdata)
            storya = chapsoup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self.story.setMetadata('title',stripHTML(storya))
            del chapsoup
            del chapterdata

        ## authorid already set.
        ## <h1 class="title" align="center">Just Off The Platform II by <a href="http://www.fictionalley.org/authors/drt/">DrT</a></h1>
        authora=soup.find('h1',{'class':'title'}).find('a')
        self.story.setMetadata('author',authora.string)
        self.story.setMetadata('authorUrl',authora['href'])

        if len(chapterlinklist) == 1:
            self.add_chapter(self.story.getMetadata('title'),chapterlinklist[0]['href'])
        else:
            # Find the chapters:
            for chapter in chapterlinklist:
                # just in case there's tags, like <i> in chapter titles.
                self.add_chapter(chapter,chapter['href'])

        ## Go scrape the rest of the metadata from the author's page.
        # <dl><dt><a class = "Rid story" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/TMH.html">
        # [Rid] The Magical Hottiez</a> by <a class = "pen_name" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/">Aafro Man Ziegod</a> </small></dt>
        # <dd><small class = "storyinfo"><a href = "http://www.fictionalley.org/ratings.html" target = "_new">Rating:</a> PG-13 - Spoilers: PS/SS, CoS, PoA, GoF, QTTA, FB - 4264 hits - 5060 words<br />
        # Genre: Humor, Romance - Main character(s): None - Ships: None - Era: Multiple Eras<br /></small>
        # <small class = "storyinfo">Published: June 3, 2002 (between Goblet of Fire and Order of Phoenix) - Updated: June 3, 2002</small>
        # </dd></dl>
        data = self.get_request(self.story.getMetadata('authorUrl'))
        soup = self.make_soup(data)

        storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
        storydd = storya.findNext('dd')

        # Rating: PG - Spoilers: None - 2525 hits - 736 words
        # Genre: Humor - Main character(s): H, R - Ships: None - Era: Multiple Eras
        # Published: September 25, 2004 (between Order of Phoenix and Half-Blood Prince) - Updated: September 25, 2004
        ## change to text and regexp find.
        metastr = stripHTML(storydd).replace('\n',' ').replace('\t',' ')

        m = re.match(r".*?Rating: (.+?) -.*?",metastr)
        if m:
            self.story.setMetadata('rating', m.group(1))

        m = re.match(r".*?Genre: (.+?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        m = re.match(r".*?Published: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%B %d, %Y"))

        m = re.match(r".*?Updated: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%B %d, %Y"))

        # NOTE(review): depends on exactly one space between the word count
        # and "words Genre" after whitespace cleanup -- confirm against
        # stripHTML's collapsing behavior.
        m = re.match(r".*? (\d+) words Genre.*?",metastr)
        if m:
            self.story.setMetadata('numWords', m.group(1))

        for small in storydd.findAll('small'):
            small.extract() ## removes the <small> tags, leaving only the summary.
        storydd.name = 'div' ## change tag name else Calibre treats it oddly.
        self.setDescription(url,storydd)
        #self.story.setMetadata('description',stripHTML(storydd))

        return

    def getChapterText(self, url):
        logger.debug('Getting chapter text from: %s' % url)
        data = self.get_request(url)

        # find <!-- headerend --> & <!-- footerstart --> and
        # replaced with matching div pair for easier parsing.
        # Yes, it's an evil kludge, but what can ya do? Using
        # something other than div prevents soup from pairing
        # our div with poor html inside the story text.
        crazy = "crazytagstringnobodywouldstumbleonaccidently"
        data = data.replace('<!-- headerend -->','<'+crazy+' id="storytext">').replace('<!-- footerstart -->','</'+crazy+'>')

        # problems with some stories confusing Soup. This is a nasty
        # hack, but it works.
        data = data[data.index('<'+crazy+''):]
        # ditto with extra crap at the end.
        data = data[:data.index('</'+crazy+'>')+len('</'+crazy+'>')]

        soup = self.make_soup(data)

        body = soup.findAll('body') ## some stories use a nested body and body
                                    ## tag, in which case we don't
                                    ## need crazytagstringnobodywouldstumbleonaccidently
                                    ## and use the second one instead.
        if len(body)>1:
            text = body[1]
            text.name='div' # force to be a div to avoid multiple body tags.
        else:
            text = soup.find(crazy, {'id' : 'storytext'})
            text.name='div' # change to div tag.

        if not data or not text:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        # not sure how, but we can get html, etc tags still in some
        # stories. That breaks later updates because it confuses
        # epubutils.py
        for tag in text.findAll('head'):
            tag.extract()
        for tag in text.findAll('body') + text.findAll('html'):
            tag.name = 'div'

        return self.utf8FromSoup(url,text)
def getClass():
    """Module-level hook: FanFicFare's adapter loader calls this to obtain
    the adapter class for this site module."""
    adapter_class = FictionAlleyOrgSiteAdapter
    return adapter_class

View file

@ -74,7 +74,7 @@ rating_label:Rating
warnings_label:Warnings
numChapters_label:Chapters
numWords_label:Words
## www.fanfiction.net, fictionalley.com, etc.
## www.fanfiction.net, fictionalley-archive.org, etc.
site_label:Publisher
## ffnet, fpcom, etc.
siteabbrev_label:Site Abbrev
@ -2993,24 +2993,46 @@ use_basic_cache:true
## for examples of how to use them.
extra_valid_entries:native_status
[www.fictionalley.org]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,
## this should go in your personal.ini, not defaults.ini.
#is_adult:true
[www.fictionalley-archive.org]
## also accepts fictionalley.org story URLs
use_basic_cache:true
## Site dedicated to these categories/characters/ships
extracategories:Harry Potter
## fictionally.org storyIds are not unique. Combine with authorId.
## fictionalley-archive.org storyIds are not unique. Combine with authorId.
output_filename: ${title}-${siteabbrev}_${authorId}_${storyId}${formatext}
## fictionalley.org doesn't have a status metadatum. If uncommented,
## fictionalley-archive.org doesn't have a status metadatum. If uncommented,
## this will be used for status.
#default_value_status:Unknown
website_encodings:Windows-1252,utf8
slow_down_sleep_time:10
## Extra metadata that this adapter knows about. See [archiveofourown.org]
## for examples of how to use them.
extra_valid_entries:house,era,spoilers,hits
## fictionalley-archive chapters have 'date', 'words', 'hits' and
## 'summary' available for each chapter. These can be used with
## custom output (see
## https://github.com/JimmXinu/FanFicFare/wiki/CustomOutput ) or with
## chapter_title_*_pattern settings.
## Examples for html & epub:
#[html]
#tocpage_entry:
# <a href="#section${index04}">${chapter}</a> ${date} ${words}<br />
#[epub]
#tocpage_entry:
# <a href="file${index04}.xhtml">${chapter}</a> ${date} ${words}<br /><br />
## The 'date' value for chapters mentioned above can be formatted with
## datechapter_format. Otherwise it will default to
## datePublished_format
#datechapter_format:%%Y-%%m-%%d
## fictionalley-archive.org chapters can have author notes attached to
## them. Setting include_author_notes:true will include them with the
## chapter text. Includes both leading and trailing notes.
#include_author_notes:false
[www.fictionpress.com]
## Using cloudscraper can satisfy the first couple levels of