Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2026-05-08 12:36:11 +02:00
fictionalley-archive.org: Convert adapter_fictionalleyorg to adapter_fictionalleyarchiveorg.
parent 913f8dc256
commit 9da07fd160
5 changed files with 289 additions and 257 deletions
@@ -74,7 +74,7 @@ rating_label:Rating
 warnings_label:Warnings
 numChapters_label:Chapters
 numWords_label:Words
-## www.fanfiction.net, fictionalley.com, etc.
+## www.fanfiction.net, fictionalley-archive.com, etc.
 site_label:Publisher
 ## ffnet, fpcom, etc.
 siteabbrev_label:Site Abbrev
@@ -2980,33 +2980,46 @@ use_basic_cache:true
 ## for examples of how to use them.
 extra_valid_entries:native_status

-[www.fictionalley.org]
-## Some sites do not require a login, but do require the user to
-## confirm they are adult for adult content. In commandline version,
-## this should go in your personal.ini, not defaults.ini.
-#is_adult:true
+[www.fictionalley-archive.org]
+## also accepts fictionalley.org story URLs
 use_basic_cache:true

 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter

-## fictionally.org storyIds are not unique. Combine with authorId.
+## fictionalley-archive.org storyIds are not unique. Combine with authorId.
 output_filename: ${title}-${siteabbrev}_${authorId}_${storyId}${formatext}

-## fictionalley.org doesn't have a status metadatum. If uncommented,
+## fictionalley-archive.org doesn't have a status metadatum. If uncommented,
 ## this will be used for status.
 #default_value_status:Unknown

 website_encodings:Windows-1252,utf8
 slow_down_sleep_time:10
+## Extra metadata that this adapter knows about. See [archiveofourown.org]
+## for examples of how to use them.
+extra_valid_entries:house,era,spoilers,hits

 ## sites are sensitive to too many hits. Users are sensitive to long
 ## waits during the initial metadata collection in the foreground.
 ## When used, these settings will speed up metadata downloads in the
 ## foreground linearly.
 tweak_fg_sleep:true
 min_fg_sleep:2.0
 max_fg_sleep:5.0
 max_fg_sleep_at_downloads:4
+## fictionalley-archive chapters have 'date', 'words', 'hits' and
+## 'summary' available for each chapter. These can be used with
+## custom output (see
+## https://github.com/JimmXinu/FanFicFare/wiki/CustomOutput ) or with
+## chapter_title_*_pattern settings.
+## Examples for html & epub:
+#[html]
+#tocpage_entry:
+# <a href="#section${index04}">${chapter}</a> ${date} ${words}<br />
+#[epub]
+#tocpage_entry:
+# <a href="file${index04}.xhtml">${chapter}</a> ${date} ${words}<br /><br />
+
+## The 'date' value for chapters mentioned above can be formated with
+## datechapter_format. Otherwise it will default to
+## datePublished_format
+#datechapter_format:%%Y-%%m-%%d
+
+## fictionalley-archive.org chapters can have author notes attached to
+## them. Setting include_author_notes:true will include them with the
+## chapter text. Includes both leading and trailing notes.
+#include_author_notes:false

 [www.fictionpress.com]
 ## Using cloudscraper can satisfy the first couple levels of
@@ -33,7 +33,7 @@ from .. import configurable as configurable
 from . import base_efiction_adapter
 from . import adapter_test1
 from . import adapter_fanfictionnet
-from . import adapter_fictionalleyorg
+from . import adapter_fictionalleyarchiveorg
 from . import adapter_fictionpresscom
 from . import adapter_ficwadcom
 from . import adapter_fimfictionnet
fanficfare/adapters/adapter_fictionalleyarchiveorg.py (new file, 225 lines)
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-

# Copyright 2011 Fanficdownloader team, 2021 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

from .base_adapter import BaseSiteAdapter, makeDate


class FictionAlleyArchiveOrgSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            # normalized story URL.
            url = "https://"+self.getSiteDomain()+"/authors/"+m.group('auth')+"/"+m.group('id')+".html"
            self._setURL(url)
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())
        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    def _setURL(self,url):
        # logger.debug("set URL:%s"%url)
        super(FictionAlleyArchiveOrgSiteAdapter, self)._setURL(url)
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('authorId',m.group('auth'))
            self.story.setMetadata('storyId',m.group('id'))

    @staticmethod
    def getSiteDomain():
        return 'www.fictionalley-archive.org'

    @classmethod
    def getAcceptDomains(cls):
        return ['www.fictionalley-archive.org',
                'www.fictionalley.org']

    @classmethod
    def getSiteExampleURLs(cls):
        return "https://"+cls.getSiteDomain()+"/authors/drt/DA.html https://"+cls.getSiteDomain()+"/authors/drt/JOTP01a.html"

    @classmethod
    def getURLDomain(cls):
        return 'https://' + cls.getSiteDomain()

    def getSiteURLPattern(self):
        # http://www.fictionalley-archive.org/authors/drt/DA.html
        # http://www.fictionalley-archive.org/authors/drt/JOTP01a.html
        return r"https?://www.fictionalley(-archive)?.org/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"

    def extractChapterUrlsAndMetadata(self):

        ## could be either chapter list page or one-shot text page.
        logger.debug("URL: "+self.url)

        (data,rurl) = self.get_request_redirected(self.url)
        if rurl != self.url:
            self._setURL(rurl)
            logger.debug("set to redirected url:%s"%self.url)
        soup = self.make_soup(data)

        # If chapter list page, get the first chapter to look for adult check
        chapterlinklist = soup.select('h5.mb-1 > a')
        # logger.debug(chapterlinklist)

        if not chapterlinklist:
            # no chapter list, it's either a chapter URL or a single chapter story
            # <nav aria-label="Chapter Navigation">
            # <a class="page-link" href="/authors/mz_xxo/HPATOTFI.html">Index</a>
            storya = soup.select_one('nav[aria-label="Chapter Navigation"] a')
            # logger.debug(storya)
            if storya:
                ## multi chapter story
                self._setURL(self.getURLDomain()+storya['href'])
                logger.debug("Normalizing to URL: "+self.url)
                # ## title's right there...
                # self.story.setMetadata('title',stripHTML(storya))
                data = self.get_request(self.url)
                soup = self.make_soup(data)
                chapterlinklist = soup.select('h5.mb-1 > a')
                # logger.debug(chapterlinklist)
            else:
                ## single chapter story.
                # logger.debug("Single chapter story")
                pass

        self.story.setMetadata('title',stripHTML(soup.select_one('h1')))

        ## authorid already set.
        ## <h1 class="title" align="center">Just Off The Platform II by <a href="http://www.fictionalley.org/authors/drt/">DrT</a></h1>
        authora=soup.select_one('h1 + h3 > a')
        self.story.setMetadata('author',stripHTML(authora))
        self.story.setMetadata('authorUrl',self.getURLDomain()+authora['href'])

        if chapterlinklist:
            # Find the chapters:
            for chapter in chapterlinklist:
                listitem = chapter.parent.parent.parent
                # logger.debug(listitem)
                # date
                date = stripHTML(listitem.select_one('small.text-nowrap'))
                chapterDate = makeDate(date,self.dateformat)
                wordshits = listitem.select('span.font-weight-normal')
                chap_data = {
                    'date':chapterDate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format","%Y-%m-%d"))),
                    'words':stripHTML(wordshits[0]),
                    'hits':stripHTML(wordshits[1]),
                    'summary':stripHTML(listitem.select_one('p.my-2')),
                    }
                # logger.debug(chap_data)
                self.add_chapter(chapter,self.getURLDomain()+chapter['href'], chap_data)
        else:
            self.add_chapter(self.story.getMetadata('title'),self.url)

        cardbody = soup.select_one('div.card-body')

        searchs_to_meta = (
            # sitetype, ffftype, islist
            ('Rating', 'rating', False),
            ('House', 'house', True),
            ('Character', 'characters', True),
            ('Genre', 'genre', True),
            ('Era', 'era', True),
            ('Spoiler', 'spoilers', True),
            ('Ship', 'ships', True),
            )
        for (sitetype,ffftype, islist) in searchs_to_meta:
            # logger.debug((sitetype,ffftype, islist))
            tags = cardbody.select('a[href^="/stories?Include.%s"]'%sitetype)
            # logger.debug(tags)
            if tags:
                if islist:
                    self.story.extendList(ffftype, [ stripHTML(a) for a in tags ])
                else:
                    self.story.setMetadata(ffftype, stripHTML(tags[0]))

        # Published: 09/26/2003 Updated: 04/13/2004 Words: 14,268 Chapters: 5 Hits: 743
        badgeinfos = cardbody.select('div.badge-info')
        # logger.debug(badgeinfos)
        for badge in badgeinfos:
            txt = stripHTML(badge)
            (key,val)=txt.split(':')
            # logger.debug((key,val))
            if key in ( 'Published', 'Updated'):
                date = makeDate(val,self.dateformat)
                self.story.setMetadata('date'+key,date)
            elif key in ('Hits'):
                self.story.setMetadata(key.lower(),val)
            elif key == 'Words':
                self.story.setMetadata('numWords',val)

        summary = soup.find('dt',text='Story Summary:')
        if summary:
            summary = summary.find_next_sibling('dd')
            summary.name='div'
            self.setDescription(self.url,summary)

        return

    def getChapterText(self, url):

        logger.debug('Getting chapter text from: %s' % url)

        data = self.get_request(url)
        soup = self.make_soup(data)

        # this may be a brittle way to get the chapter text.
        # Site doesn't give a lot of hints.
        chaptext = soup.select_one('main#content div:not([class])')

        # not sure how, but we can get html, etc tags still in some
        # stories. That breaks later updates because it confuses
        # epubutils.py
        # Yes, this still applies to fictionalley-archive.
        for tag in chaptext.findAll('head') + chaptext.findAll('meta') + chaptext.findAll('script'):
            tag.extract()

        for tag in chaptext.findAll('body') + chaptext.findAll('html'):
            tag.name = 'div'

        if self.getConfig('include_author_notes'):
            row = chaptext.find_previous_sibling('div',class_='row')
            logger.debug(row)
            andt = row.find('dt',text="Author's Note:")
            logger.debug(andt)
            if andt:
                chaptext.insert(0,andt.parent.extract())
            # post notes aren't as structured(?)
            for div in chaptext.find_next_siblings('div',class_='row'):
                chaptext.append(div.extract())

        # logger.debug(chaptext)
        return self.utf8FromSoup(url,chaptext)

def getClass():
    return FictionAlleyArchiveOrgSiteAdapter
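
As a quick illustration (not part of the commit), the pattern returned by getSiteURLPattern accepts both the old and the new domain, after which __init__ rebuilds a canonical www.fictionalley-archive.org URL:

    import re

    # The same pattern the adapter returns from getSiteURLPattern().
    pattern = r"https?://www.fictionalley(-archive)?.org/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"
    for url in ("http://www.fictionalley.org/authors/drt/JOTP01a.html",
                "https://www.fictionalley-archive.org/authors/drt/DA.html"):
        m = re.match(pattern, url)
        print(m.group('auth'), m.group('id'))  # drt JOTP01a, then drt DA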
fanficfare/adapters/adapter_fictionalleyorg.py (deleted file, 228 lines)
@@ -1,228 +0,0 @@
# -*- coding: utf-8 -*-

# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import absolute_import
import logging
logger = logging.getLogger(__name__)
import re

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

# py2 vs py3 transition

from .base_adapter import BaseSiteAdapter, makeDate


class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev','fa')
        self.is_adult=False

        # get storyId from url--url validation guarantees query correct
        m = re.match(self.getSiteURLPattern(),url)
        if m:
            self.story.setMetadata('authorId',m.group('auth'))
            self.story.setMetadata('storyId',m.group('id'))

            # normalized story URL.
            self._setURL(url)
        else:
            raise exceptions.InvalidStoryURL(url,
                                             self.getSiteDomain(),
                                             self.getSiteExampleURLs())

    @staticmethod
    def getSiteDomain():
        return 'www.fictionalley.org'

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://"+cls.getSiteDomain()+"/authors/drt/DA.html http://"+cls.getSiteDomain()+"/authors/drt/JOTP01a.html"

    def getSiteURLPattern(self):
        # http://www.fictionalley.org/authors/drt/DA.html
        # http://www.fictionalley.org/authors/drt/JOTP01a.html
        return re.escape("http://"+self.getSiteDomain())+r"/authors/(?P<auth>[a-zA-Z0-9_]+)/(?P<id>[a-zA-Z0-9_]+)\.html"

    def _postFetchWithIAmOld(self,url):
        if self.is_adult or self.getConfig("is_adult"):
            params={'iamold':'Yes',
                    'action':'ageanswer'}
            logger.info("Attempting to get cookie for %s" % url)
            ## posting on list doesn't work, but doesn't hurt, either.
            data = self.post_request(url,params)
        else:
            data = self.get_request(url)
        return data

    def extractChapterUrlsAndMetadata(self):

        ## could be either chapter list page or one-shot text page.
        url = self.url
        logger.debug("URL: "+url)

        data = self._postFetchWithIAmOld(url)

        soup = self.make_soup(data)

        chapterdata = data
        # If chapter list page, get the first chapter to look for adult check
        chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        if chapterlinklist:
            chapterdata = self._postFetchWithIAmOld(chapterlinklist[0]['href'])

        if "Are you over seventeen years old" in chapterdata:
            raise exceptions.AdultCheckRequired(self.url)

        if not chapterlinklist:
            # no chapter list, chapter URL: change to list link.
            # second a tag inside div breadcrumbs
            storya = soup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self._setURL(storya['href'])
            url=self.url
            logger.debug("Normalizing to URL: "+url)
            ## title's right there...
            self.story.setMetadata('title',stripHTML(storya))
            data = self.get_request(url)
            soup = self.make_soup(data)
            chapterlinklist = soup.findAll('a',{'class':'chapterlink'})
        else:
            ## still need title from somewhere. If chapterlinklist,
            ## then chapterdata contains a chapter, find title the
            ## same way.
            chapsoup = self.make_soup(chapterdata)
            storya = chapsoup.find('div',{'class':'breadcrumbs'}).findAll('a')[1]
            self.story.setMetadata('title',stripHTML(storya))
            del chapsoup

        del chapterdata

        ## authorid already set.
        ## <h1 class="title" align="center">Just Off The Platform II by <a href="http://www.fictionalley.org/authors/drt/">DrT</a></h1>
        authora=soup.find('h1',{'class':'title'}).find('a')
        self.story.setMetadata('author',authora.string)
        self.story.setMetadata('authorUrl',authora['href'])

        if len(chapterlinklist) == 1:
            self.add_chapter(self.story.getMetadata('title'),chapterlinklist[0]['href'])
        else:
            # Find the chapters:
            for chapter in chapterlinklist:
                # just in case there's tags, like <i> in chapter titles.
                self.add_chapter(chapter,chapter['href'])

        ## Go scrape the rest of the metadata from the author's page.
        data = self.get_request(self.story.getMetadata('authorUrl'))
        soup = self.make_soup(data)

        # <dl><dt><a class = "Rid story" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/TMH.html">
        # [Rid] The Magical Hottiez</a> by <a class = "pen_name" href = "http://www.fictionalley.org/authors/aafro_man_ziegod/">Aafro Man Ziegod</a> </small></dt>
        # <dd><small class = "storyinfo"><a href = "http://www.fictionalley.org/ratings.html" target = "_new">Rating:</a> PG-13 - Spoilers: PS/SS, CoS, PoA, GoF, QTTA, FB - 4264 hits - 5060 words<br />
        # Genre: Humor, Romance - Main character(s): None - Ships: None - Era: Multiple Eras<br /></small>
        # Chaos ensues after Witch Weekly, seeking to increase readers, decides to create a boyband out of five seemingly talentless wizards: Harry Potter, Draco Malfoy, Ron Weasley, Neville Longbottom, and Oliver "Toss Your Knickers Here" Wood.<br />
        # <small class = "storyinfo">Published: June 3, 2002 (between Goblet of Fire and Order of Phoenix) - Updated: June 3, 2002</small>
        # </dd></dl>

        storya = soup.find('a',{'href':self.story.getMetadata('storyUrl')})
        storydd = storya.findNext('dd')

        # Rating: PG - Spoilers: None - 2525 hits - 736 words
        # Genre: Humor - Main character(s): H, R - Ships: None - Era: Multiple Eras
        # Harry and Ron are back at it again! They reeeeeeally don't want to be back, because they know what's awaiting them. "VH1 Goes Inside..." is back! Why? 'Cos there are soooo many more couples left to pick on.
        # Published: September 25, 2004 (between Order of Phoenix and Half-Blood Prince) - Updated: September 25, 2004

        ## change to text and regexp find.
        metastr = stripHTML(storydd).replace('\n',' ').replace('\t',' ')

        m = re.match(r".*?Rating: (.+?) -.*?",metastr)
        if m:
            self.story.setMetadata('rating', m.group(1))

        m = re.match(r".*?Genre: (.+?) -.*?",metastr)
        if m:
            for g in m.group(1).split(','):
                self.story.addToList('genre',g)

        m = re.match(r".*?Published: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('datePublished',makeDate(m.group(1), "%B %d, %Y"))

        m = re.match(r".*?Updated: ([a-zA-Z]+ \d\d?, \d\d\d\d).*?",metastr)
        if m:
            self.story.setMetadata('dateUpdated',makeDate(m.group(1), "%B %d, %Y"))

        m = re.match(r".*? (\d+) words Genre.*?",metastr)
        if m:
            self.story.setMetadata('numWords', m.group(1))

        for small in storydd.findAll('small'):
            small.extract() ## removes the <small> tags, leaving only the summary.
        storydd.name = 'div' ## change tag name else Calibre treats it oddly.
        self.setDescription(url,storydd)
        #self.story.setMetadata('description',stripHTML(storydd))

        return

    def getChapterText(self, url):

        logger.debug('Getting chapter text from: %s' % url)

        data = self.get_request(url)
        # find <!-- headerend --> & <!-- footerstart --> and
        # replaced with matching div pair for easier parsing.
        # Yes, it's an evil kludge, but what can ya do? Using
        # something other than div prevents soup from pairing
        # our div with poor html inside the story text.
        crazy = "crazytagstringnobodywouldstumbleonaccidently"
        data = data.replace('<!-- headerend -->','<'+crazy+' id="storytext">').replace('<!-- footerstart -->','</'+crazy+'>')

        # problems with some stories confusing Soup. This is a nasty
        # hack, but it works.
        data = data[data.index('<'+crazy+''):]
        # ditto with extra crap at the end.
        data = data[:data.index('</'+crazy+'>')+len('</'+crazy+'>')]

        soup = self.make_soup(data)
        body = soup.findAll('body') ## some stories use a nested body and body
                                    ## tag, in which case we don't
                                    ## need crazytagstringnobodywouldstumbleonaccidently
                                    ## and use the second one instead.
        if len(body)>1:
            text = body[1]
            text.name='div' # force to be a div to avoid multiple body tags.
        else:
            text = soup.find(crazy, {'id' : 'storytext'})
            text.name='div' # change to div tag.

        if not data or not text:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)

        # not sure how, but we can get html, etc tags still in some
        # stories. That breaks later updates because it confuses
        # epubutils.py
        for tag in text.findAll('head'):
            tag.extract()

        for tag in text.findAll('body') + text.findAll('html'):
            tag.name = 'div'

        return self.utf8FromSoup(url,text)

def getClass():
    return FictionAlleyOrgSiteAdapter
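
For contrast with the CSS selectors in the replacement adapter, the marker kludge the deleted getChapterText depended on is easiest to see on a toy input (illustrative HTML, not a captured page):

    # The old site bracketed the story body only with HTML comments, so the
    # code rewrote them into a unique tag the parser could find reliably.
    page = 'menu junk<!-- headerend --><p>story text</p><!-- footerstart -->footer junk'
    crazy = "crazytagstringnobodywouldstumbleonaccidently"
    page = page.replace('<!-- headerend -->', '<' + crazy + ' id="storytext">')
    page = page.replace('<!-- footerstart -->', '</' + crazy + '>')
    # The story text is now wrapped in a tag that
    # soup.find(crazy, {'id': 'storytext'}) can locate despite sloppy markup.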
@@ -74,7 +74,7 @@ rating_label:Rating
 warnings_label:Warnings
 numChapters_label:Chapters
 numWords_label:Words
-## www.fanfiction.net, fictionalley.com, etc.
+## www.fanfiction.net, fictionalley-archive.com, etc.
 site_label:Publisher
 ## ffnet, fpcom, etc.
 siteabbrev_label:Site Abbrev
@@ -2993,24 +2993,46 @@ use_basic_cache:true
 ## for examples of how to use them.
 extra_valid_entries:native_status

-[www.fictionalley.org]
-## Some sites do not require a login, but do require the user to
-## confirm they are adult for adult content. In commandline version,
-## this should go in your personal.ini, not defaults.ini.
-#is_adult:true
+[www.fictionalley-archive.org]
+## also accepts fictionalley.org story URLs
 use_basic_cache:true

 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter

-## fictionally.org storyIds are not unique. Combine with authorId.
+## fictionalley-archive.org storyIds are not unique. Combine with authorId.
 output_filename: ${title}-${siteabbrev}_${authorId}_${storyId}${formatext}

-## fictionalley.org doesn't have a status metadatum. If uncommented,
+## fictionalley-archive.org doesn't have a status metadatum. If uncommented,
 ## this will be used for status.
 #default_value_status:Unknown

 website_encodings:Windows-1252,utf8
 slow_down_sleep_time:10
+## Extra metadata that this adapter knows about. See [archiveofourown.org]
+## for examples of how to use them.
+extra_valid_entries:house,era,spoilers,hits
+
+## fictionalley-archive chapters have 'date', 'words', 'hits' and
+## 'summary' available for each chapter. These can be used with
+## custom output (see
+## https://github.com/JimmXinu/FanFicFare/wiki/CustomOutput ) or with
+## chapter_title_*_pattern settings.
+## Examples for html & epub:
+#[html]
+#tocpage_entry:
+# <a href="#section${index04}">${chapter}</a> ${date} ${words}<br />
+#[epub]
+#tocpage_entry:
+# <a href="file${index04}.xhtml">${chapter}</a> ${date} ${words}<br /><br />
+
+## The 'date' value for chapters mentioned above can be formated with
+## datechapter_format. Otherwise it will default to
+## datePublished_format
+#datechapter_format:%%Y-%%m-%%d
+
+## fictionalley-archive.org chapters can have author notes attached to
+## them. Setting include_author_notes:true will include them with the
+## chapter text. Includes both leading and trailing notes.
+#include_author_notes:false

 [www.fictionpress.com]
 ## Using cloudscraper can satisfy the first couple levels of
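
A footnote on #datechapter_format:%%Y-%%m-%%d: the percent signs are doubled because the ini files go through Python's configparser, which reserves % for value interpolation, so a literal percent must be written %%. The adapter then hands the unescaped pattern to strftime; a sketch of the round-trip, reusing the sample 'Published' date from the adapter's comments:

    from datetime import datetime

    # Parse with the site's date format, then render with the default
    # chapter-date format used by chap_data in the adapter.
    d = datetime.strptime("09/26/2003", "%m/%d/%Y")
    print(d.strftime("%Y-%m-%d"))  # 2003-09-26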