Adding sites chosentwofanfic.com www.asexstories.com www.bdsmlibrary.com www.ficsite.com -- Thanks GComyn!

This commit is contained in:
Jim Miller 2016-10-30 10:55:34 -05:00
parent 1e0cc699c2
commit 9961c59402
7 changed files with 999 additions and 0 deletions

View file

@ -936,6 +936,11 @@ strip_text_links:true
## Site dedicated to these categories/characters/ships
extracategories:Blood Ties
[chosentwofanfic.com]
extra_valid_entries:disclaimer
disclaimer_label: Disclaimer
add_to_titlepage_entries:,disclaimer
[fanfic.castletv.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -1790,6 +1795,27 @@ readings_label:Readings
## Site dedicated to these categories/characters/ships
extracategories:Star Trek
[www.asexstories.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
## This site has links to a video site embedded in the text. They are
## not needed, and will be removed if the below property is set to True
strip_text_links:true
[www.bdsmlibrary.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
extra_valid_entries:eroticatags,size,comments
size_label: Size
comments_label: Comments
eroticatags_label:Erotica Tags
extra_titlepage_entries:size,comments,eroticatags
keep_style_attr: false
[www.dracoandginny.com]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -1917,6 +1943,11 @@ check_next_chapter:false
## this should go in your personal.ini, not defaults.ini.
#is_adult:true
[www.ficsite.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
[www.fictionalley.org]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,

View file

@ -145,6 +145,10 @@ import adapter_fanfictionlucifaelcom
import adapter_adultfanfictionorg
import adapter_fictionhuntcom
import adapter_royalroadl
import adapter_chosentwofanficcom
import adapter_bdsmlibrarycom
import adapter_ficsitecom
import adapter_asexstoriescom
## This bit of complexity allows adapters to be added by just adding
## importing. It eliminates the long if/else clauses we used to need

View file

@ -0,0 +1,174 @@
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team, 2016 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import urlparse
import time
import os
from bs4.element import Comment
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
import sys
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Entry point used by the adapter loader: return this module's adapter class."""
    return ASexStoriesComAdapter
class ASexStoriesComAdapter(BaseSiteAdapter):
    """Adapter for www.asexstories.com.

    The site exposes almost no structured metadata on the story page
    itself, so the description is synthesized from the first 150
    characters of the story text and the update date is a fixed
    placeholder.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252",
                       "iso-8859-1"]
        # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.

        # Each adapter needs a unique site abbreviation.
        self.story.setMetadata('siteabbrev','asscom')

        # Extract story ID from base URL, http://www.asexstories.com/Halloween-party-with-the-phantom/
        storyId = self.parsedUrl.path.split('/',)[1]
        self.story.setMetadata('storyId', storyId)

        ## set url
        self._setURL(url)

    @staticmethod
    def getSiteDomain():
        """Domain this adapter handles."""
        return 'www.asexstories.com'

    @classmethod
    def getAcceptDomains(cls):
        """All domains accepted for incoming story URLs."""
        return ['www.asexstories.com']

    @classmethod
    def getSiteExampleURLs(cls):
        """Example story URL shown to users."""
        return "http://www.asexstories.com/StoryTitle/"

    def getSiteURLPattern(self):
        """Regex a story URL must match for this adapter to claim it."""
        return r"https?://(www\.)?asexstories\.com/([a-zA-Z0-9_-]+)/"

    def extractChapterUrlsAndMetadata(self):
        """
        Chapters are located at /StoryName/ (for single-chapter
        stories), or //StoryName/index#.html for multiple chapters (# is a
        non-padded incrementing number, like StoryName1, StoryName2.html, ...,
        StoryName10.html)

        This site doesn't have much in the way of metadata, except on the
        Category and Tags index pages. so we will get what we can.

        Also, as this is an Adult site, the is_adult check is mandatory.

        Raises AdultCheckRequired when is_adult isn't set, and
        StoryDoesNotExist on 404 or the site's "Page Not Found." text.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup1 = self.make_soup(data1)
            #strip comments from soup
            [comment.extract() for comment in soup1.find_all(text=lambda text:isinstance(text, Comment))]
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # Site returns a normal page with this text for missing stories.
        if 'Page Not Found.' in data1:
            raise exceptions.StoryDoesNotExist(self.url)

        url = self.url

        # Extract metadata

        # Title
        title = soup1.find('div',{'class':'story-top-block'}).find('h1')
        self.story.setMetadata('title', title.string)

        # Author -- second 'story-info-bl' div holds the author link.
        author = soup1.find('div',{'class':'story-info'}).findAll('div',{'class':'story-info-bl'})[1].find('a')
        authorurl = author['href']
        self.story.setMetadata('author', author.string)
        self.story.setMetadata('authorUrl', authorurl)
        # authorId is the author page's filename without extension.
        authorid = os.path.splitext(os.path.basename(authorurl))[0]
        self.story.setMetadata('authorId', authorid)

        # Description
        ### The only way to get the Description (summary) is to
        ### parse through the Category and/or Tags index pages.
        ### To get a summary, I've taken the first 150 characters
        ### from the story.
        description = soup1.find('div',{'class':'story-block'}).get_text(strip=True)
        # Truncate on the encoded bytes, then decode back; 'ignore' drops
        # any multi-byte character cut in half at the 150-byte boundary.
        description = description.encode('utf-8','ignore').strip()[0:150].decode('utf-8','ignore')
        self.setDescription(url,'Excerpt from beginning of story: '+description+'...')

        # Get chapter URLs
        self.chapterUrls = []

        ### The first 'chapter' is not listed in the links, so we have to
        ### add it before the rest of the pages, if any
        self.chapterUrls.append(('1', self.url))

        # NOTE(review): findAll() returns a (possibly empty) list, never
        # None, so the guard below always takes the loop branch.
        chapterTable = soup1.find('div',{'class':'pages'}).findAll('a')
        if chapterTable is not None:
            # Multi-chapter story
            for page in chapterTable:
                chapterTitle = page.string
                chapterUrl = urlparse.urljoin(self.url, page['href'])
                self.chapterUrls.append((chapterTitle, chapterUrl))

        self.story.setMetadata('numChapters', len(self.chapterUrls))

        # Rating comes from the title attribute of the rating image.
        rated = soup1.find('div',{'class':'story-info'}).findAll('div',{'story-info-bl5'})[0].find('img')['title'].replace('- Rate','').strip()
        self.story.setMetadata('rating',rated)

        # Placeholder date -- presumably the site doesn't expose one; verify.
        self.story.setMetadata('dateUpdated', makeDate('01/01/2001', '%m/%d/%Y'))

        logger.debug("Story: <%s>", self.story)
        return

    def getChapterText(self, url):
        """Fetch one chapter page and return its cleaned story text."""
        logger.debug('Getting chapter text from <%s>' % url)
        #logger.info('Getting chapter text from <%s>' % url)

        data1 = self._fetchUrl(url)
        soup1 = self.make_soup(data1)

        # get story text
        story1 = soup1.find('div', {'class':'story-block'})

        ### This site has links embeded in the text that lead
        ### to either a video site, or to a tags index page
        ### the default is to remove them, but you can set the
        ### strip_text_links to false to keep them in the text
        if self.getConfig('strip_text_links'):
            for anchor in story1('a', {'target': '_blank'}):
                anchor.replaceWith(anchor.string)

        return self.utf8FromSoup(url, story1)

View file

@ -0,0 +1,226 @@
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2015 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###########################################################################
### written by GComyn - 10/06/2016
### updated by GComyn = 10/24/2016
###########################################################################
'''
This works, but some of the stories have abysmal formatting, so it would
probably need to be edited for reading.
I've seen one story that downloaded at 25M, but after editing is only 201K
after the formatting was corrected.
Right now it is written to download each chapter separately, but I may change
that to get the whole story. It will still have formatting problems, but should
be able to get the longer stories this way.
Also, the site is notorious for lagging, so some of the longer stories will
probably not be downloadable, since this program doesn't wait long enough
for the site to catch up.
'''
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib
import urllib2
import sys
import urlparse
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Entry point used by the adapter loader: return this module's adapter class."""
    return BDSMLibraryComSiteAdapter
class BDSMLibraryComSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.decode = ["utf8",
"Windows-1252",
"iso-8859-1"] # 1252 is a superset of iso-8859-1.
# Most sites that claim to be
# iso-8859-1 (and some that claim to be
# utf8) are really windows-1252.
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
self.is_adult=False
# get storyId from url--url validation guarantees query is only storyid=1234
self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
self._setURL('http://{0}/stories/story.php?storyid={1}'.format(self.getSiteDomain(), self.story.getMetadata('storyId')))
# Each adapter needs to have a unique site abbreviation.
self.story.setMetadata('siteabbrev','bdsmlib')
# The date format will vary from site to site.
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
self.dateformat = "%b %d, %Y"
@staticmethod # must be @staticmethod, don't remove it.
def getSiteDomain():
# The site domain. Does have www here, if it uses it.
return 'www.bdsmlibrary.com'
@classmethod
def getSiteExampleURLs(cls):
return "http://"+cls.getSiteDomain()+"/stories/story.php?storyid=1234"
def getSiteURLPattern(self):
return re.escape("http://"+self.getSiteDomain()+"/stories/story.php?storyid=")+r"\d+$"
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def extractChapterUrlsAndMetadata(self):
if not (self.is_adult or self.getConfig("is_adult")):
raise exceptions.AdultCheckRequired(self.url)
try:
data = self._fetchUrl(self.url)
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
if 'The story does not exist' in data:
raise exceptions.StoryDoesNotExist(self.url)
# Extract metadata
title=soup.title.text.replace('BDSM Library - Story: ','')
self.story.setMetadata('title', title)
# Author
author = soup.find('a', href=re.compile(r"/stories/author.php\?authorid=\d+"))
i = 0
while author == None:
time.sleep(1)
logger.warning('A problem retrieving the author information. Trying Again')
try:
data = self._fetchUrl(self.url)
soup = self.make_soup(data)
except urllib2.HTTPError, e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
author = soup.find('a', href=re.compile(r"/stories/author.php\?authorid=\d+"))
print author
i += 1
if i == 20:
logger.info('Too Many cycles... exiting')
sys.exit()
authorurl = urlparse.urljoin(self.url, author['href'])
self.story.setMetadata('author', author.text)
self.story.setMetadata('authorUrl', authorurl)
authorid = author['href'].split('=')[1]
self.story.setMetadata('authorId', authorid)
# Find the chapters:
# The update date is with the chapter links... so we will update it here as well
for a in soup.findAll('a'):
if '/stories/chapter.php?storyid='+self.story.getMetadata('storyId')+"&chapterid=" in a['href']:
value = a.findNext('td').findNext('td').string.replace('(added on','').replace(')','').strip()
self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))
self.chapterUrls.append((stripHTML(a),'http://'+self.getSiteDomain()+a['href']))
# I can't seem to get the re.compile to work for this. so I'm commenting it out
#for chapter in soup.findAll('a', href=re.compile(r'/stories/chapter.php?storyid='+self.story.getMetadata('storyId')+"&chapterid=\d+$")):
# # just in case there's tags, like <i> in chapter titles.
# self.chapterUrls.append((stripHTML(chapter),'http://'+self.getSiteDomain()+chapter['href']+addurl))
self.story.setMetadata('numChapters',len(self.chapterUrls))
# Get the MetaData
# Erotia Tags
tags = soup.findAll('a',href=re.compile(r'/stories/search.php\?selectedcode'))
for tag in tags:
self.story.addToList('eroticatags',tag.text)
# Published Date
tds = soup.findAll('td')
for td in tds:
if len(td.text)>0:
if 'Added on:' in td.text and '<table' not in unicode(td):
value = td.text.replace('Added on:','').strip()
self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))
elif 'Synopsis:' in td.text and '<table' not in unicode(td):
value = td.text.replace('\n','').replace('Synopsis:','').strip()
self.setDescription(self.url,stripHTML(value))
elif 'Size:' in td.text and '<table' not in unicode(td):
value = td.text.replace('\n','').replace('Size:','').strip()
self.story.setMetadata('size',stripHTML(value))
elif 'Comments:' in td.text and '<table' not in unicode(td):
value = td.text.replace('\n','').replace('Comments:','').strip()
self.story.setMetadata('comments',stripHTML(value))
# grab the text for an individual chapter.
def getChapterText(self, url):
#Since each chapter is on 1 page, we don't need to do anything special, just get the content of the page.
logger.debug('Getting chapter text from: %s' % url)
logger.info('Getting chapter text from: %s' % url)
soup = self.make_soup(self._fetchUrl(url))
chaptertag = soup.find('div',{'class' : 'storyblock'})
# Some of the stories have the chapters in <pre> sections, so have to check for that
if chaptertag == None:
chaptertag = soup.find('pre')
try:
# BDSM Library basically wraps it's own html around the document,
# so we will be removing the script, title and meta content from the
# storyblock
scripts = chaptertag.findAll('style')
if scripts != None:
for script in scripts:
script.extract()
titles = chaptertag.findAll('title')
if titles !=None:
for title in titles:
title.extract()
metas = chaptertag.findAll('meta')
if metas !=None:
for meta in metas:
meta.extract()
except:
pass
if None == chaptertag:
raise exceptions.FailedToDownload("Error downloading Chapter: {0}! Missing required element!".format(url))
return self.utf8FromSoup(url,chaptertag)

View file

@ -0,0 +1,241 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team, 2015 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Software: eFiction
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import sys
from bs4.element import Comment
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Entry point used by the adapter loader: return this module's adapter class."""
    return ChosenTwoFanFicArchiveAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class ChosenTwoFanFicArchiveAdapter(BaseSiteAdapter):
    """Adapter for chosentwofanfic.com, a standard eFiction site.

    Metadata is scraped by walking the siblings of each
    <span class="label"> until the next label span is reached.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8",
                       "iso-8859-1"] # 1252 is a superset of iso-8859-1.
                                     # Most sites that claim to be
                                     # iso-8859-1 (and some that claim to be
                                     # utf8) are really windows-1252.

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','chosen2')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'chosentwofanfic.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Example story URL shown to users."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Regex a story URL must match for this adapter to claim it."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and collect chapter URLs and metadata.

        Raises AdultCheckRequired when the site's NC-17 interstitial is
        detected and is_adult isn't set; AccessDenied for unvalidated
        stories; StoryDoesNotExist on 404.
        """
        # checking to see if the is_adult is set to true
        if self.is_adult or self.getConfig("is_adult"):
            # ageconsent/warning params bypass the site's adult interstitial.
            addURL = "&ageconsent=ok&warning=3"
        else:
            addURL = ""

        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = '{0}&index=1{1}'.format(self.url,addURL)
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if "Content is only suitable for mature adults. May contain explicit language and adult themes. Equivalent of NC-17." in data:
            raise exceptions.AdultCheckRequired(self.url)

        # NOTE: 'adminstrators' matches the site's own (misspelled) text.
        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied("{0} says: Access denied. This story has not been validated by the adminstrators of this site.".format(self.getSiteDomain()))

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)

        # Now go hunting for all the meta data and the chapter list.

        ## Title
        ## Some stories have a banner that has it's own a tag before the actual text title...
        ## so I'm checking the pagetitle div for all a tags that match the criteria, then taking the last.
        a = soup.find('div',{'id':'pagetitle'}).findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))[-1]
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        # This site lists the newest member to the site before the div that has the story info
        # so I'm checking the pagetitle div for this as well
        a = soup.find('div',{'id':'pagetitle'}).find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            # addURL is carried onto each chapter URL so chapter fetches
            # also bypass the adult interstitial.
            #self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))
            self.chapterUrls.append((stripHTML(chapter),'http://{0}/{1}{2}'.format(self.host, chapter['href'],addURL)))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: tag attribute lookup that returns "" instead of
        # raising when the attribute is missing.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        # Walk each label's following siblings, accumulating their text
        # until the next label span starts.
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            val = labelspan.nextSibling
            value = unicode('')
            while val and not 'label' in defaultGetattr(val,'class'):
                # print("val:%s"%val)
                if not isinstance(val,Comment):
                    value += unicode(val)
                val = val.nextSibling
            label = labelspan.string
            # print("label:%s\nvalue:%s"%(label,value))
            # NOTE(review): if a label span has no direct string, label is
            # None and the 'in' tests below would raise -- presumably every
            # label on this site has plain text; verify.

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rated' in label:
                self.story.setMetadata('rating', stripHTML(value))

            if 'Word count' in label:
                self.story.setMetadata('numWords', stripHTML(value))

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Pairing' in label:
                ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
                for ship in ships:
                    self.story.addToList('ships',ship.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in stripHTML(value):
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

            if 'Disclaimer' in label:
                self.story.setMetadata('disclaimer', stripHTML(value))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except:
            # I find it hard to care if the series parsing fails
            pass

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the story div's content.

        Raises FailedToDownload when the story div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = self.make_soup(self._fetchUrl(url))

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -0,0 +1,292 @@
# -*- coding: utf-8 -*-
# Copyright 2012 Fanficdownloader team, 2015 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Software: eFiction
import time
import logging
logger = logging.getLogger(__name__)
import re
import urllib2
import sys
from bs4.element import Comment
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
    """Entry point used by the adapter loader: return this module's adapter class."""
    return HPFanficArchiveComAdapter
# Class name has to be unique. Our convention is camel case the
# sitename with Adapter at the end. www is skipped.
class HPFanficArchiveComAdapter(BaseSiteAdapter):
    """Adapter for www.ficsite.com, an eFiction site with login support
    and several different adult-content warning interstitials.

    NOTE(review): the class name doesn't match the domain returned by
    getSiteDomain() -- it was apparently carried over from the adapter
    this file was copied from.  Renaming it would also require changing
    getClass() above.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["Windows-1252",
                       "utf8", "iso-8859-1"]
        # 1252 is a superset of iso-8859-1.
        # Most sites that claim to be
        # iso-8859-1 (and some that claim to be
        # utf8) are really windows-1252.

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        # get storyId from url--url validation guarantees query is only sid=1234
        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/viewstory.php?sid='+self.story.getMetadata('storyId'))

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev','ficsite')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%m/%d/%Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain. Does have www here, if it uses it.
        return 'www.ficsite.com'

    @classmethod
    def getSiteExampleURLs(cls):
        """Example story URL shown to users."""
        return "http://"+cls.getSiteDomain()+"/viewstory.php?sid=1234"

    def getSiteURLPattern(self):
        """Regex a story URL must match for this adapter to claim it."""
        return re.escape("http://"+self.getSiteDomain()+"/viewstory.php?sid=")+r"\d+$"

    ## Login seems to be reasonably standard across eFiction sites.
    def needToLoginCheck(self, data):
        """Return True when the fetched page text indicates a login is required."""
        if 'Registered Users Only' in data \
                or 'There is no such account on our website' in data \
                or "That password doesn't match the one in our database" in data:
            return True
        else:
            return False

    def performLogin(self, url):
        """Log in to the site; raises FailedToLogin on failure, returns True on success."""
        params = {}

        # Adapter-level credentials (if set) take priority over config.
        if self.password:
            params['penname'] = self.username
            params['password'] = self.password
        else:
            params['penname'] = self.getConfig("username")
            params['password'] = self.getConfig("password")
        params['cookiecheck'] = '1'
        params['submit'] = 'Submit'

        loginUrl = 'http://' + self.getSiteDomain() + '/user.php?action=login'
        logger.debug("Will now login to URL (%s) as (%s)" % (loginUrl,
                                                             params['penname']))

        d = self._fetchUrl(loginUrl, params)

        # A successful login lands on a page containing "Member Account".
        if "Member Account" not in d : #Member Account
            logger.info("Failed to login to URL %s as %s" % (loginUrl,
                                                             params['penname']))
            raise exceptions.FailedToLogin(url,params['penname'])
            return False  # unreachable after the raise
        else:
            return True

    # I've added this because there are several warnings
    # that are used by this site.
    def getWarning(self, data):
        """Map a warning interstitial's text to the query params that bypass it.

        Returns the param string, or False when no warning text was found.
        """
        if "This story contains adult subject matter that may include coarse language, violence, and mild sexual content of a graphical nature. Reader discretion is requested. Thank you." in data:
            return '&ageconsent=ok&warning=5'
        elif "This story contains graphical material of an adult nature and a same sex primary relationship. Please do not read if this is not to your taste. Thank you." in data:
            return '&warning=7'
        elif "This story contains graphical material of an adult nature. Reader discretion is requested. Thank you." in data:
            return '&warning=6'
        else:
            return False

    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):
        """Fetch the story index page and collect chapter URLs and metadata.

        Handles login, the site's several adult-content warnings, and
        raises AccessDenied / StoryDoesNotExist as appropriate.
        """
        if (self.is_adult or self.getConfig("is_adult")):
            addurl = '&index=1&ageconsent=ok&warning=5'
        else:
            addurl='&index=1'

        # index=1 makes sure we see the story chapter index. Some
        # sites skip that for one-chapter stories.
        url = self.url+addurl
        logger.debug("URL: "+url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        if self.needToLoginCheck(data):
            # need to log in for this one.
            self.performLogin(url)
            data = self._fetchUrl(url)

        # Refetch with bypass params if a warning interstitial came back.
        warning = self.getWarning(data)
        if warning != False:
            data = self._fetchUrl(url+warning)

        # NOTE: 'adminstrators' matches the site's own (misspelled) text.
        if "Access denied. This story has not been validated by the adminstrators of this site." in data:
            raise exceptions.AccessDenied(self.getSiteDomain() +" says: Access denied. This story has not been validated by the adminstrators of this site.")
        elif "This story contains adult subject matter that may include coarse language, violence, and mild sexual content of a graphical nature. Reader discretion is requested. Thank you." in data:
            raise exceptions.AccessDenied(self.getSiteDomain()+" says: This story contains adult subject matter that may include coarse language, violence, and mild sexual content of a graphical nature. Reader discretion is requested. Thank you.")
        elif "This story contains graphical material of an adult nature and a same sex primary relationship. Please do not read if this is not to your taste. Thank you." in data:
            raise exceptions.AccessDenied(self.getSiteDomain()+" says: This story contains graphical material of an adult nature and a same sex primary relationship. Please do not read if this is not to your taste. Thank you.")

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = self.make_soup(data)
        # print data

        # Now go hunting for all the meta data and the chapter list.

        ## Title and Author Div
        div = soup.find('div',{'id':'pagetitle'})

        ## Title
        a = div.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"$"))
        self.story.setMetadata('title',stripHTML(a))

        # Find authorid and URL from... author url.
        a = div.find('a', href=re.compile(r"viewuser.php\?uid=\d+"))
        self.story.setMetadata('authorId',a['href'].split('=')[1])
        self.story.setMetadata('authorUrl','http://'+self.host+'/'+a['href'])
        self.story.setMetadata('author',a.string)

        # Find the chapters:
        for chapter in soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")):
            # just in case there's tags, like <i> in chapter titles.
            self.chapterUrls.append((stripHTML(chapter),'http://'+self.host+'/'+chapter['href']))

        self.story.setMetadata('numChapters',len(self.chapterUrls))

        # eFiction sites don't help us out a lot with their meta data
        # formating, so it's a little ugly.

        # utility method: tag attribute lookup that returns "" instead of
        # raising when the attribute is missing.
        def defaultGetattr(d,k):
            try:
                return d[k]
            except:
                return ""

        # <span class="label">Rated:</span> NC-17<br /> etc
        # Walk each label's following siblings, accumulating their text
        # until the next label span starts.
        labels = soup.findAll('span',{'class':'label'})
        for labelspan in labels:
            val = labelspan.nextSibling
            value = unicode('')
            while val and not 'label' in defaultGetattr(val,'class'):
                # print("val:%s"%val)
                if not isinstance(val,Comment):
                    value += unicode(val)
                val = val.nextSibling
            label = labelspan.string
            # print("label:%s\nvalue:%s"%(label,value))

            if 'Summary' in label:
                self.setDescription(url,value)

            if 'Rated' in label:
                self.story.setMetadata('rating', stripHTML(value))

            if 'Word count' in label:
                self.story.setMetadata('numWords', stripHTML(value))

            if 'Categories' in label:
                cats = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=categories'))
                for cat in cats:
                    self.story.addToList('category',cat.string)

            if 'Characters' in label:
                chars = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=characters'))
                for char in chars:
                    self.story.addToList('characters',char.string)

            if 'Genre' in label:
                genres = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=1')) # XXX
                for genre in genres:
                    self.story.addToList('genre',genre.string)

            if 'Pairing' in label:
                ships = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=4'))
                for ship in ships:
                    self.story.addToList('ships',ship.string)

            if 'Warnings' in label:
                warnings = labelspan.parent.findAll('a',href=re.compile(r'browse.php\?type=class&type_id=2')) # XXX
                for warning in warnings:
                    self.story.addToList('warnings',warning.string)

            if 'Completed' in label:
                if 'Yes' in stripHTML(value):
                    self.story.setMetadata('status', 'Completed')
                else:
                    self.story.setMetadata('status', 'In-Progress')

            if 'Published' in label:
                self.story.setMetadata('datePublished', makeDate(stripHTML(value), self.dateformat))

            if 'Updated' in label:
                self.story.setMetadata('dateUpdated', makeDate(stripHTML(value), self.dateformat))

        try:
            # Find Series name from series URL.
            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
            series_name = a.string
            series_url = 'http://'+self.host+'/'+a['href']

            # use BeautifulSoup HTML parser to make everything easier to find.
            seriessoup = self.make_soup(self._fetchUrl(series_url))
            # can't use ^viewstory...$ in case of higher rated stories with javascript href.
            storyas = seriessoup.findAll('a', href=re.compile(r'viewstory.php\?sid=\d+'))
            i=1
            for a in storyas:
                # skip 'report this' and 'TOC' links
                if 'contact.php' not in a['href'] and 'index' not in a['href']:
                    if a['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
                        self.setSeries(series_name, i)
                        self.story.setMetadata('seriesUrl',series_url)
                        break
                    i+=1
        except:
            # I find it hard to care if the series parsing fails
            pass

    # grab the text for an individual chapter.
    def getChapterText(self, url):
        """Fetch one chapter page and return the story div's content.

        Raises FailedToDownload when the story div is missing.
        """
        logger.debug('Getting chapter text from: %s' % url)

        soup = self.make_soup(self._fetchUrl(url))

        div = soup.find('div', {'id' : 'story'})

        if None == div:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url,div)

View file

@ -982,6 +982,11 @@ strip_text_links:true
## Site dedicated to these categories/characters/ships
extracategories:Blood Ties
[chosentwofanfic.com]
extra_valid_entries:disclaimer
disclaimer_label: Disclaimer
add_to_titlepage_entries:,disclaimer
[fanfic.castletv.net]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -1809,6 +1814,27 @@ readings_label:Readings
## Site dedicated to these categories/characters/ships
extracategories:Star Trek
[www.asexstories.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
## This site has links to a video site embedded in the text. They are
## not needed, and will be removed if the below property is set to True
strip_text_links:true
[www.bdsmlibrary.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
extra_valid_entries:eroticatags,size,comments
size_label: Size
comments_label: Comments
eroticatags_label:Erotica Tags
extra_titlepage_entries:size,comments,eroticatags
keep_style_attr: false
[www.dracoandginny.com]
## Some sites require login (or login for some rated stories) The
## program can prompt you, or you can save it in config. In
@ -1927,6 +1953,11 @@ check_next_chapter:false
## this should go in your personal.ini, not defaults.ini.
#is_adult:true
[www.ficsite.com]
## Some sites also require the user to confirm they are adult for
## adult content. Uncomment by removing '#' in front of is_adult.
#is_adult:true
[www.fictionalley.org]
## Some sites do not require a login, but do require the user to
## confirm they are adult for adult content. In commandline version,