Mirror of https://github.com/JimmXinu/FanFicFare.git, synced 2026-05-08 21:11:59 +02:00
Remove gayauthors.org at the site admin's request.
This commit is contained in:
parent f207ba01fe
commit 8c5f784afb
1 changed file with 0 additions and 205 deletions
@@ -1,205 +0,0 @@
# -*- coding: utf-8 -*-

# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import datetime
import logging
import re
import urllib2
from urllib import unquote

from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions

from base_adapter import BaseSiteAdapter, makeDate


def getClass():
    return GayAuthorsAdapter
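
# Hedged sketch of how getClass() is presumably consumed: the downloader
# core imports each adapter module and instantiates whatever class this
# returns. The driver-side names below are assumptions, not shown here:
#
#   adapter_class = adapter_module.getClass()
#   adapter = adapter_class(config, url)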


# Class name has to be unique.  Our convention is camel case the
# sitename with Adapter at the end.  www is skipped.
class GayAuthorsAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
                                       # Most sites that claim to be
                                       # iso-8859-1 (and some that claim to be
                                       # utf8) are really windows-1252.
        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult = False

        # get storyId and authorId from the URL path: /story/<author>/<storyid>
        self.story.setMetadata('storyId', self.parsedUrl.path.split('/',)[3])
        logging.debug("storyId: (%s)" % self.story.getMetadata('storyId'))

        # unquote, change '_' and ' ' to '-', downcase, and remove non-[a-z0-9-]
        authid = unquote(self.parsedUrl.path.split('/',)[2])
        authid = authid.lower().replace('_', '-').replace(' ', '-')
        authid = re.sub(r"[^a-z0-9-]", "", authid)

        self.story.setMetadata('authorId', authid)
        logging.debug("authorId: (%s)" % self.story.getMetadata('authorId'))

        # normalized story URL.
        self._setURL('http://' + self.getSiteDomain() + '/story/'
                     + self.story.getMetadata('authorId') + '/'
                     + self.story.getMetadata('storyId'))
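
        # Worked sketch of the normalization above, with a hypothetical
        # input URL http://www.gayauthors.org/story/Some_Author/some-story:
        #   parsedUrl.path.split('/') -> ['', 'story', 'Some_Author', 'some-story']
        #   storyId  -> 'some-story'
        #   authorId -> 'some-author'  (downcased, '_' -> '-', anything
        #                               outside [a-z0-9-] removed)
        # yielding the normalized URL
        #   http://www.gayauthors.org/story/some-author/some-story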

        # Each adapter needs to have a unique site abbreviation.
        self.story.setMetadata('siteabbrev', 'ga')

        # The date format will vary from site to site.
        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
        self.dateformat = "%d %b %Y"

    @staticmethod # must be @staticmethod, don't remove it.
    def getSiteDomain():
        # The site domain.  Does have www here, since the site uses it.
        return 'www.gayauthors.org'

    def getSiteExampleURLs(self):
        return "http://" + self.getSiteDomain() + "/story/author/storytitle"

    def getSiteURLPattern(self):
        return re.escape("http://" + self.getSiteDomain() + "/story/") + r".*?/\w+.*?$"
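
    # Hedged sketch of what that pattern accepts (URLs are illustrative):
    #   re.compile(re.escape("http://www.gayauthors.org/story/") + r".*?/\w+.*?$")
    #   accepts http://www.gayauthors.org/story/some-author/some-story
    #   accepts http://www.gayauthors.org/story/author/storytitle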


    ## Getting the chapter list and the meta data, plus 'is adult' checking.
    def extractChapterUrlsAndMetadata(self):

        url = self.url
        logging.debug("URL: " + url)

        try:
            data = self._fetchUrl(url)
        except urllib2.HTTPError, e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise e

        # use BeautifulSoup HTML parser to make everything easier to find.
        soup = bs.BeautifulSoup(data)

        # Now go hunting for all the meta data and the chapter list.
        msoup = soup.find('div', {'class' : 'story'})
        if msoup == None:
            msoup = soup.find('div', {'class' : 'story ispinned'})
        csoup = soup.find('div', {'id' : 'story_chapters'})

        ## Title
        a = msoup.find('span', {'class' : 'title'})
        title = a.find('span', {'itemprop' : 'name'})
        self.story.setMetadata('title', title.text)
        try:
            # Find series name and index from the series link inside the
            # description span.
            series = a.find('span', {'class' : 'description'})
            series_name = series.find('a')
            series_name.extract()
            series_index = int(series.text.split(' ')[1])
            self.setSeries(series_name.text, series_index)
        except:
            # Series parsing is best-effort; ignore any failure.
            pass

        # Find author name and URL from the author link.
        a = msoup.find('a', href=re.compile(r'/author/' + self.story.getMetadata('authorId')))
        self.story.setMetadata('authorUrl', a['href'])
        self.story.setMetadata('author', a.text)

        # Find the chapters:
        spans = csoup.findAll('span', {'class' : 'desc chapter-info'})
        for span in spans:
            span.extract()
        for chapter in csoup.findAll('a'):
            # Rewrite each chapter href to use the normalized authorId in
            # place of the display author name.
            a = chapter['href'].split(self.story.getMetadata('author'))
            a = a[0] + self.story.getMetadata('authorId') + a[1]
            # stripHTML() just in case there's tags, like <i>, in chapter titles.
            self.chapterUrls.append((stripHTML(chapter), a))

        self.story.setMetadata('numChapters', len(self.chapterUrls))
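
        # Worked sketch of the href rewrite above (values are hypothetical):
        #   author   -> 'Some Author',  authorId -> 'some-author'
        #   href     -> '/story/Some Author/some-story/chapter-1'
        #   split    -> ['/story/', '/some-story/chapter-1']
        #   rejoined -> '/story/some-author/some-story/chapter-1'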

        cats = msoup.findAll('a', href=re.compile(r'/browse/list/page__filtertype_0__category\w+$'))
        for cat in cats:
            self.story.addToList('category', cat.text)

        genres = msoup.findAll('a', href=re.compile(r'/browse/list/page__filtertype_1__genre\w+$'))
        for genre in genres:
            self.story.addToList('genre', genre.text)
        genres = msoup.findAll('a', href=re.compile(r'/browse/list/page__filtertype_2__tag\w+$'))
        for genre in genres:
            self.story.addToList('genre', genre.text)

        status = msoup.find('a', href=re.compile(r'/browse/list/page__filtertype_3__status\w+$'))
        self.story.setMetadata('status', status.text)

        rating = msoup.find('a', href=re.compile(r'/browse/list/page__filtertype_4__rating\w+$'))
        self.story.setMetadata('rating', rating.text)

        summary = msoup.find('span', {'itemprop' : 'description'})
        self.setDescription(self.url, summary.text)

        # Walk the <dt>/<dd> pairs of the stats list for word count and dates.
        stats = msoup.find('dl', {'class' : 'info'})
        dt = stats.findAll('dt')
        dd = stats.findAll('dd')
        for x in range(0, len(dt)):
            label = dt[x].text
            value = dd[x].text

            if 'Words:' in label:
                self.story.setMetadata('numWords', value)

            if 'Published:' in label:
                date = stripHTML(value.split(' - ')[0])
                # A date with a ',' in it (presumably a relative form such
                # as 'Today, 01:23 PM') won't parse; substitute today.
                if ',' in date:
                    date = datetime.date.today().strftime(self.dateformat)
                self.story.setMetadata('datePublished', makeDate(date, self.dateformat))

            if 'Updated:' in label:
                date = stripHTML(value.split(' - ')[0])
                if ',' in date:
                    date = datetime.date.today().strftime(self.dateformat)
                self.story.setMetadata('dateUpdated', makeDate(date, self.dateformat))
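
        # Sketch of the date handling with an illustrative value:
        #   '05 Jan 2013 - 10:12 AM'.split(' - ')[0] -> '05 Jan 2013'
        #   makeDate('05 Jan 2013', '%d %b %Y') presumably wraps
        #   datetime.datetime.strptime and yields datetime(2013, 1, 5, 0, 0).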

    # grab the text for an individual chapter.
    def getChapterText(self, url):

        logging.debug('Getting chapter text from: %s' % url)

        data = self._fetchUrl(url)
        # Discard everything before the chapter div so the parser sees
        # less (and less broken) markup.
        data = data[data.index("<div id='chapter-content'>"):]
        soup = bs.BeautifulSoup(data) # this one's happier with Soup, not StoneSoup for some reason.

        div = soup.find('div', {'id' : 'chapter-content'})

        if div is None:
            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)

        return self.utf8FromSoup(url, div)
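
For reference, a hedged sketch of how an adapter like this was presumably driven by the downloader core; the driver API and the config object are assumptions, not shown in this diff:

    # Hypothetical driver-side usage (FanFicFare internals assumed):
    adapter = GayAuthorsAdapter(config, 'http://www.gayauthors.org/story/some-author/some-story')
    adapter.extractChapterUrlsAndMetadata()      # fills metadata and chapterUrls
    for title, churl in adapter.chapterUrls:
        html = adapter.getChapterText(churl)     # cleaned chapter HTML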