# FanFicFare/fictionalley.py
#
# 377 lines
# 12 KiB
# Python

import os
import re
import sys
import shutil
import logging
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import cookielib as cl
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
import datetime
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
    """Site adapter that scrapes story metadata and chapter text from fictionalley.org.

    Typical use: construct with a story (or single chapter) URL, call
    extractIndividualUrls() to populate the metadata and obtain the chapter
    list, then call getText() once per chapter URL.
    """

    # fictionalley marks the header/body sections of a chapter page with HTML
    # comments.  They are swapped for a made-up tag so the parser can find the
    # section as an element.  Yes, it's an evil kludge, but using something
    # other than <div> prevents soup from pairing our wrapper with poor html
    # inside the story text.
    _MARKER_TAG = 'crazytagstringnobodywouldstumbleonaccidently'

    def __init__(self, url):
        """Parse *url*, prepare a cookie-carrying opener and set metadata defaults.

        No network access happens here; the story page is only fetched by
        extractIndividualUrls().
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        logging.debug('self.host=%s', self.host)
        logging.debug('self.path=%s', self.path)

        cookieproc = u2.HTTPCookieProcessor()
        # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
        cookie = cl.Cookie(version=0, name='fauser', value='wizard',
                           port=None, port_specified=False,
                           domain='www.fictionalley.org', domain_specified=False,
                           domain_initial_dot=False,
                           path='/authors', path_specified=True,
                           secure=False,
                           expires=time.time() + 100,
                           discard=False,
                           comment=None,
                           comment_url=None,
                           rest={'HttpOnly': None},
                           rfc2109=False)
        cookieproc.cookiejar.set_cookie(cookie)
        self.opener = u2.build_opener(cookieproc)

        self.storyDescription = 'Fanfiction Story'
        self.authorId = ''
        self.authorURL = ''
        self.storyId = ''
        # Expected path layout: /authors/<authorId>/<storyId>.html
        pathParts = self.path.split('/')
        if len(pathParts) > 2 and pathParts[1] == 'authors':
            self.authorId = pathParts[2]
            self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
            if len(pathParts) > 3:
                self.storyId = pathParts[3].replace('.html', '')

        # These are filled in by extractIndividualUrls(); give them defaults so
        # the getters do not raise AttributeError when called before that.
        self.storyName = ''
        self.authorName = ''
        self.outputName = ''
        self.uuid = ''

        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = ['fanfiction']
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''

    def requiresLogin(self, url=None):
        """Return False: this adapter never logs in."""
        return False

    def performLogin(self, url=None):
        """Do nothing; present only to satisfy the adapter interface."""
        pass

    def setLogin(self, login):
        """Store *login* on the instance."""
        self.login = login

    def setPassword(self, password):
        """Store *password* on the instance."""
        self.password = password

    def _addSubject(self, subject):
        """Append *subject* to self.subjects unless a case-insensitive match exists.

        Returns True when the subject was added, False when it was a duplicate.
        """
        wanted = subject.upper()
        for existing in self.subjects:
            if existing.upper() == wanted:
                return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append *character* to self.storyCharacters unless a case-insensitive match exists.

        Returns True when the character was added, False when it was a duplicate.
        """
        wanted = character.upper()
        for existing in self.storyCharacters:
            if existing.upper() == wanted:
                return False
        self.storyCharacters.append(character)
        return True

    def _wrapSection(self, data, startMarker, endMarker, sectionId):
        """Return *data* with the two comment markers replaced by a findable tag pair.

        *startMarker* becomes ``<_MARKER_TAG id="sectionId">`` and *endMarker*
        becomes the matching closing tag.
        """
        openTag = '<%s id="%s">' % (self._MARKER_TAG, sectionId)
        closeTag = '</%s>' % self._MARKER_TAG
        return data.replace(startMarker, openTag).replace(endMarker, closeTag)

    def _processChapterHeaders(self, div):
        """Read the "Key: value" entries of a chapter header block into metadata.

        *div* is the parsed header section.  Recognised keys are 'Rating:',
        'Genre:', 'Main Character(s):' and 'Summary:'; anything else is ignored.
        """
        for br in div.findAll('br'):
            # Each entry is read from the <br> element's own children:
            # contents[1] is taken as the key and contents[2] as the value.
            if len(br.contents) <= 2:
                continue
            # Drop any markup wrapped around the key text.
            keystr = re.sub("<[^>]+>", "", str(br.contents[1]))
            valstr = br.contents[2].strip(' ')

            if keystr == 'Rating:':
                self.storyRating = valstr
                logging.debug('self.storyRating=%s', self.storyRating)
            elif keystr == 'Genre:':
                self.genre = valstr
                logging.debug('self.genre=%s', self.genre)
                # Every individual genre also becomes a subject.
                for genreName in valstr.split(', '):
                    self._addSubject(genreName)
                logging.debug('self.subjects=%s', self.subjects)
            elif keystr == 'Main Character(s):':
                for characterName in valstr.split(', '):
                    self._addCharacter(characterName)
                logging.debug('self.storyCharacters=%s', self.storyCharacters)
            elif keystr == 'Summary:':
                self.storyDescription = valstr
                logging.debug('self.storyDescription=%s', self.storyDescription)

    def extractIndividualUrls(self):
        """Fetch self.url, fill in the story metadata and return the chapter list.

        Returns a list of (chapterUrl, chapterTitle) tuples.  As side effects
        it sets storyName, outputName, authorName, numChapters, uuid and, when
        the page provides them, rating, description, genre, characters and the
        published/updated dates.  A 'Story "..." by ...' line is printed.
        """
        data = self.opener.open(self.url).read()

        # There is some useful information in the headers of the first chapter page.
        data = self._wrapSection(data, '<!-- headerstart -->', '<!-- headerend -->',
                                 'storyheaders')
        soup = bs.BeautifulStoneSoup(data)

        # Get title from <title>, remove everything before the first '-'.
        pageTitle = soup.find('title').string
        self.storyName = "-".join(pageTitle.split('-')[1:]).strip().replace(" (Story Text)", "")
        self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId

        links = soup.findAll('li')

        # If it is decided that we really do care about number of words,
        # it's only available on the author's page (self.authorURL).

        self.numChapters = 0
        result = []

        if not links:
            # Be aware that this means that the user has entered the {STORY}01.html
            # We will not have valid Published and Updated dates. User should enter
            # the {STORY}.html instead. We should force that instead of this.
            breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
            self.authorName = breadcrumbs.a.string.replace("'s Fics", "")
            result.append((self.url, self.storyName))
            self.numChapters = self.numChapters + 1
            headers = soup.find(self._MARKER_TAG, {'id': 'storyheaders'})
            if headers is not None:
                self._processChapterHeaders(headers)
        else:
            author = soup.find('h1', {'class': 'title'})
            self.authorName = author.a.string

            summary = soup.find('div', {'class': 'summary'})
            summaryParts = summary.contents
            if len(summaryParts) > 1:
                ratingParts = summaryParts[0].split(': ')
                if len(ratingParts) > 1 and ratingParts[0] == 'Rating':
                    self.storyRating = ratingParts[1]
                    logging.debug('self.storyRating=%s', self.storyRating)
                self.storyDescription = str(summaryParts[1]).replace("<br>", "").replace("</br>", "").replace('\n', '')
                logging.debug('self.storyDescription=%s', self.storyDescription)

            for li in links:
                anchor = li.find('a', {'class': 'chapterlink'})
                if anchor is None:
                    logging.debug('li chapterlink not found! li=%s', li)
                    continue

                # fictionalley uses full URLs in chapter list.
                chapterUrl = anchor['href']
                result.append((chapterUrl, anchor.string))

                if self.numChapters == 0:
                    # Genre, characters etc. are only listed in the header of a
                    # chapter page, so fetch the first chapter once for them.
                    chapterData = self.opener.open(chapterUrl).read()
                    chapterData = self._wrapSection(chapterData,
                                                    '<!-- headerstart -->',
                                                    '<!-- headerend -->',
                                                    'storyheaders')
                    chapterSoup = bs.BeautifulStoneSoup(chapterData)
                    headers = chapterSoup.find(self._MARKER_TAG, {'id': 'storyheaders'})
                    if headers is not None:
                        self._processChapterHeaders(headers)

                self.numChapters = self.numChapters + 1

                liContents = li.contents
                if len(liContents) > 1:
                    # The text after the link is expected to look like
                    # "(Posted: YYYY-MM-DD HH:MM:SS ...".
                    words = liContents[1].replace('\n', '').replace('(', '').split(' ')
                    # Only parse a date when a 'Posted:' stamp is really there;
                    # strptime() raises ValueError on an empty string.
                    if len(words) > 2 and words[0] == 'Posted:':
                        datestr = words[1] + ' ' + words[2]
                        tmpdate = datetime.datetime.fromtimestamp(
                            time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
                        # First chapter's date is the publication date; the
                        # last chapter processed leaves the update date.
                        if self.numChapters == 1:
                            self.storyPublished = tmpdate
                        self.storyUpdated = tmpdate
                        logging.debug('self.storyPublished=%s, self.storyUpdated=%s',
                                      self.storyPublished, self.storyUpdated)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s', self.uuid)

        return result

    def getStoryName(self):
        """Return the story title ('' until extractIndividualUrls() has run)."""
        return self.storyName

    def getAuthorName(self):
        """Return the author name ('' until extractIndividualUrls() has run)."""
        return self.authorName

    def getOutputName(self):
        """Return the output base name ('' until extractIndividualUrls() has run)."""
        return self.outputName

    def getText(self, url):
        """Fetch the chapter at *url* and return its story text as a UTF-8 string.

        If the page contains more than one <html> element the second one is
        returned; otherwise the section between the header-end and
        footer-start markers is returned wrapped in a <div>.

        Raises SystemExit(20) when the story-text section cannot be found.
        """
        # fictionalley uses full URLs in chapter list.
        data = self.opener.open(url).read()
        data = self._wrapSection(data, '<!-- headerend -->', '<!-- footerstart -->',
                                 'storytext')
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find(self._MARKER_TAG, {'id': 'storytext'})
        if div is None:
            logging.error("Error downloading Chapter: %s", url)
            # sys.exit rather than the site-injected exit() builtin, which is
            # not guaranteed to exist (e.g. under `python -S`).
            sys.exit(20)

        html = soup.findAll('html')
        if len(html) > 1:
            return html[1].__str__('utf8')
        return div.__str__('utf8').replace(self._MARKER_TAG, 'div')

    def getStoryURL(self):
        """Return the URL this adapter was constructed with."""
        logging.debug('self.url=%s', self.url)
        return self.url

    def getAuthorURL(self):
        """Return the author page URL derived from the story URL ('' if unknown)."""
        logging.debug('self.authorURL=%s', self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the story UUID ('' until extractIndividualUrls() has run)."""
        logging.debug('self.uuid=%s', self.uuid)
        return self.uuid

    def getAuthorId(self):
        """Return the author id taken from the URL path ('' if unknown)."""
        logging.debug('self.authorId=%s', self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id taken from the URL path ('' if unknown)."""
        logging.debug('self.storyId=%s', self.storyId)
        return self.storyId

    def getStoryDescription(self):
        """Return the story summary, or the default 'Fanfiction Story'."""
        logging.debug('self.storyDescription=%s', self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the publication date (1970-01-31 when none was found)."""
        logging.debug('self.storyPublished=%s', self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Reset the creation timestamp to the current time and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s', self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the last-update date (1970-01-31 when none was found)."""
        logging.debug('self.storyUpdated=%s', self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s', self.language)
        return self.language

    def getLanguageId(self):
        """Return the language code."""
        logging.debug('self.languageId=%s', self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects collected for the story."""
        logging.debug('self.subjects=%s', self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher, which is the host part of the story URL."""
        logging.debug('self.publisher=%s', self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the number of chapters found by extractIndividualUrls()."""
        logging.debug('self.numChapters=%s', self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count; this adapter never changes it from 0."""
        logging.debug('self.numWords=%s', self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the category; this adapter never changes it from 'Category'."""
        logging.debug('self.category=%s', self.category)
        return self.category

    def getGenre(self):
        """Return the genre string ('FanFiction' unless a header supplied one)."""
        logging.debug('self.genre=%s', self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status; this adapter never changes it from 'In-Progress'."""
        logging.debug('self.storyStatus=%s', self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating ('K' unless the page supplied one)."""
        logging.debug('self.storyRating=%s', self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating; this adapter never changes it from '0'."""
        logging.debug('self.storyUserRating=%s', self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return *url* unchanged."""
        return url

    def getPasswordLine(self):
        """Return a fixed string (looks like a placeholder; this adapter never logs in)."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return a fixed string (looks like a placeholder; this adapter never logs in)."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return a (login, other) pair of dicts of fixed login-form values."""
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        """Return the list of main characters collected for the story."""
        logging.debug('self.storyCharacters=%s', self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name; this adapter never changes it from ''."""
        logging.debug('self.storySeries=%s', self.storySeries)
        return self.storySeries
if __name__ == '__main__':
    # Manual smoke test (needs network access): list the chapters of a sample
    # story and print the text of its first chapter.
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    # The adapter fetches the page itself; extractIndividualUrls() takes no
    # arguments and returns (chapterUrl, chapterTitle) tuples.
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        # getText() expects a chapter URL, not page data.
        print(fw.getText(urls[0][0]))