FictionAlley.com working; also add twilighted.com to downloader.py.

This commit is contained in:
retiefjimm 2010-09-28 19:26:06 -05:00
parent fe67201f95
commit 84fea5b896
2 changed files with 74 additions and 22 deletions

View file

@ -80,14 +80,16 @@ if __name__ == '__main__':
adapter = ffa.FFA(url)
elif url.find('fictionalley') != -1:
adapter = fictionalley.FictionAlley(url)
print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
sys.exit(0)
#print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
#sys.exit(0)
elif url.find('ficwad') != -1:
adapter = ficwad.FicWad(url)
elif url.find('fanfiction.net') != -1 or url.find('fictionpress.com') != -1:
adapter = ffnet.FFNet(url)
elif url.find('harrypotterfanfiction.com') != -1:
adapter = hpfiction.HPFiction(url)
elif url.find('twilighted.com') != -1:
adapter = twilighted.Twilighted(url)
else:
print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url
sys.exit(1)

View file

@ -2,36 +2,78 @@ import os
import re
import sys
import shutil
import logging
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import cookielib as cl
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
    """Adapter for downloading stories hosted on www.fictionalley.org."""

    def __init__(self, url):
        """Remember the story URL and build a urllib2 opener that carries
        the site's age-verification cookie.

        url -- full URL of the story's index (or single-chapter) page.
        """
        self.url = url
        self.host = up.urlparse(url).netloc
        # FictionAlley wants a cookie to prove you're old enough to read
        # R+ rated stuff.
        # NOTE(review): expires is only now()+100 seconds — presumably long
        # enough for one download run, but confirm.
        age_cookie = cl.Cookie(version=0, name='fauser', value='wizard',
                               port=None, port_specified=False,
                               domain='www.fictionalley.org',
                               domain_specified=False,
                               domain_initial_dot=False,
                               path='/authors', path_specified=True,
                               secure=False,
                               expires=time.time()+100,
                               discard=False,
                               comment=None,
                               comment_url=None,
                               rest={'HttpOnly': None},
                               rfc2109=False)
        cookieproc = u2.HTTPCookieProcessor()
        cookieproc.cookiejar.set_cookie(age_cookie)
        self.opener = u2.build_opener(cookieproc)
def requiresLogin(self, url=None):
    """This adapter never requires a site login; the age cookie set in
    __init__ is all FictionAlley needs."""
    return False
def extractIndividualUrls(self, data, host, contents):
def performLogin(self, url=None):
    """No-op: there is no login step for FictionAlley."""
    pass
def setLogin(self, login):
    """Remember the supplied account name."""
    self.login = login
def setPassword(self, password):
    """Remember the supplied password."""
    self.password = password
def extractIndividualUrls(self):
    """Fetch the story index page and return a list of (url, title)
    chapter tuples.

    Side effects: populates self.storyName and self.authorName.
    """
    data = self.opener.open(self.url).read()
    soup = bs.BeautifulStoneSoup(data)
    # Get title from <title>, remove before '-' and the site's
    # " (Story Text)" suffix.
    title = soup.find('title').string
    self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
    links = soup.findAll('a', { 'class' : 'chapterlink' } )
    result = []
    if len(links) == 0:
        # No chapter links: single-chapter story, the page itself is the
        # story text.  Author name comes from the breadcrumb trail,
        # which reads "<author>'s Fics".
        breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
        self.authorName = breadcrumbs.a.string.replace("'s Fics","")
        result.append((self.url,self.storyName))
    else:
        # Multi-chapter story: author from the page's title header, one
        # (url, title) entry per chapter link.
        author = soup.find('h1', {'class' : 'title'})
        self.authorName = author.a.string
        for a in links:
            url = a['href']
            title = a.string
            result.append((url,title))
    print('Story "%s" by %s' % (self.storyName, self.authorName))
    return result
def getStoryName(self):
@ -40,11 +82,20 @@ class FictionAlley:
def getAuthorName(self):
    """Return the author name captured by extractIndividualUrls()."""
    return self.authorName
def getText(self, url):
    """Download one chapter and return its story text as HTML.

    url -- full chapter URL (fictionalley uses full URLs in its
    chapter list, so this is fetched directly).
    """
    data = self.opener.open(url).read()
    # The story body sits between <!-- headerend --> and
    # <!-- footerstart -->; replace those markers with a matching div
    # pair for easier parsing.
    # Yes, it's an evil kludge, but what can ya do?
    data = data.replace('<!-- headerend -->','<div id="storytext">').replace('<!-- footerstart -->','</div>')
    soup = bs.BeautifulStoneSoup(data)
    div = soup.find('div', {'id' : 'storytext'})
    if None == div:
        # Markers missing: log and return an empty document instead of
        # exiting — killing the whole process from a library method
        # would abort a multi-chapter download.
        logging.error("Error downloading Chapter: %s" % url)
        return '<html/>'
    return div.prettify()
@ -62,14 +113,13 @@ class FictionAlley:
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
if __name__ == '__main__':
    # Smoke test: fetch a known story, dump its chapter list, then the
    # text of the first chapter.  Uses the new adapter API: the URL is
    # passed to the constructor, extractIndividualUrls() takes no
    # arguments, and getText() takes a chapter URL (not page data).
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    print(fw.getText(urls[0][0]))