diff --git a/downaloder.py b/downaloder.py index dee7a30d..de93c790 100644 --- a/downaloder.py +++ b/downaloder.py @@ -80,14 +80,16 @@ if __name__ == '__main__': adapter = ffa.FFA(url) elif url.find('fictionalley') != -1: adapter = fictionalley.FictionAlley(url) - print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors" - sys.exit(0) + #print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors" + #sys.exit(0) elif url.find('ficwad') != -1: adapter = ficwad.FicWad(url) elif url.find('fanfiction.net') != -1 or url.find('fictionpress.com') != -1: adapter = ffnet.FFNet(url) elif url.find('harrypotterfanfiction.com') != -1: adapter = hpfiction.HPFiction(url) + elif url.find('twilighted.com') != -1: + adapter = twilighted.Twilighted(url) else: print >> sys.stderr, "Oi! I can haz not appropriate adapter for URL %s!" % url sys.exit(1) diff --git a/fictionalley.py b/fictionalley.py index 6ccc5ad8..59a0b7d6 100644 --- a/fictionalley.py +++ b/fictionalley.py @@ -2,36 +2,78 @@ import os import re import sys import shutil +import logging import os.path import urllib as u import pprint as pp import urllib2 as u2 +import cookielib as cl import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time as time +from adapter import * -class FictionAlley: - def __init__(self): - pass + +class FictionAlley(FanfictionSiteAdapter): + def __init__(self, url): + self.url = url + self.host = up.urlparse(url).netloc + cookieproc = u2.HTTPCookieProcessor() + + # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff. 
+ cookie = cl.Cookie(version=0, name='fauser', value='wizard', + port=None, port_specified=False, + domain='www.fictionalley.org', domain_specified=False, domain_initial_dot=False, + path='/authors', path_specified=True, + secure=False, + expires=time.time()+100, + discard=False, + comment=None, + comment_url=None, + rest={'HttpOnly': None}, + rfc2109=False) + cookieproc.cookiejar.set_cookie(cookie) + self.opener = u2.build_opener(cookieproc) + + def requiresLogin(self, url = None): + return False - def extractIndividualUrls(self, data, host, contents): + def performLogin(self, url = None): + pass + + def setLogin(self, login): + self.login = login + + def setPassword(self, password): + self.password = password + + def extractIndividualUrls(self): + data = self.opener.open(self.url).read() soup = bs.BeautifulStoneSoup(data) - + + # Get title from , remove before '-'. title = soup.find('title').string - self.storyName = "-".join(title.split('-')[1:]).strip() - - authors = soup.findAll('a') - - print('Story "%s" by %s' % (self.storyName, self.authorName)) + self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","") links = soup.findAll('a', { 'class' : 'chapterlink' } ) result = [] - for a in links: - url = a['href'] - title = a.string - result.append((url,title)) + if len(links) == 0: + breadcrumbs = soup.find('div', {'class': 'breadcrumbs'}) + self.authorName = breadcrumbs.a.string.replace("'s Fics","") + result.append((self.url,self.storyName)) + else: + author = soup.find('h1', {'class' : 'title'}) + self.authorName = author.a.string + for a in links: + url = a['href'] + title = a.string + result.append((url,title)) + + print('Story "%s" by %s' % (self.storyName, self.authorName)) + return result def getStoryName(self): @@ -40,11 +82,20 @@ class FictionAlley: def getAuthorName(self): return self.authorName - - def getText(self, data, fetch = False): + def getText(self, url): + # fictionalley uses full URLs in chapter list. 
+ data = self.opener.open(url).read() + + # find <!-- headerend --> & <!-- footerstart --> + # and replaced with matching div pair for easier parsing. + # Yes, it's an evil kludge, but what can ya do? + data = data.replace('<!-- headerend -->','<div id="storytext">').replace('<!-- footerstart -->','</div>') soup = bs.BeautifulStoneSoup(data) + div = soup.find('div', {'id' : 'storytext'}) if None == div: + logging.error("Error downloading Chapter: %s" % url) + sys.exit(1) return '<html/>' return div.prettify() @@ -62,14 +113,13 @@ class FictionAlley: login = dict(login = 'name', password = 'pass') other = dict(submit = 'Log In', remember='yes') return (login, other) - + if __name__ == '__main__': url = 'http://www.fictionalley.org/authors/drt/DA.html' - data = u2.urlopen(url).read() + fw = FictionAlley(url) host = up.urlparse(url).netloc - fw = FictionAlley() - fw.authorName = 'DrT' - urls = fw.extractIndividualUrls(data, host, url) + # adapter now fetches pages itself via fw.opener + urls = fw.extractIndividualUrls() pp.pprint(urls) - print(fw.getText(data)) \ No newline at end of file + print(fw.getText(urls[0][0])) \ No newline at end of file