# -*- coding: utf-8 -*-
# Recovered from the new-file patch "hpfiction added - not yet working"
# (commit ed9e2992c00afafb4f88461bbf93a9ec1c5cfec1, sigizmund,
# Fri, 18 Dec 2009 18:57:53 +0000), which created hpfiction.py.
"""Site adapter that scrapes stories from harrypotterfanfiction.com."""

import os
import re
import sys
import cgi
import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

from constants import *
from adapter import *

try:
    import login_password
except ImportError:
    # The module is optional; nothing in this file reads it. Only a missing
    # module is tolerated -- any other failure while importing it is a real
    # error and must not be hidden.
    pass


class HPFiction(FanfictionSiteAdapter):
    """Adapter for a single story hosted on harrypotterfanfiction.com.

    The story is identified by the URL of any one of its chapters. No login
    is performed for this site.
    """

    def __init__(self, url):
        """Remember the story URL and prepare a cookie-aware HTTP opener.

        url -- address of a chapter page of the story.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        # Filled in by extractIndividualUrls(); defined up front so the
        # getters return None instead of raising AttributeError when the
        # page has not been parsed yet or carries no matching link.
        self.storyName = None
        self.authorName = None

        self.opener = u2.build_opener(u2.HTTPCookieProcessor())

        logging.debug("Created HPFiction: url=%s", self.url)

    def _getLoginScript(self):
        """Return the path component of the story URL."""
        return self.path

    def requiresLogin(self, url=None):
        """Always False: this adapter never logs in."""
        return False

    def performLogin(self, url=None):
        """No-op that reports success, since no login is required."""
        return True

    def _fetch(self, url):
        """Download url through the shared opener and return the raw body."""
        response = self.opener.open(url)
        try:
            return response.read()
        finally:
            # Release the connection even if read() fails.
            response.close()

    def _chapterUrl(self, value):
        """Build the absolute chapter URL for one <option> value.

        self.path already starts with '/', so it is appended to the host
        directly; adding another '/' would yield 'http://host//viewstory.php'.
        A value that is already a query string ('?...') is appended as is.
        Any other value is sent as the 'chapterid' parameter, which is what
        submitting the enclosing <select name="chapterid"> would produce.
        NOTE(review): confirm the option value format against the live site.
        """
        if value.startswith('?'):
            query = value
        else:
            query = '?chapterid=' + value
        return 'http://' + self.host + self.path + query

    def extractIndividualUrls(self):
        """Parse the story page and return its chapters.

        Side effect: sets self.storyName and self.authorName from the text
        of the links whose href contains 'psid' and 'viewuser.php'
        respectively (the last matching link wins).

        Returns a list of (url, title) tuples, one per option of the
        'chapterid' drop-down; the list is empty when the page has no such
        drop-down.
        """
        soup = bs.BeautifulSoup(self._fetch(self.url))

        for a in soup.findAll('a'):
            # Anchors without an href (e.g. <a name=...>) are skipped rather
            # than raising KeyError.
            href = a.get('href', '')
            if href.find('psid') != -1:
                self.storyName = a.string
            elif href.find('viewuser.php') != -1:
                self.authorName = a.string

        select = soup.find('select', {'name': 'chapterid'})
        if select is None:
            logging.warning("No chapter list found at %s", self.url)
            return []

        urls = []
        for o in select.findAll('option'):
            value = o.get('value')
            if not value:
                # Options with no (or an empty) value cannot address a chapter.
                continue
            urls.append((self._chapterUrl(value), o.string))
        return urls

    def getStoryName(self):
        """Return the story title, or None if it has not been extracted."""
        return self.storyName

    def getAuthorName(self):
        """Return the author name, or None if it has not been extracted."""
        return self.authorName

    def getText(self, url):
        """Download the chapter at url and return its text block as HTML.

        url -- address of the chapter page to fetch.

        Returns the prettified markup of the <div id="fluidtext"> element.
        Raises AttributeError if the page contains no such element.
        """
        logging.debug('Downloading from URL: %s', url)
        # Fetch the requested chapter, not the URL the adapter was built with.
        soup = bs.BeautifulSoup(self._fetch(url))
        divtext = soup.find('div', {'id': 'fluidtext'})
        return divtext.prettify()


class FF_UnitTests(unittest.TestCase):
    """Tests for HPFiction; the first two need network access to the site."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testChaptersAuthStory(self):
        f = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
        urls = f.extractIndividualUrls()

        self.assertEqual(49, len(urls))
        self.assertEqual('Elisha', f.getAuthorName())
        self.assertEqual('A Secret Thought', f.getStoryName())

    def testGetText(self):
        url = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123'
        f = HPFiction(url)
        text = f.getText(url)
        self.assertTrue(text.find('She pulled out of his arms and felt the subtle regret') != -1)

    def testChapterUrl(self):
        # Offline check of URL assembly: no doubled slash, query preserved.
        f = HPFiction('http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80123')
        expected = 'http://www.harrypotterfanfiction.com/viewstory.php?chapterid=80124'
        self.assertEqual(expected, f._chapterUrl('80124'))
        self.assertEqual(expected, f._chapterUrl('?chapterid=80124'))


if __name__ == '__main__':
    unittest.main()