import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime

from adapter import *
import twipassword


class Twilighted(FanfictionSiteAdapter):
    """Site adapter for story pages of the form ``viewstory.php?sid=...``.

    The adapter logs in with a penname/password pair, lists the chapter URLs
    of a story and scrapes the story metadata (author, categories, rating,
    dates, ...) from the story's index page.
    """

    def __init__(self, url):
        """Store the story URL and initialise all metadata to defaults.

        No network access happens here; the cookie-aware opener is only
        created so that a later login is remembered by subsequent requests.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.password = twipassword.password
        self.login = 'sigizmund'
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyId = '0'
        # Plain ``1`` instead of the Python-2-only literal ``01``; same value.
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = ['fanfiction', 'Twilight']
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.uuid = self._makeUUID()
        logging.debug('self.uuid=%s' % self.uuid)
        logging.debug("Created Twilighted: url=%s" % (self.url))

    def _makeUUID(self):
        """Build the story identifier from host, author id and story id."""
        return 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId

    def requiresLogin(self, url = None):
        """Return True unless the host is known not to need a login."""
        # potionsandsnitches.net doesn't require login.
        return self.host != 'potionsandsnitches.net'

    def performLogin(self, url = None):
        """POST the credentials to the login script.

        Returns True when the response does not contain any of the
        "login required / no such account" markers, False otherwise.
        """
        data = {}
        data['penname'] = self.login
        data['password'] = self.password
        data['cookiecheck'] = '1'
        data['submit'] = 'Submit'

        urlvals = u.urlencode(data)
        loginUrl = 'http://' + self.host + self._getLoginScript()
        logging.debug("Will now login to URL %s" % loginUrl)

        req = self.opener.open(loginUrl, urlvals)

        # Only ASCII marker phrases are searched for, so undecodable bytes are
        # replaced rather than allowed to abort the login with a decode error.
        d = req.read().decode('utf-8', 'replace')

        return not self.reqLoginData(d)

    def setLogin(self, login):
        """Set the penname used by performLogin()."""
        self.login = login

    def setPassword(self, password):
        """Set the password used by performLogin()."""
        self.password = password

    def _addSubject(self, subject):
        """Append ``subject`` unless a case-insensitive duplicate exists.

        Returns True when the subject was added, False when it was already
        present.
        """
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append ``character`` unless a case-insensitive duplicate exists.

        Returns True when the character was added, False when it was already
        present.
        """
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def extractIndividualUrls(self):
        """Fetch the story page and return its chapters.

        Returns a list of ``(url, title)`` tuples, one per chapter, or a
        single entry for the story URL itself when the page has no chapter
        selector. Returns None when the page still demands a login after a
        login attempt. As a side effect the story name, author name, output
        name and the metadata found on the ``&index=1`` page are stored on
        the instance.
        """
        data = self.opener.open(self.url).read()

        if self.reqLoginData(data):
            self.performLogin()

            data = self.opener.open(self.url).read()
            if self.reqLoginData(data):
                return None

        soup = bs.BeautifulStoneSoup(data)

        # The page title is expected to read "<story> by <author>".
        title = soup.find('title').string
        titleParts = title.split(' by ')
        self.storyName = titleParts[0].strip()
        self.authorName = titleParts[1].strip()
        self.outputName = self.storyName.replace(" ", "_")

        select = soup.find('select', { 'name' : 'chapter' } )

        result = []
        if select is None:
            # no chapters found, try url by itself.
            result.append((self.url, self.storyName))
        else:
            for o in select.findAll('option'):
                chapterUrl = self.url + "&chapter=%s" % o['value']
                result.append((chapterUrl, o.string))

        # The index view of the story carries the metadata block.
        indexUrl = self.url + "&index=1"
        indexData = self.opener.open(indexUrl).read()
        self._parseStoryMetadata(bs.BeautifulStoneSoup(indexData))

        self.uuid = self._makeUUID()
        logging.debug('self.uuid=%s' % self.uuid)

        return result

    def _parseStoryMetadata(self, soup):
        """Scan the <meta> tags of the index page for story metadata."""
        for meta in soup.findAll('meta'):
            if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
                #logging.debug('Meta: %s' % meta)
                # NOTE(review): the indentation of the original file was lost;
                # the ``else`` below is assumed to pair with this
                # ``'content' in ...`` check. Confirm against file history.
                if 'content' in meta._getAttrMap():
                    s1 = bs.BeautifulStoneSoup(meta['content'])
                    ps = s1.findAll('p')
                    if len(ps) > 0:
                        self.storyDescription = ps[0]
                        logging.debug('self.storyDescription=%s' % (self.storyDescription))
                else:
                    divs = meta.findAll('div')
                    #logging.debug('Divs: %s' % divs)

                    for div in divs:
                        #logging.debug('Div: %s' % div)
                        attrs = div._getAttrMap()
                        if 'id' in attrs and div['id'].find('pagetitle') != -1:
                            #logging.debug('Div PAGETITLE: %s' % div)
                            self._parsePageTitleLinks(div)
                        if 'class' in attrs and div['class'].find('content') != -1:
                            #logging.debug('Div CONTENT: %s' % div)
                            for br in div.findAll('br'):
                                buf = unicode(br).encode('utf-8')
                                strs = re.split ('<[^>]+>', buf)
                                #logging.debug('BUF: %s' % strs)
                                self._parseContentFields(strs)

    def _parsePageTitleLinks(self, div):
        """Read the story id and author id from the links in the title div."""
        for a in div.findAll('a'):
            if 'href' not in a._getAttrMap():
                continue
            href = a['href']
            if href.find('viewstory.php?sid=') != -1:
                # Expects exactly one '=' in the link, as in the original code.
                (vs, self.storyId) = href.split('=')
                logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
                self.outputName = self.outputName + "-tw_" + self.storyId
                logging.debug('self.outputName=%s' % self.outputName)
            if href.find('viewuser.php?uid=') != -1:
                (vs, self.authorId) = href.split('=')
                logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
                self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
                logging.debug('self.authorURL=%s' % self.authorURL)

    def _parseContentFields(self, strs):
        """Walk tag-stripped text pieces and store recognised label/value pairs.

        ``strs`` is the list produced by splitting markup on its tags.
        Scanning starts at index 2, as the original code did (presumably to
        skip the text around the leading tag itself -- TODO confirm).
        """
        ii = 2
        stlen = len(strs)
        while stlen > ii+1:
            label = strs[ii]
            if len(label) == 0:
                ii = ii+1
                continue

            if label == 'Categories:':
                ii = ii+1
                # Values run until an empty piece or the next "Label:" piece.
                while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
                    if strs[ii] != ' ' and strs[ii] != ', ':
                        # The first category found replaces the placeholder.
                        if self.category == 'Category':
                            self.category = strs[ii].strip(' ')
                        self._addSubject(strs[ii].strip(' '))
                    ii = ii+1
                logging.debug('self.subjects=%s' % self.subjects)
            # ``elif`` (not a second ``if``): after the category scan ``ii``
            # may sit at the end of the list or on an empty piece, so the loop
            # guard must be re-checked before indexing strs[ii] / strs[ii+1].
            elif label == 'Characters: ':
                ii = ii+1
                while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
                    if strs[ii] != ' ' and strs[ii] != ', ':
                        self._addCharacter(strs[ii].strip(' '))
                    ii = ii+1
                logging.debug('self.storyCharacters=%s' % self.storyCharacters)
            elif label == 'Completed:':
                if strs[ii+1].strip(' ') == "No":
                    self.storyStatus = 'In-Progress'
                else:
                    self.storyStatus = 'Completed'
                ii = ii+2
                logging.debug('self.storyStatus=%s' % self.storyStatus)
            elif label == 'Rated:':
                self.storyRating = strs[ii+1].strip(' ')
                ii = ii+2
                logging.debug('self.storyRating=%s' % self.storyRating)
            elif label == 'Series:':
                self.storySeries = strs[ii+1].strip(' ')
                if self.storySeries == 'None':
                    self.storySeries = ''
                ii = ii+2
                logging.debug('self.storySeries=%s' % self.storySeries)
            elif label == 'Chapters: ':
                # Stored as the scraped string, not converted to int.
                self.numChapters = strs[ii+1].strip(' ')
                ii = ii+2
                logging.debug('self.numChapters=%s' % self.numChapters)
            elif label == 'Word count:':
                # Stored as the scraped string, not converted to int.
                self.numWords = strs[ii+1].strip(' ')
                ii = ii+2
                logging.debug('self.numWords=%s' % self.numWords)
            elif label == ' Published: ':
                self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
                ii = ii+2
                logging.debug('self.storyPublished=%s' % self.storyPublished)
            elif label == 'Updated:':
                self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
                ii = ii+2
                logging.debug('self.storyUpdated=%s' % self.storyUpdated)
            else:
                logging.debug('Skipped Label "%s" Value "%s"' % (label, strs[ii+1]))
                ii = ii+2

    def getStoryName(self):
        """Return the story name; set by extractIndividualUrls()."""
        return self.storyName

    def getOutputName(self):
        """Return the output base name; set by extractIndividualUrls()."""
        return self.outputName

    def getAuthorName(self):
        """Return the author name; set by extractIndividualUrls()."""
        return self.authorName

    def getText(self, url):
        """Fetch a chapter page and return the ``<div id="story">`` markup.

        URLs that do not contain ``http://`` are treated as relative to the
        host. Returns an empty string when the page has no story div.
        """
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        logging.debug('Getting data from: %s' % url)

        data = self.opener.open(url).read()

        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)

        div = soup.find('div', {'id' : 'story'})

        if div is None:
            return ''

        return div.__str__('utf8')

    def _getLoginScript(self):
        """Return the path of the login script on the host."""
        return '/user.php?action=login'

    def reqLoginData(self, data):
        """Return True when ``data`` contains a login-required/failed marker."""
        return (data.find('Registered Users Only. Please click OK to login or register.') != -1
                or data.find('There is no such account on our website') != -1)

    def getStoryURL(self):
        """Return the story URL passed to the constructor."""
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getAuthorURL(self):
        """Return the author page URL ('' until it has been scraped)."""
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the urn:uuid identifier of the story."""
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getStoryDescription(self):
        """Return the story description."""
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the publication date."""
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Reset the creation timestamp to now and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the last-updated date."""
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        """Return the language identifier."""
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects."""
        # Log the subjects themselves; the original logged the author name
        # here, which also failed before extractIndividualUrls() had run.
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher (the host name)."""
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the chapter count (0, or the scraped string)."""
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count (0, or the scraped string)."""
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getAuthorId(self):
        """Return the author id."""
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id."""
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getCategory(self):
        """Return the primary category."""
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        """Return the genre."""
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return 'In-Progress' or 'Completed'."""
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating."""
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getStoryCharacters(self):
        """Return the list of characters."""
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name ('' when the story is not in a series)."""
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries


class Twilighted_UnitTests(unittest.TestCase):
    """Live tests against www.twilighted.net; they need network access."""

    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        url = 'http://www.twilighted.net/viewstory.php?sid=10004'
        self.assertTrue(Twilighted(url).performLogin())

    def testGetUrlsWorks(self):
        url = 'http://www.twilighted.net/viewstory.php?sid=10004'
        self.assertEqual(32, len(Twilighted(url).extractIndividualUrls()))


if __name__ == '__main__':
    unittest.main()