import os
import re
import sys
import shutil
import logging
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import cookielib as cl
import urlparse as up

import BeautifulSoup as bs
import htmlentitydefs as hdefs

import time as time
import datetime

from adapter import *


class FictionAlley(FanfictionSiteAdapter):
    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        logging.debug('self.host=%s' % self.host)
        logging.debug('self.path=%s' % self.path)

        cookieproc = u2.HTTPCookieProcessor()
        # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
        cookie = cl.Cookie(version=0, name='fauser', value='wizard',
                           port=None, port_specified=False,
                           domain='www.fictionalley.org', domain_specified=False,
                           domain_initial_dot=False,
                           path='/authors', path_specified=True,
                           secure=False, expires=time.time()+100, discard=False,
                           comment=None, comment_url=None,
                           rest={'HttpOnly': None}, rfc2109=False)
        cookieproc.cookiejar.set_cookie(cookie)
        self.opener = u2.build_opener(cookieproc)

        ss = self.path.split('/')
        self.storyDescription = 'Fanfiction Story'
        self.authorId = ''
        self.authorURL = ''
        self.storyId = ''
        if len(ss) > 2 and ss[1] == 'authors':
            self.authorId = ss[2]
            self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
            if len(ss) > 3:
                self.storyId = ss[3].replace('.html', '')

        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''

    def requiresLogin(self, url=None):
        return False

    def performLogin(self, url=None):
        pass

    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def _addSubject(self, subject):
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def _processChapterHeaders(self, div):
        brs = div.findAll('br')
        for br in brs:
            keystr = ''
            valstr = ''
            if len(br.contents) > 2:
                keystr = br.contents[1]
                if keystr is not None:
                    # Strip any markup out of the key text.
                    strs = re.split("<[^>]+>", str(keystr))
                    keystr = ''
                    for s in strs:
                        keystr = keystr + s
                valstr = br.contents[2].strip(' ')
            if keystr is not None:
                if keystr == 'Rating:':
                    self.storyRating = valstr
                    logging.debug('self.storyRating=%s' % self.storyRating)
                elif keystr == 'Genre:':
                    self.genre = valstr
                    logging.debug('self.genre=%s' % self.genre)
                    s2 = valstr.split(', ')
                    for ss2 in s2:
                        self._addSubject(ss2)
                    logging.debug('self.subjects=%s' % self.subjects)
                elif keystr == 'Main Character(s):':
                    s2 = valstr.split(', ')
                    for ss2 in s2:
                        self._addCharacter(ss2)
                    logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                elif keystr == 'Summary:':
                    self.storyDescription = valstr
                    logging.debug('self.storyDescription=%s' % self.storyDescription)

    def extractIndividualUrls(self):
        data = self.opener.open(self.url).read()
        # There is some useful information in the headers of the first chapter page..
        # find <!-- headerstart --> & <!-- headerend --> and replace with a
        # matching tag pair for easier parsing (same kludge as in getText below).
        data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
        soup = bs.BeautifulStoneSoup(data)

        # Get the title from <title>, remove everything before the first '-'.
        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)", "")
        self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId

        links = soup.findAll('li')

        # If it is decided that we really do care about the number of words..
        # It's only available on the author's page..
        #d0 = self.opener.open(self.authorURL).read()
        #soupA = bs.BeautifulStoneSoup(d0)
        #dls = soupA.findAll('dl')
        #logging.debug('dls=%s' % dls)

        self.numChapters = 0
        result = []
        if len(links) == 0:
            # Be aware that this means that the user has entered the {STORY}01.html
            # URL, so we will not have valid Published and Updated dates. The user
            # should enter the {STORY}.html URL instead; we should force that
            # instead of this.
            breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
            self.authorName = breadcrumbs.a.string.replace("'s Fics", "")
            result.append((self.url, self.storyName))
            #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1, self.url, self.storyName))
            self.numChapters = self.numChapters + 1
            div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id': 'storyheaders'})
            if div is not None:
                self._processChapterHeaders(div)
        else:
            author = soup.find('h1', {'class': 'title'})
            self.authorName = author.a.string
            summary = soup.find('div', {'class': 'summary'})
            ss = summary.contents
            if len(ss) > 1:
                ss1 = ss[0].split(': ')
                if len(ss1) > 1 and ss1[0] == 'Rating':
                    self.storyRating = ss1[1]
                    logging.debug('self.storyRating=%s' % self.storyRating)
                self.storyDescription = str(ss[1]).replace("<br>", "").replace("</br>", "").replace('\n', '')
                logging.debug('self.storyDescription=%s' % self.storyDescription)
            for li in links:
                a = li.find('a', {'class': 'chapterlink'})
                s = li.contents
                if a is not None:
                    url = a['href']
                    title = a.string
                    result.append((url, title))
                    #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1, url, title))
                    if self.numChapters == 0:
                        # fictionalley uses full URLs in chapter list.
                        d1 = self.opener.open(url).read()
                        # find <!-- headerstart --> & <!-- headerend --> and
                        # replace with a matching div pair for easier parsing.
                        # Yes, it's an evil kludge, but what can ya do? Using
                        # something other than div prevents soup from pairing
                        # our div with poor html inside the story text.
                        d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
                        sop = bs.BeautifulStoneSoup(d1)
                        div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id': 'storyheaders'})
                        if div is not None:
                            self._processChapterHeaders(div)
                    self.numChapters = self.numChapters + 1
                    if len(s) > 1:
                        datestr = ''
                        ss2 = s[1].replace('\n', '').replace('(', '').split(' ')
                        if len(ss2) > 2 and ss2[0] == 'Posted:':
                            datestr = ss2[1] + ' ' + ss2[2]
                            tmpdate = datetime.datetime.fromtimestamp(
                                time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
                            if self.numChapters == 1:
                                self.storyPublished = tmpdate
                            self.storyUpdated = tmpdate
                            logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
                else:
                    logging.debug('li chapterlink not found! li=%s' % li)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)

        return result

    def getStoryName(self):
        return self.storyName

    def getAuthorName(self):
        return self.authorName

    def getOutputName(self):
        return self.outputName

    def getText(self, url):
        # fictionalley uses full URLs in chapter list.
        data = self.opener.open(url).read()
        # find <!-- headerend --> & <!-- footerstart --> and replace with a
        # matching div pair for easier parsing.
        # Yes, it's an evil kludge, but what can ya do? Using
        # something other than div prevents soup from pairing
        # our div with poor html inside the story text.
        data = data.replace('<!-- headerend -->','<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace('<!-- footerstart -->','</crazytagstringnobodywouldstumbleonaccidently>')
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id': 'storytext'})
        if div is None:
            logging.error("Error downloading Chapter: %s" % url)
            sys.exit(20)
            return '<html/>'
        html = soup.findAll('html')
        if len(html) > 1:
            return html[1].__str__('utf8')
        else:
            return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently', 'div')

    def getStoryURL(self):
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getAuthorURL(self):
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getUUID(self):
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getAuthorId(self):
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getStoryId(self):
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getStoryDescription(self):
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def getSubjects(self):
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getCategory(self):
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        return url

    def getPasswordLine(self):
        return 'opaopapassword'

    def getLoginScript(self):
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries


if __name__ == '__main__':
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # Fetch and print the text of the first chapter found.
    print(fw.getText(urls[0][0]))
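

# A minimal, self-contained sketch of the sentinel-tag kludge used in
# extractIndividualUrls() and getText() above: FictionAlley only brackets the
# interesting region with HTML comments, which BeautifulSoup cannot pair, so
# the adapter rewrites them into an improbable tag name and extracts that
# element. The sample markup below is made up for illustration only; it is not
# real FictionAlley output, and this helper is not used by the adapter itself.
def _demo_sentinel_extraction():
    sample = ('<html><body>site chrome'
              '<!-- headerend --><p>Chapter text goes here.</p><!-- footerstart -->'
              'more site chrome</body></html>')
    marked = sample.replace(
        '<!-- headerend -->',
        '<crazytagstringnobodywouldstumbleonaccidently id="storytext">').replace(
        '<!-- footerstart -->',
        '</crazytagstringnobodywouldstumbleonaccidently>')
    demo_soup = bs.BeautifulStoneSoup(marked)
    node = demo_soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id': 'storytext'})
    # Swap the sentinel back to a plain div, as getText() does.
    return node.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently', 'div')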