From 379efc34f17acbd75b05c6ebc8516d295a6e941e Mon Sep 17 00:00:00 2001 From: wsuetholz Date: Tue, 9 Nov 2010 16:35:46 -0600 Subject: [PATCH] Add a title page to the resulting EPUB file. This required scraping more information from the web pages in order to populate the new fields. This change includes a change to the way that the output.py uses the adapters. It now gets passed in the adapter and then calls functions from the adapter in order to retrieve the scraped information. This will make it easier down the road when adding more information, or even pictures. --- adapter.py | 73 +++++- constants.py | 82 ++++-- downaloder.py => downloader.py | 9 +- ffnet.py | 293 +++++++++++++++++++++- fictionalley.py | 269 +++++++++++++++++++- ficwad.py | 288 +++++++++++++++++++-- hpfiction.py | 226 ++++++++++++++++- output.py | 67 +++-- twilighted.py | 439 +++++++++++++++++++++++++-------- 9 files changed, 1577 insertions(+), 169 deletions(-) rename downaloder.py => downloader.py (91%) diff --git a/adapter.py b/adapter.py index bf37a017..246f4177 100644 --- a/adapter.py +++ b/adapter.py @@ -29,11 +29,80 @@ class FanfictionSiteAdapter: def setPassword(self, password): pass - def getStoryName(self): + def getStoryURL(self): + pass + + def getUUID(self): + pass + + def getOutputName(self): + pass + + def getAuthorURL(self): + pass + + def getAuthorId(self): pass def getAuthorName(self): pass + def getStoryId(self): + pass + + def getStoryName(self): + pass + + def getStoryDescription(self): + pass + + def getStoryCreated(self): + pass + + def getStoryPublished(self): + pass + + def getStoryUpdated(self): + pass + + def getStorySeries(self): + pass + + def getLanguage(self): + pass + + def getLanguageId(self): + pass + + def getSubjects(self): + pass + + def getCharacters(self): + pass + + def getPublisher(self): + pass + + def getNumChapters(self): + pass + + def getNumWords(self): + pass + + def getCategory(self): + pass + + def getGenre(self): + pass + + def getStoryStatus(self): + pass + + def getStoryRating(self): + pass + + def getStoryUserRating(self): + pass + def getPrintableUrl(self, url): - pass \ No newline at end of file + pass diff --git a/constants.py b/constants.py index e01342d9..6ea1f086 100644 --- a/constants.py +++ b/constants.py @@ -15,6 +15,9 @@ h6 { text-align: center; } padding:0px; } .center {text-align: center;} +.cover {text-align: center;} +.full {width: 100%; } +.quarter {width: 25%; } .smcap {font-variant: small-caps;} .u {text-decoration: underline;} .bold {font-weight: bold;} @@ -22,6 +25,37 @@ h6 { text-align: center; } MIMETYPE = '''application/epub+zip''' +TITLE_PAGE = ''' +%s - %s +
+

%s

+

by %s

+
+ + + + + + + + + + + + + + + + + + + + + +
Category:%s
Genre:%s
Status:%s
Published:%s
Updated:%s
Packaged:%s
Rating Age/User:%s / %s
Chapters/Words:%s / %s
URL:

%s

Summary:
%s
+
+''' + CONTAINER = ''' @@ -30,42 +64,60 @@ CONTAINER = ''' ''' -CONTENT_START = ''' +CONTENT_START = ''' - + unique-identifier="fanficdownloader-uuid"> + + BookID-Epub-%s %s %s - en-UK + fanficdownloader [http://fanficdownloader.googlecode.com] + %s - fanfiction - sgzmd - %s + %s + %s + %s + + %s +''' + +CONTENT_END_METADATA = ''' %s + %s + %s + %s + FanFiction + ''' -CONTENT_ITEM = ''' +CONTENT_SUBJECT = ''' %s ''' -CONTENT_END_MANIFEST = ''' - +CONTENT_ITEM = ''' ''' -CONTENT_ITEMREF = ''' +CONTENT_END_MANIFEST = ''' + ''' -CONTENT_END = ''' +CONTENT_ITEMREF = ''' +''' + +CONTENT_END = ''' ''' TOC_START = ''' - + @@ -502,3 +554,5 @@ FB2_DESCRIPTION = ''' 2.0 ''' + +HTML_ESC_Definitions = 'HTML_Escape.def' diff --git a/downaloder.py b/downloader.py similarity index 91% rename from downaloder.py rename to downloader.py index b8af3abe..f8ca80c6 100644 --- a/downaloder.py +++ b/downloader.py @@ -34,6 +34,7 @@ class FanficLoader: self.inmemory = inmemory self.compress = compress self.badLogin = False + self.overWrite = True def getAdapter(): return self.adapter @@ -48,7 +49,13 @@ class FanficLoader: raise adapter.LoginRequiredException(self.adapter.url) urls = self.adapter.extractIndividualUrls() - self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress) + + s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format + if not self.overWrite and os.path.isfile(s): + print >> sys.stderr, "File " + s + " already exists! Skipping!" + exit(10) + + self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress) i = 1 for u,n in urls: diff --git a/ffnet.py b/ffnet.py index f3e101fc..7320ec5a 100644 --- a/ffnet.py +++ b/ffnet.py @@ -15,6 +15,8 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from constants import * from adapter import * @@ -40,10 +42,37 @@ class FFNet(FanfictionSiteAdapter): self.storyName = 'FF.Net story' self.authorName = 'FF.Net author' + self.outputName = 'FF.Net_story' + self.storyDescription = 'Fanfiction Story' + self.storyCharacters = [] + self.storySeries = '' + self.authorId = '0' + self.authorURL = self.path + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('FanFiction') + logging.debug('self.subjects=%s' % self.subjects) + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'FF.Net Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + logging.debug('self.path=%s' % self.path) + spl = self.path.split('/') + logging.debug('spl=%s' % spl) if len(spl) == 5: self.path = "/".join(spl[1:-1]) + self.outputName = spl[4] + '-ffnet_' + spl[2] if self.path.startswith('/'): self.path = self.path[1:] @@ -51,10 +80,14 @@ class FFNet(FanfictionSiteAdapter): if self.path.endswith('/'): self.path = self.path[:-1] + logging.debug('self.path=%s' % self.path) + (s, self.storyId, chapter) = self.path.split('/') - logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter)) - + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + + logging.debug('self.storyId=%s, chapter=%s, self.outputName=%s' % (self.storyId, chapter, self.outputName)) if not appEngine: self.opener = u2.build_opener(u2.HTTPCookieProcessor()) else: @@ -70,7 +103,70 @@ class FFNet(FanfictionSiteAdapter): def performLogin(self, url = None): return True + + def _getVarValue(self, varstr): + #logging.debug('_getVarValue varstr=%s' % varstr) + vals = varstr.split('=') + #logging.debug('vals=%s' % vals) + retstr="".join(vals[+1:]) + #logging.debug('retstr=%s' % retstr) + if retstr.startswith(' '): + retstr = retstr[1:] + if retstr.endswith(';'): + retstr = retstr[:-1] + return retstr + def _splitCrossover(self, subject): + if "Crossover" in subject: + self._addSubject ("Crossover") + logging.debug('Crossover=%s' % subject) + if subject.find(' and ') != -1: + words = subject.split(' ') + logging.debug('words=%s' % words) + subj = '' + for s in words: + if s in "and Crossover": + if len(subj) > 0: + self._addSubject(subj) + subj = '' + else: + if len(subj) > 0: + subj = subj + ' ' + subj = subj + s + if len(subj) > 0: + self._addSubject(subj) + else: + self._addSubject(subject) + else: + self._addSubject(subject) + return True + + def _splitGenre(self, subject): + if len(subject) > 0: + words = subject.split('/') + logging.debug('words=%s' % words) + for subj in words: + if len(subj) > 0: + self._addSubject(subj) + return True + + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def _fetchUrl(self, url): if not appEngine: return self.opener.open(url).read().decode('utf-8') @@ -85,6 +181,8 @@ class FFNet(FanfictionSiteAdapter): for a in allA: if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1: self.authorName = a.string + (u1, u2, self.authorId, u3) = a['href'].split('/') + logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName)) urls = [] lines = data.split('\n') @@ -92,9 +190,38 @@ class FFNet(FanfictionSiteAdapter): if l.find("»") != -1 and l.find('') != -1: s2 = bs.BeautifulStoneSoup(l) self.storyName = str(s2.find('b').string) + logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName)) elif l.find(" 0: continue @@ -102,6 +229,8 @@ class FFNet(FanfictionSiteAdapter): u = l.decode('utf-8') except UnicodeEncodeError, e: u = l + except: + u = l.encode('ascii', 'xmlcharrefreplace') u = re.sub('&\#[0-9]+;', ' ', u) s2 = bs.BeautifulSoup(u) options = s2.findAll('option') @@ -110,19 +239,69 @@ class FFNet(FanfictionSiteAdapter): title = o.string logging.debug('URL = `%s`, Title = `%s`' % (url, title)) urls.append((url,title)) - if len(urls) == 0: + elif l.find("var chapters") != -1: + self.numChapters = self._getVarValue (l) + logging.debug('self.numChapters=%s' % self.numChapters) + elif l.find("var words") != -1: + self.numWords = self._getVarValue (l) + logging.debug('self.numWords=%s' % self.numWords) + elif l.find("var categoryid") != -1: + categoryid = self._getVarValue (l) + logging.debug('categoryid=%s' % categoryid) + elif l.find("var cat_title") != -1: + self.category = self._getVarValue (l).strip("'") + logging.debug('self.category=%s' % self.category) + self._splitCrossover(self.category) + logging.debug('self.subjects=%s' % self.subjects) + elif l.find("var summary") != -1: + self.storyDescription = self._getVarValue (l).strip("'") + if '&' in self.storyDescription: + s = self.storyDescription.split('&') + logging.debug('s=%s' % s) + self.storyDescription = '' + for ss in s: + if len(self.storyDescription) > 0: + if len(ss) > 4 and 'amp;' in ss[1:4]: + self.storyDescription = self.storyDescription + '&' + ss + else: + self.storyDescription = self.storyDescription + '&' + ss + else: + self.storyDescription = ss + logging.debug('self.storyDescription=%s' % self.storyDescription) + elif l.find("var datep") != -1: + dateps = self._getVarValue (l) + self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5]) + logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S")) + elif l.find("var dateu") != -1: + dateus = self._getVarValue (l) + self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5]) + logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S")) + + if len(urls) <= 0: # no chapters found, try url by itself. urls.append((self.url,self.storyName)) + + self.uuid = 'urn:uuid:' + self.host + '-a.' + self.authorId + '-s.' + self.storyId + self.authorURL = 'http://' + self.host + '/u/' + self.authorId + logging.debug('self.uuid=%s' % self.uuid) + + #logging.debug('urls=%s' % urls) return urls def getText(self, url): + time.sleep( 2.0 ) data = self._fetchUrl(url) + lines = data.split('\n') + + textbuf = '' + emit = False + olddata = data try: data = data.decode('utf8') except: data = olddata - + try: soup = bs.BeautifulStoneSoup(data) except: @@ -131,23 +310,121 @@ class FFNet(FanfictionSiteAdapter): div = soup.find('div', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit (20) return '' return div.__str__('utf8') - + def setLogin(self, login): self.login = login def setPassword(self, password): self.password = password - def getStoryName(self): - return self.storyName + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getOutputName(self): + logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName)) + return self.outputName def getAuthorName(self): + logging.debug('self.authorName=%s' % self.authorName) return self.authorName + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryName(self): + logging.debug('self.storyName=%s' % self.storyName) + return self.storyName + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getPrintableUrl(self, url): + pass + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + class FFA_UnitTests(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) diff --git a/fictionalley.py b/fictionalley.py index 884720fd..20763bf9 100644 --- a/fictionalley.py +++ b/fictionalley.py @@ -12,13 +12,20 @@ import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs import time as time +import datetime from adapter import * class FictionAlley(FanfictionSiteAdapter): def __init__(self, url): self.url = url - self.host = up.urlparse(url).netloc + parsedUrl = up.urlparse(url) + self.host = parsedUrl.netloc + self.path = parsedUrl.path + + logging.debug('self.host=%s' % self.host) + logging.debug('self.path=%s' % self.path) + cookieproc = u2.HTTPCookieProcessor() # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff. @@ -35,6 +42,36 @@ class FictionAlley(FanfictionSiteAdapter): rfc2109=False) cookieproc.cookiejar.set_cookie(cookie) self.opener = u2.build_opener(cookieproc) + + ss = self.path.split('/') + + self.storyDescription = 'Fanfiction Story' + self.authorId = '' + self.authorURL = '' + self.storyId = '' + if len(ss) > 2 and ss[1] == 'authors': + self.authorId = ss[2] + self.authorURL = 'http://' + self.host + '/authors/' + self.authorId + if len(ss) > 3: + self.storyId = ss[3].replace ('.html','') + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + def requiresLogin(self, url = None): return False @@ -48,31 +85,147 @@ class FictionAlley(FanfictionSiteAdapter): def setPassword(self, password): self.password = password + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + + def _processChapterHeaders(self, div): + brs = div.findAll ('br') + for br in brs: + keystr='' + valstr='' + if len(br.contents) > 2: + keystr = br.contents[1] + if keystr is not None: + strs = re.split ("<[^>]+>", str(keystr)) + keystr='' + for s in strs: + keystr = keystr + s + valstr = br.contents[2].strip(' ') + if keystr is not None: + if keystr == 'Rating:': + self.storyRating = valstr + logging.debug('self.storyRating=%s' % self.storyRating) + elif keystr == 'Genre:': + self.genre = valstr + logging.debug('self.genre=%s' % self.genre) + s2 = valstr.split(', ') + for ss2 in s2: + self._addSubject(ss2) + logging.debug('self.subjects=%s' % self.subjects) + elif keystr == 'Main Character(s):': + s2 = valstr.split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif keystr == 'Summary:': + self.storyDescription = valstr + logging.debug('self.storyDescription=%s' % self.storyDescription) + + def extractIndividualUrls(self): data = self.opener.open(self.url).read() + + # There is some usefull information in the headers of the first chapter page.. + data = data.replace('','').replace('','') soup = bs.BeautifulStoneSoup(data) # Get title from , remove before '-'. title = soup.find('title').string self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","") + self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId - links = soup.findAll('a', { 'class' : 'chapterlink' } ) + links = soup.findAll('li') + # If it is decided that we really do care about number of words.. It's only available on the author's page.. + #d0 = self.opener.open(self.authorURL).read() + #soupA = bs.BeautifulStoneSoup(d0) + #dls = soupA.findAll('dl') + #logging.debug('dls=%s' % dls) + + self.numChapters = 0; result = [] if len(links) == 0: + # Be aware that this means that the user has entered the {STORY}01.html + # We will not have valid Publised and Updated dates. User should enter + # the {STORY}.html instead. We should force that instead of this. breadcrumbs = soup.find('div', {'class': 'breadcrumbs'}) self.authorName = breadcrumbs.a.string.replace("'s Fics","") result.append((self.url,self.storyName)) + #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName)) + self.numChapters = self.numChapters + 1; + div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'}) + if div is not None: + self._processChapterHeaders(div) else: author = soup.find('h1', {'class' : 'title'}) self.authorName = author.a.string - for a in links: - url = a['href'] - title = a.string - result.append((url,title)) + summary = soup.find('div', {'class' : 'summary'}) + ss = summary.contents + if len(ss) > 1: + ss1 = ss[0].split(': ') + if len(ss1) > 1 and ss1[0] == 'Rating': + self.storyRating = ss1[1] + logging.debug('self.storyRating=%s' % self.storyRating) + self.storyDescription = str(ss[1]).replace("<br>","").replace("</br>","").replace('\n','') + logging.debug('self.storyDescription=%s' % self.storyDescription) + + for li in links: + a = li.find('a', {'class' : 'chapterlink'}) + s = li.contents + if a is not None: + url = a['href'] + title = a.string + result.append((url,title)) + #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title)) + if self.numChapters == 0: + # fictionalley uses full URLs in chapter list. + d1 = self.opener.open(url).read() + + # find <!-- headerstart --> & <!-- headerend --> and + # replaced with matching div pair for easier parsing. + # Yes, it's an evil kludge, but what can ya do? Using + # something other than div prevents soup from pairing + # our div with poor html inside the story text. + d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>') + sop = bs.BeautifulStoneSoup(d1) + + div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'}) + if div is not None: + self._processChapterHeaders(div) + + self.numChapters = self.numChapters + 1 + if len(s) > 1: + datestr='' + ss2 = s[1].replace('\n','').replace('(','').split(' ') + if len(ss2) > 2 and ss2[0] == 'Posted:': + datestr = ss2[1] + ' ' + ss2[2] + tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S"))) + if self.numChapters == 1: + self.storyPublished = tmpdate + self.storyUpdated = tmpdate + logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated)) + else: + logging.debug('li chapterlink not found! li=%s' % li) - #print('Story "%s" by %s' % (self.storyName, self.authorName)) + + print('Story "%s" by %s' % (self.storyName, self.authorName)) + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) return result @@ -82,6 +235,9 @@ class FictionAlley(FanfictionSiteAdapter): def getAuthorName(self): return self.authorName + def getOutputName(self): + return self.outputName + def getText(self, url): # fictionalley uses full URLs in chapter list. data = self.opener.open(url).read() @@ -97,10 +253,96 @@ class FictionAlley(FanfictionSiteAdapter): div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return '<html/>' - return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div') + + html = soup.findAll('html') + if len(html) > 1: + return html[1].__str__('utf8') + else: + return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div') + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + def getPrintableUrl(self, url): return url @@ -114,6 +356,15 @@ class FictionAlley(FanfictionSiteAdapter): login = dict(login = 'name', password = 'pass') other = dict(submit = 'Log In', remember='yes') return (login, other) + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + if __name__ == '__main__': diff --git a/ficwad.py b/ficwad.py index 28b71584..133d424a 100644 --- a/ficwad.py +++ b/ficwad.py @@ -12,6 +12,8 @@ import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs import logging +import time +import datetime from adapter import * @@ -32,7 +34,44 @@ class FicWad(FanfictionSiteAdapter): def setPassword(self, password): self.password = password + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'PG' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + data = u2.urlopen(self.url).read() soup = bs.BeautifulStoneSoup(data) @@ -40,50 +79,254 @@ class FicWad(FanfictionSiteAdapter): crumbtrail = story.find('h3') # the only h3 ficwad uses. allAhrefs = crumbtrail.findAll('a') # last of crumbtrail - self.storyName = allAhrefs[-1].string.strip() + storyinfo = allAhrefs[-1] + (u0, u1, storyid) = storyinfo['href'].split('/') + if u1 == "story": + # This page does not have the correct information on it.. Need to get the Story Title Page + logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid)) + self.url = 'http://' + self.host + '/' + u1 + '/' + storyid + data = u2.urlopen(self.url).read() + soup = bs.BeautifulStoneSoup(data) + + story = soup.find('div', {'id' : 'story'}) + crumbtrail = story.find('h3') # the only h3 ficwad uses. + allAhrefs = crumbtrail.findAll('a') + # save chapter name from header in case of one-shot. - chaptername = story.find('h4').find('a').string.strip() + storyinfo = story.find('h4').find('a') + (u0, u1, self.storyId) = storyinfo['href'].split('/') + self.storyName = storyinfo.string.strip() + self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId + + logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName)) author = soup.find('span', {'class' : 'author'}) self.authorName = str(author.a.string) + (u0, u1,self.authorId) = author.a['href'].split('/') + self.authorURL = 'http://' + self.host + author.a['href'] + logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId)) - select = soup.find('select', { 'name' : 'goto' } ) + description = soup.find('blockquote', {'class' : 'summary'}) + if description is not None: + self.storyDescription = str(description.p.string) + logging.debug('self.storyDescription=%s' % self.storyDescription) + + meta = soup.find('p', {'class' : 'meta'}) + if meta is not None: + s = str(meta).replace('\n',' ').replace('\t','').split(' - ') + logging.debug('meta.s=%s' % s) + for ss in s: + s1 = ss.replace(' ','').split(':') + #logging.debug('meta.s.s1=%s' % s1) + if len(s1) > 1: + s2 = re.split ('<[^>]+>', s1[0]) + #logging.debug('meta.s.s1.s2=%s' % s2) + if len(s2) > 1: + s1[0] = s2[1] + skey = s1[0].strip() + #logging.debug('Checking = %s' % skey) + if skey == 'Category': + soup1 = bs.BeautifulStoneSoup(s1[1]) + allAs = soup1.findAll('a') + for a in allAs: + if self.category == 'Category': + self.category = str(a.string) + logging.debug('self.category=%s' % self.category) + self._addSubject(self.category) + logging.debug('self.subjects=%s' % self.subjects) + elif skey == 'Rating': + self.storyRating = s1[1] + logging.debug('self.storyRating=%s' % self.storyRating) + elif skey == 'Genres': + self.genre = s1[1] + logging.debug('self.genre=%s' % self.genre) + s2 = s1[1].split(', ') + for ss2 in s2: + self._addSubject(ss2) + logging.debug('self.subjects=%s' % self.subjects) + elif skey == 'Characters': + s2 = s1[1].split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif skey == 'Chapters': + self.numChapters = s1[1] + logging.debug('self.numChapters=%s' % self.numChapters) + elif skey == 'Warnings': + logging.debug('Warnings=%s' % s1[1]) + elif skey == 'Published': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d"))) + logging.debug('self.storyPublished=%s' % self.storyPublished) + elif skey == 'Updated': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d"))) + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + else: + s3 = re.split ('<[^>]+>', s1[0]) + #logging.debug('meta.s.s1.s3=%s' % s3) + if len(s3) > 1: + s1[0] = s3[0] + s4 = s1[0].split('w') + #logging.debug('meta.s.s1.s4=%s' % s4) + if len(s4) > 1 and s4[1] == 'ords': + self.numWords = s4[0] + logging.debug('self.numWords=%s' % self.numWords) + + + print('Story "%s" by %s' % (self.storyName, self.authorName)) result = [] - if select is None: - # Single chapter storys don't have title in crumbtrail, just 'chapter' title in h4. - self.storyName = chaptername - # no chapters found, try url by itself. - result.append((self.url,self.storyName)) - else: - allOptions = select.findAll('option') - for o in allOptions: - url = 'http://' + self.host + o['value'] - title = o.string - # ficwad includes 'Story Index' in the dropdown of chapters, - # but it's not a real chapter. - if title != "Story Index": - result.append((url,title)) + ii = 1 + + storylist = soup.find('ul', {'id' : 'storylist'}) + if storylist is not None: + allH4s = storylist.findAll('h4') + #logging.debug('allH4s=%s' % allH4s) + + if allH4s is not None: + for h4 in allH4s: + chapterinfo = h4.find('a') + #logging.debug('Chapter1=%s' % chapterinfo) + url = 'http://' + self.host + chapterinfo['href'] + title = chapterinfo.string.strip() + #logging.debug('Chapter=%s, %s' % (url, title)) + # ficwad includes 'Story Index' in the dropdown of chapters, + # but it's not a real chapter. + if title != "Story Index": + logging.debug('Chapter[%s]=%s, %s' % (ii, url, title)) + result.append((url,title)) + ii = ii+1 + else: + logging.debug('Skipping Story Index. URL %s' % url) + + if ii == 1: + select = soup.find('select', { 'name' : 'goto' } ) + + if select is None: + result.append((self.url,self.storyName)) + logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName)) + else: + allOptions = select.findAll('option') + for o in allOptions: + url = 'http://' + self.host + o['value'] + title = o.string + # ficwad includes 'Story Index' in the dropdown of chapters, + # but it's not a real chapter. + if title != "Story Index": + logging.debug('Chapter[%s]=%s, %s' % (ii, url, title)) + result.append((url,title)) + ii = ii+1 + else: + logging.debug('Skipping Story Index. URL %s' % url) + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + return result def getStoryName(self): return self.storyName + def getOutputName(self): + return self.outputName + def getAuthorName(self): return self.authorName def getText(self, url): + if url.find('http://') == -1: + url = 'http://' + self.host + '/' + url + data = u2.urlopen(url).read() soup = bs.BeautifulStoneSoup(data) div = soup.find('div', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return '<html/>' return div.__str__('utf8') + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + def getPrintableUrl(self, url): return url @@ -98,6 +341,15 @@ class FicWad(FanfictionSiteAdapter): other = dict(submit = 'Log In', remember='yes') return (login, other) + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + + if __name__ == '__main__': url = 'http://www.ficwad.com/story/14536' diff --git a/hpfiction.py b/hpfiction.py index 75cb4597..9f6cd467 100644 --- a/hpfiction.py +++ b/hpfiction.py @@ -15,6 +15,8 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from constants import * from adapter import * @@ -32,8 +34,37 @@ class HPFiction(FanfictionSiteAdapter): self.host = parsedUrl.netloc self.path = parsedUrl.path + logging.debug('self.url=%s' % self.url) + logging.debug('self.host=%s' % self.host) + logging.debug('self.path=%s' % self.path) + self.opener = u2.build_opener(u2.HTTPCookieProcessor()) + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.authorURL = '' + (u1, self.storyId) = self.url.split('=') + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.subjects.append ('Harry Potter') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + logging.debug("Created HPFiction: url=%s" % (self.url)) def _getLoginScript(self): @@ -45,23 +76,116 @@ class HPFiction(FanfictionSiteAdapter): def performLogin(self, url = None): return True + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): data = self.opener.open(self.url).read() soup = bs.BeautifulSoup(data) links = soup.findAll('a') + def_chapurl = '' + def_chaptitle = '' for a in links: if a['href'].find('psid') != -1: self.storyName = a.string + logging.debug('self.storyName=%s' % self.storyName) elif a['href'].find('viewuser.php') != -1: self.authorName = a.string + self.authorURL = 'http://' + self.host + '/' + a['href'] + (u1, self.authorId) = a['href'].split('=') + logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId)) + elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0: + def_chapurl = 'http://' + self.host + '/viewstory.php' + str(a['href']) + def_chaptitle = a.string + logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle)) + + centers = soup.findAll('center') + for center in centers: + tds = center.findAll ('td') + if tds is not None and len(tds) > 0: + for td in tds: + s = re.split ("<[^>]+>", str(td).replace('\n','').replace(' ',' ')) + logging.debug('s=%s' % s) + ii = 0 + ll = len(s) + sss = '' + while ii < ll - 1: + if s[ii] is not None and len(s[ii]) > 0: + if s[ii] == 'Rating:': + self.storyRating = s[ii+1] + logging.debug('self.storyRating=%s' % self.storyRating) + ii = ii + 2 + elif s[ii] == 'Chapters:': + self.numChapters = s[ii+1] + logging.debug('self.numChapters=%s' % self.numChapters) + ii = ii + 2 + elif s[ii] == 'Characters:': + s2 = s[ii+1].split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + ii = ii + 2 + elif s[ii] == 'Genre(s):': + self.genre = s[ii+1] + logging.debug('self.genre=%s' % self.genre) + s2 = s[ii+1].split(', ') + for ss2 in s2: + self._addSubject(ss2) + logging.debug('self.subjects=%s' % self.subjects) + ii = ii + 2 + elif s[ii] == 'Status:': + if s[ii+1].strip(' ') == "Work In Progress": + self.storyStatus = 'In-Progress' + else: + self.storyStatus = 'Completed' + ii = ii + 2 + elif s[ii] == 'First Published:': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d"))) + logging.debug('self.storyPublished=%s' % self.storyPublished) + ii = ii + 2 + elif s[ii] == 'Last Updated:': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d"))) + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + ii = ii + 2 + elif s[ii] == 'Last Published Chapter:': + ii = ii + 2 + elif s[ii] == 'Pairings:': + ii = ii + 2 + elif s[ii] == 'Warnings:': + ii = ii + 2 + else: + sss = sss + ' ' + s[ii] + ii = ii + 1 + else: + ii = ii + 1 + self.storyDescription = sss + logging.debug('self.storyDescription=%s' % self.storyDescription) urls = [] + self.outputName = self.storyName.replace(" ", "_") + '-hp_' + self.storyId + select = soup.find('select', {'name' : 'chapterid'}) if select is None: # no chapters found, try url by itself. - urls.append((self.url,self.storyName)) + if len(def_chapurl) > 0: + urls.append((def_chapurl, def_chaptitle)) + else: + urls.append((self.url,self.storyName)) else: for o in select.findAll('option'): if 'value' in o._getAttrMap(): @@ -69,11 +193,18 @@ class HPFiction(FanfictionSiteAdapter): title = o.string if title != "Story Index": urls.append((url,title)) + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + return urls def getStoryName(self): return self.storyName + def getOutputName(self): + return self.outputName + def getAuthorName(self): return self.authorName @@ -84,9 +215,100 @@ class HPFiction(FanfictionSiteAdapter): divtext = soup.find('div', {'id' : 'fluidtext'}) if None == divtext: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return divtext.__str__('utf8') + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + + + class FF_UnitTests(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) diff --git a/output.py b/output.py index 1700bfe7..9ffb1503 100644 --- a/output.py +++ b/output.py @@ -26,6 +26,7 @@ from constants import * import html2text +import datetime class FanficWriter: @@ -41,8 +42,8 @@ class FanficWriter: class TextWriter(FanficWriter): htmlWriter = None - def __init__(self, base, name, author, inmemory=False, compress=False): - self.htmlWriter = HTMLWriter(base, name, author, True, False) + def __init__(self, base, adapter, inmemory=False, compress=False): + self.htmlWriter = HTMLWriter(base, adapter, True, False) def writeChapter(self, index, title, text): self.htmlWriter.writeChapter(index, title, text) @@ -57,12 +58,13 @@ class TextWriter(FanficWriter): class HTMLWriter(FanficWriter): body = '' - def __init__(self, base, name, author, inmemory=False, compress=False): + def __init__(self, base, adapter, inmemory=False, compress=False): self.basePath = base - self.storyTitle = removeEntities(name) - self.name = makeAcceptableFilename(name) - self.fileName = self.basePath + '/' + self.name + '.html' - self.authorName = removeEntities(author) + self.storyTitle = removeEntities(adapter.getStoryName()) + self.name = makeAcceptableFilename(adapter.getOutputName()) + self.fileName = self.basePath + '/' + self.name + '.html' + self.authorName = removeEntities(adapter.getAuthorName()) + self.adapter = adapter self.inmemory = inmemory @@ -131,14 +133,14 @@ class EPubFanficWriter(FanficWriter): for f in self.files: self.files[f].close() - def __init__(self, base, name, author, inmemory=False, compress=True): + def __init__(self, base, adapter, inmemory=False, compress=True): self.basePath = base - self.storyTitle = removeEntities(name) - self.name = makeAcceptableFilename(name) + self.storyTitle = removeEntities(adapter.getStoryName()) + self.name = makeAcceptableFilename(adapter.getOutputName()) self.directory = self.basePath + '/' + self.name - self.authorName = removeEntities(author) - + self.authorName = removeEntities(adapter.getAuthorName()) self.inmemory = inmemory + self.adapter = adapter self.files = {} self.chapters = [] @@ -226,17 +228,50 @@ class EPubFanficWriter(FanficWriter): tocFilePath = "OEBPS/toc.ncx" # toc = open(tocFilePath, 'w') # print >> toc, TOC_START % self.storyTitle - self._writeFile(tocFilePath, TOC_START % self.storyTitle) + self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle)) + + published = self.adapter.getStoryPublished().strftime("%Y-%m-%d") + createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S") + created = self.adapter.getStoryCreated().strftime("%Y-%m-%d") + updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d") + calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S") + + ### writing content -- title page + titleFilePath = "OEBPS/title_page.xhtml" + self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryDescription())) + ### writing content -- opf file opfFilePath = "OEBPS/content.opf" - + # opf = open(opfFilePath, 'w') - self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn)) + self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, self.adapter.getStoryDescription())) + + i = 0 + subjs = [] + subjs = self.adapter.getSubjects() + for subj in subjs: + self._writeFile(opfFilePath, CONTENT_SUBJECT % subj) + i = i + 1 + if (i <= 0): + self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction") + + self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating())) # print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName) ids = [] - i = 1 + i = 0 + + t = "Title Page" + f = "title_page.xhtml" + chapterId = "Title Page" + self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f)) + self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f)) + + ids.append(chapterId) + + i = i + 1 + for t,f in self.chapters: chapterId = "chapter%04d" % i diff --git a/twilighted.py b/twilighted.py index a7e77a53..f7654041 100644 --- a/twilighted.py +++ b/twilighted.py @@ -11,119 +11,360 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from adapter import * import twipassword class Twilighted(FanfictionSiteAdapter): - def __init__(self, url): - self.url = url - parsedUrl = up.urlparse(url) - self.host = parsedUrl.netloc - self.path = parsedUrl.path - self.opener = u2.build_opener(u2.HTTPCookieProcessor()) - self.password=twipassword.password - self.login='sigizmund' - logging.debug("Created Twilighted: url=%s" % (self.url)) - - - def requiresLogin(self, url = None): - # potionsandsnitches.net doesn't require login. - if self.host == 'potionsandsnitches.net': - return False - else: - return True - - def performLogin(self, url = None): - data = {} - - data['penname'] = self.login - data['password'] = self.password - data['cookiecheck'] = '1' - data['submit'] = 'Submit' - - urlvals = u.urlencode(data) - loginUrl = 'http://' + self.host + self._getLoginScript() - logging.debug("Will now login to URL %s" % loginUrl) - - req = self.opener.open(loginUrl, urlvals) - - d = req.read().decode('utf-8') - - if self.reqLoginData(d) : - return False - else: - return True - - - def setLogin(self, login): - self.login = login - - def setPassword(self, password): - self.password = password - - def extractIndividualUrls(self): - data = self.opener.open(self.url).read() + def __init__(self, url): + self.url = url + parsedUrl = up.urlparse(url) + self.host = parsedUrl.netloc + self.path = parsedUrl.path + self.opener = u2.build_opener(u2.HTTPCookieProcessor()) + self.password=twipassword.password + self.login='sigizmund' + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.authorURL = '' + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.subjects.append ('Twilight') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'PG' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) - if self.reqLoginData(data): - self.performLogin() - data = self.opener.open(self.url).read() - if self.reqLoginData(data): - return None + logging.debug("Created Twilighted: url=%s" % (self.url)) + + + def requiresLogin(self, url = None): + # potionsandsnitches.net doesn't require login. + if self.host == 'potionsandsnitches.net': + return False + else: + return True + + def performLogin(self, url = None): + data = {} - soup = bs.BeautifulStoneSoup(data) - - title = soup.find('title').string - self.storyName = title.split(' by ')[0].strip() - self.authorName = title.split(' by ')[1].strip() - - select = soup.find('select', { 'name' : 'chapter' } ) - - result = [] - if select is None: - # no chapters found, try url by itself. - result.append((self.url,self.storyName)) - else: - allOptions = select.findAll('option') - for o in allOptions: - url = self.url + "&chapter=%s" % o['value'] - title = o.string - result.append((url,title)) - - return result - - def getStoryName(self): - return self.storyName - - def getAuthorName(self): - return self.authorName - - def getText(self, url): - if url.find('http://') == -1: - url = 'http://' + self.host + '/' + url - - logging.debug('Getting data from: %s' % url) - - data = self.opener.open(url).read() + data['penname'] = self.login + data['password'] = self.password + data['cookiecheck'] = '1' + data['submit'] = 'Submit' - soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES) + urlvals = u.urlencode(data) + loginUrl = 'http://' + self.host + self._getLoginScript() + logging.debug("Will now login to URL %s" % loginUrl) + + req = self.opener.open(loginUrl, urlvals) + + d = req.read().decode('utf-8') + + if self.reqLoginData(d) : + return False + else: + return True - div = soup.find('div', {'id' : 'story'}) - if None == div: - return '<html/>' + def setLogin(self, login): + self.login = login - return div.__str__('utf8') + def setPassword(self, password): + self.password = password - def _getLoginScript(self): - return '/user.php?action=login' + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True - def reqLoginData(self, data): - if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1: - return True - else: - return False + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): + data = self.opener.open(self.url).read() + + if self.reqLoginData(data): + self.performLogin() + data = self.opener.open(self.url).read() + if self.reqLoginData(data): + return None + + soup = bs.BeautifulStoneSoup(data) + + title = soup.find('title').string + self.storyName = title.split(' by ')[0].strip() + self.authorName = title.split(' by ')[1].strip() + self.outputName = self.storyName.replace(" ", "_") + + select = soup.find('select', { 'name' : 'chapter' } ) + + result = [] + if select is None: + # no chapters found, try url by itself. + result.append((self.url,self.storyName)) + else: + allOptions = select.findAll('option') + for o in allOptions: + url = self.url + "&chapter=%s" % o['value'] + title = o.string + result.append((url,title)) + + url = self.url + "&index=1" + data = self.opener.open(url).read() + lines = data.split('\n') + soup = bs.BeautifulStoneSoup(data) + metas = soup.findAll('meta') + for meta in metas: + if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1: + #logging.debug('Meta: %s' % meta) + if 'content' in meta._getAttrMap(): + s1 = bs.BeautifulStoneSoup(meta['content']) + ps = s1.findAll('p') + if len(ps) > 0: + self.storyDescription = ps[0] + logging.debug('self.storyDescription=%s' % (self.storyDescription)) + else: + divs = meta.findAll('div') + #logging.debug('Divs: %s' % divs) + + for div in divs: + #logging.debug('Div: %s' % div) + if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1: + #logging.debug('Div PAGETITLE: %s' % div) + allA = div.findAll('a') + for a in allA: + if 'href' in a._getAttrMap(): + if a['href'].find('viewstory.php?sid=') != -1: + str1 = a.string + (vs, self.storyId) = a['href'].split('=') + logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName)) + self.outputName = self.outputName + "-tw_" + self.storyId + logging.debug('self.outputName=%s' % self.outputName) + if a['href'].find('viewuser.php?uid=') != -1: + str1 = a.string + (vs, self.authorId) = a['href'].split('=') + logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName)) + self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId + logging.debug('self.authorURL=%s' % self.authorURL) + if 'class' in div._getAttrMap() and div['class'].find('content') != -1: + #logging.debug('Div CONTENT: %s' % div) + brs = div.findAll('br') + for br in brs: + buf = unicode(br).encode('utf-8') + strs = re.split ('<[^>]+>', buf) + #logging.debug('BUF: %s' % strs) + ii = 2 + stlen = len(strs) + while stlen > ii+1: + if len(strs[ii]) == 0: + ii = ii+1 + continue + if strs[ii] == 'Categories:': + ii = ii+1 + while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1: + if strs[ii] != ' ' and strs[ii] != ', ': + if self.category == 'Category': + self.category = strs[ii].strip(' ') + self._addSubject(strs[ii].strip(' ')) + ii = ii+1 + logging.debug('self.subjects=%s' % self.subjects) + if strs[ii] == 'Characters: ': + ii = ii+1 + while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1: + if strs[ii] != ' ' and strs[ii] != ', ': + self._addCharacter(strs[ii].strip(' ')) + ii = ii+1 + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif strs[ii] == 'Completed:': + if strs[ii+1].strip(' ') == "No": + self.storyStatus = 'In-Progress' + else: + self.storyStatus = 'Completed' + ii = ii+2 + logging.debug('self.storyStatus=%s' % self.storyStatus) + elif strs[ii] == 'Rated:': + self.storyRating = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.storyRating=%s' % self.storyRating) + elif strs[ii] == 'Series:': + self.storySeries = strs[ii+1].strip(' ') + if self.storySeries == 'None': + self.storySeries = '' + ii = ii+2 + logging.debug('self.storySeries=%s' % self.storySeries) + elif strs[ii] == 'Chapters: ': + self.numChapters = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.numChapters=%s' % self.numChapters) + elif strs[ii] == 'Word count:': + self.numWords = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.numWords=%s' % self.numWords) + elif strs[ii] == ' Published: ': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y"))) + ii = ii+2 + logging.debug('self.storyPublished=%s' % self.storyPublished) + elif strs[ii] == 'Updated:': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y"))) + ii = ii+2 + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + else: + logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1])) + ii = ii+2 + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + + return result + + def getStoryName(self): + return self.storyName + + def getOutputName(self): + return self.outputName + + def getAuthorName(self): + return self.authorName + + def getText(self, url): + if url.find('http://') == -1: + url = 'http://' + self.host + '/' + url + + logging.debug('Getting data from: %s' % url) + + data = self.opener.open(url).read() + + soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES) + + div = soup.find('div', {'id' : 'story'}) + + if None == div: + return '<html/>' + + return div.__str__('utf8') + + def _getLoginScript(self): + return '/user.php?action=login' + + def reqLoginData(self, data): + if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1: + return True + else: + return False + + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries class Twilighted_UnitTests(unittest.TestCase): def setUp(self):