diff --git a/adapter.py b/adapter.py index bf37a017..246f4177 100644 --- a/adapter.py +++ b/adapter.py @@ -29,11 +29,80 @@ class FanfictionSiteAdapter: def setPassword(self, password): pass - def getStoryName(self): + def getStoryURL(self): + pass + + def getUUID(self): + pass + + def getOutputName(self): + pass + + def getAuthorURL(self): + pass + + def getAuthorId(self): pass def getAuthorName(self): pass + def getStoryId(self): + pass + + def getStoryName(self): + pass + + def getStoryDescription(self): + pass + + def getStoryCreated(self): + pass + + def getStoryPublished(self): + pass + + def getStoryUpdated(self): + pass + + def getStorySeries(self): + pass + + def getLanguage(self): + pass + + def getLanguageId(self): + pass + + def getSubjects(self): + pass + + def getCharacters(self): + pass + + def getPublisher(self): + pass + + def getNumChapters(self): + pass + + def getNumWords(self): + pass + + def getCategory(self): + pass + + def getGenre(self): + pass + + def getStoryStatus(self): + pass + + def getStoryRating(self): + pass + + def getStoryUserRating(self): + pass + def getPrintableUrl(self, url): - pass \ No newline at end of file + pass diff --git a/constants.py b/constants.py index e01342d9..6ea1f086 100644 --- a/constants.py +++ b/constants.py @@ -15,6 +15,9 @@ h6 { text-align: center; } padding:0px; } .center {text-align: center;} +.cover {text-align: center;} +.full {width: 100%; } +.quarter {width: 25%; } .smcap {font-variant: small-caps;} .u {text-decoration: underline;} .bold {font-weight: bold;} @@ -22,6 +25,37 @@ h6 { text-align: center; } MIMETYPE = '''application/epub+zip''' +TITLE_PAGE = ''' +%s - %s +
+

%s

+

by %s

+
+ + + + + + + + + + + + + + + + + + + + + +
Category:%s
Genre:%s
Status:%s
Published:%s
Updated:%s
Packaged:%s
Rating Age/User:%s / %s
Chapters/Words:%s / %s
URL:

%s

Summary:
%s
+
+''' + CONTAINER = ''' @@ -30,42 +64,60 @@ CONTAINER = ''' ''' -CONTENT_START = ''' +CONTENT_START = ''' - + unique-identifier="fanficdownloader-uuid"> + + BookID-Epub-%s %s %s - en-UK + fanficdownloader [http://fanficdownloader.googlecode.com] + %s - fanfiction - sgzmd - %s + %s + %s + %s + + %s +''' + +CONTENT_END_METADATA = ''' %s + %s + %s + %s + FanFiction + ''' -CONTENT_ITEM = ''' +CONTENT_SUBJECT = ''' %s ''' -CONTENT_END_MANIFEST = ''' - +CONTENT_ITEM = ''' ''' -CONTENT_ITEMREF = ''' +CONTENT_END_MANIFEST = ''' + ''' -CONTENT_END = ''' +CONTENT_ITEMREF = ''' +''' + +CONTENT_END = ''' ''' TOC_START = ''' - + @@ -502,3 +554,5 @@ FB2_DESCRIPTION = ''' 2.0 ''' + +HTML_ESC_Definitions = 'HTML_Escape.def' diff --git a/downaloder.py b/downloader.py similarity index 91% rename from downaloder.py rename to downloader.py index b8af3abe..f8ca80c6 100644 --- a/downaloder.py +++ b/downloader.py @@ -34,6 +34,7 @@ class FanficLoader: self.inmemory = inmemory self.compress = compress self.badLogin = False + self.overWrite = True def getAdapter(): return self.adapter @@ -48,7 +49,13 @@ class FanficLoader: raise adapter.LoginRequiredException(self.adapter.url) urls = self.adapter.extractIndividualUrls() - self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress) + + s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format + if not self.overWrite and os.path.isfile(s): + print >> sys.stderr, "File " + s + " already exists! Skipping!" 
+ exit(10) + + self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress) i = 1 for u,n in urls: diff --git a/ffnet.py b/ffnet.py index f3e101fc..7320ec5a 100644 --- a/ffnet.py +++ b/ffnet.py @@ -15,6 +15,8 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from constants import * from adapter import * @@ -40,10 +42,37 @@ class FFNet(FanfictionSiteAdapter): self.storyName = 'FF.Net story' self.authorName = 'FF.Net author' + self.outputName = 'FF.Net_story' + self.storyDescription = 'Fanfiction Story' + self.storyCharacters = [] + self.storySeries = '' + self.authorId = '0' + self.authorURL = self.path + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('FanFiction') + logging.debug('self.subjects=%s' % self.subjects) + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'FF.Net Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + logging.debug('self.path=%s' % self.path) + spl = self.path.split('/') + logging.debug('spl=%s' % spl) if len(spl) == 5: self.path = "/".join(spl[1:-1]) + self.outputName = spl[4] + '-ffnet_' + spl[2] if self.path.startswith('/'): self.path = self.path[1:] @@ -51,10 +80,14 @@ class FFNet(FanfictionSiteAdapter): if self.path.endswith('/'): self.path = self.path[:-1] + logging.debug('self.path=%s' % self.path) + (s, self.storyId, chapter) = self.path.split('/') - logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter)) - + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' 
+ self.storyId + logging.debug('self.uuid=%s' % self.uuid) + + logging.debug('self.storyId=%s, chapter=%s, self.outputName=%s' % (self.storyId, chapter, self.outputName)) if not appEngine: self.opener = u2.build_opener(u2.HTTPCookieProcessor()) else: @@ -70,7 +103,70 @@ class FFNet(FanfictionSiteAdapter): def performLogin(self, url = None): return True + + def _getVarValue(self, varstr): + #logging.debug('_getVarValue varstr=%s' % varstr) + vals = varstr.split('=') + #logging.debug('vals=%s' % vals) + retstr="".join(vals[+1:]) + #logging.debug('retstr=%s' % retstr) + if retstr.startswith(' '): + retstr = retstr[1:] + if retstr.endswith(';'): + retstr = retstr[:-1] + return retstr + def _splitCrossover(self, subject): + if "Crossover" in subject: + self._addSubject ("Crossover") + logging.debug('Crossover=%s' % subject) + if subject.find(' and ') != -1: + words = subject.split(' ') + logging.debug('words=%s' % words) + subj = '' + for s in words: + if s in "and Crossover": + if len(subj) > 0: + self._addSubject(subj) + subj = '' + else: + if len(subj) > 0: + subj = subj + ' ' + subj = subj + s + if len(subj) > 0: + self._addSubject(subj) + else: + self._addSubject(subject) + else: + self._addSubject(subject) + return True + + def _splitGenre(self, subject): + if len(subject) > 0: + words = subject.split('/') + logging.debug('words=%s' % words) + for subj in words: + if len(subj) > 0: + self._addSubject(subj) + return True + + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def _fetchUrl(self, url): if not appEngine: return self.opener.open(url).read().decode('utf-8') @@ -85,6 +181,8 @@ class FFNet(FanfictionSiteAdapter): for a in allA: if 'href' in 
a._getAttrMap() and a['href'].find('/u/') != -1: self.authorName = a.string + (u1, u2, self.authorId, u3) = a['href'].split('/') + logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName)) urls = [] lines = data.split('\n') @@ -92,9 +190,38 @@ class FFNet(FanfictionSiteAdapter): if l.find("»") != -1 and l.find('') != -1: s2 = bs.BeautifulStoneSoup(l) self.storyName = str(s2.find('b').string) + logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName)) elif l.find(" 0: continue @@ -102,6 +229,8 @@ class FFNet(FanfictionSiteAdapter): u = l.decode('utf-8') except UnicodeEncodeError, e: u = l + except: + u = l.encode('ascii', 'xmlcharrefreplace') u = re.sub('&\#[0-9]+;', ' ', u) s2 = bs.BeautifulSoup(u) options = s2.findAll('option') @@ -110,19 +239,69 @@ class FFNet(FanfictionSiteAdapter): title = o.string logging.debug('URL = `%s`, Title = `%s`' % (url, title)) urls.append((url,title)) - if len(urls) == 0: + elif l.find("var chapters") != -1: + self.numChapters = self._getVarValue (l) + logging.debug('self.numChapters=%s' % self.numChapters) + elif l.find("var words") != -1: + self.numWords = self._getVarValue (l) + logging.debug('self.numWords=%s' % self.numWords) + elif l.find("var categoryid") != -1: + categoryid = self._getVarValue (l) + logging.debug('categoryid=%s' % categoryid) + elif l.find("var cat_title") != -1: + self.category = self._getVarValue (l).strip("'") + logging.debug('self.category=%s' % self.category) + self._splitCrossover(self.category) + logging.debug('self.subjects=%s' % self.subjects) + elif l.find("var summary") != -1: + self.storyDescription = self._getVarValue (l).strip("'") + if '&' in self.storyDescription: + s = self.storyDescription.split('&') + logging.debug('s=%s' % s) + self.storyDescription = '' + for ss in s: + if len(self.storyDescription) > 0: + if len(ss) > 4 and 'amp;' in ss[1:4]: + self.storyDescription = self.storyDescription + '&' + ss + else: + self.storyDescription 
= self.storyDescription + '&' + ss + else: + self.storyDescription = ss + logging.debug('self.storyDescription=%s' % self.storyDescription) + elif l.find("var datep") != -1: + dateps = self._getVarValue (l) + self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5]) + logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S")) + elif l.find("var dateu") != -1: + dateus = self._getVarValue (l) + self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5]) + logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S")) + + if len(urls) <= 0: # no chapters found, try url by itself. urls.append((self.url,self.storyName)) + + self.uuid = 'urn:uuid:' + self.host + '-a.' + self.authorId + '-s.' + self.storyId + self.authorURL = 'http://' + self.host + '/u/' + self.authorId + logging.debug('self.uuid=%s' % self.uuid) + + #logging.debug('urls=%s' % urls) return urls def getText(self, url): + time.sleep( 2.0 ) data = self._fetchUrl(url) + lines = data.split('\n') + + textbuf = '' + emit = False + olddata = data try: data = data.decode('utf8') except: data = olddata - + try: soup = bs.BeautifulStoneSoup(data) except: @@ -131,23 +310,121 @@ class FFNet(FanfictionSiteAdapter): div = soup.find('div', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit (20) return '' return div.__str__('utf8') - + def setLogin(self, login): self.login = login def setPassword(self, password): self.password = password - def getStoryName(self): - return self.storyName + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getOutputName(self): + logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName)) + return self.outputName def getAuthorName(self): + 
logging.debug('self.authorName=%s' % self.authorName) return self.authorName + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryName(self): + logging.debug('self.storyName=%s' % self.storyName) + return self.storyName + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.subjects) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def
getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getPrintableUrl(self, url): + pass + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + class FFA_UnitTests(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) diff --git a/fictionalley.py b/fictionalley.py index 884720fd..20763bf9 100644 --- a/fictionalley.py +++ b/fictionalley.py @@ -12,13 +12,20 @@ import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs import time as time +import datetime from adapter import * class FictionAlley(FanfictionSiteAdapter): def __init__(self, url): self.url = url - self.host = up.urlparse(url).netloc + parsedUrl = up.urlparse(url) + self.host = parsedUrl.netloc + self.path = parsedUrl.path + + logging.debug('self.host=%s' % self.host) + logging.debug('self.path=%s' % self.path) + cookieproc = u2.HTTPCookieProcessor() # FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff. 
@@ -35,6 +42,36 @@ class FictionAlley(FanfictionSiteAdapter): rfc2109=False) cookieproc.cookiejar.set_cookie(cookie) self.opener = u2.build_opener(cookieproc) + + ss = self.path.split('/') + + self.storyDescription = 'Fanfiction Story' + self.authorId = '' + self.authorURL = '' + self.storyId = '' + if len(ss) > 2 and ss[1] == 'authors': + self.authorId = ss[2] + self.authorURL = 'http://' + self.host + '/authors/' + self.authorId + if len(ss) > 3: + self.storyId = ss[3].replace ('.html','') + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + def requiresLogin(self, url = None): return False @@ -48,31 +85,147 @@ class FictionAlley(FanfictionSiteAdapter): def setPassword(self, password): self.password = password + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + + def _processChapterHeaders(self, div): + brs = div.findAll ('br') + for br in brs: + keystr='' + valstr='' + if len(br.contents) > 2: + keystr = br.contents[1] + if keystr is not None: + strs = re.split ("<[^>]+>", str(keystr)) + keystr='' + for s in strs: + keystr = keystr + s + valstr = br.contents[2].strip(' ') + if keystr is not None: + if keystr == 'Rating:': + self.storyRating = valstr + 
logging.debug('self.storyRating=%s' % self.storyRating) + elif keystr == 'Genre:': + self.genre = valstr + logging.debug('self.genre=%s' % self.genre) + s2 = valstr.split(', ') + for ss2 in s2: + self._addSubject(ss2) + logging.debug('self.subjects=%s' % self.subjects) + elif keystr == 'Main Character(s):': + s2 = valstr.split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif keystr == 'Summary:': + self.storyDescription = valstr + logging.debug('self.storyDescription=%s' % self.storyDescription) + + def extractIndividualUrls(self): data = self.opener.open(self.url).read() + + # There is some usefull information in the headers of the first chapter page.. + data = data.replace('','').replace('','') soup = bs.BeautifulStoneSoup(data) # Get title from , remove before '-'. title = soup.find('title').string self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","") + self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId - links = soup.findAll('a', { 'class' : 'chapterlink' } ) + links = soup.findAll('li') + # If it is decided that we really do care about number of words.. It's only available on the author's page.. + #d0 = self.opener.open(self.authorURL).read() + #soupA = bs.BeautifulStoneSoup(d0) + #dls = soupA.findAll('dl') + #logging.debug('dls=%s' % dls) + + self.numChapters = 0; result = [] if len(links) == 0: + # Be aware that this means that the user has entered the {STORY}01.html + # We will not have valid Publised and Updated dates. User should enter + # the {STORY}.html instead. We should force that instead of this. 
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'}) self.authorName = breadcrumbs.a.string.replace("'s Fics","") result.append((self.url,self.storyName)) + #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName)) + self.numChapters = self.numChapters + 1; + div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'}) + if div is not None: + self._processChapterHeaders(div) else: author = soup.find('h1', {'class' : 'title'}) self.authorName = author.a.string - for a in links: - url = a['href'] - title = a.string - result.append((url,title)) + summary = soup.find('div', {'class' : 'summary'}) + ss = summary.contents + if len(ss) > 1: + ss1 = ss[0].split(': ') + if len(ss1) > 1 and ss1[0] == 'Rating': + self.storyRating = ss1[1] + logging.debug('self.storyRating=%s' % self.storyRating) + self.storyDescription = str(ss[1]).replace("<br>","").replace("</br>","").replace('\n','') + logging.debug('self.storyDescription=%s' % self.storyDescription) + + for li in links: + a = li.find('a', {'class' : 'chapterlink'}) + s = li.contents + if a is not None: + url = a['href'] + title = a.string + result.append((url,title)) + #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title)) + if self.numChapters == 0: + # fictionalley uses full URLs in chapter list. + d1 = self.opener.open(url).read() + + # find <!-- headerstart --> & <!-- headerend --> and + # replaced with matching div pair for easier parsing. + # Yes, it's an evil kludge, but what can ya do? Using + # something other than div prevents soup from pairing + # our div with poor html inside the story text. 
+ d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>') + sop = bs.BeautifulStoneSoup(d1) + + div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'}) + if div is not None: + self._processChapterHeaders(div) + + self.numChapters = self.numChapters + 1 + if len(s) > 1: + datestr='' + ss2 = s[1].replace('\n','').replace('(','').split(' ') + if len(ss2) > 2 and ss2[0] == 'Posted:': + datestr = ss2[1] + ' ' + ss2[2] + tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S"))) + if self.numChapters == 1: + self.storyPublished = tmpdate + self.storyUpdated = tmpdate + logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated)) + else: + logging.debug('li chapterlink not found! li=%s' % li) - #print('Story "%s" by %s' % (self.storyName, self.authorName)) + + print('Story "%s" by %s' % (self.storyName, self.authorName)) + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) return result @@ -82,6 +235,9 @@ class FictionAlley(FanfictionSiteAdapter): def getAuthorName(self): return self.authorName + def getOutputName(self): + return self.outputName + def getText(self, url): # fictionalley uses full URLs in chapter list. 
data = self.opener.open(url).read() @@ -97,10 +253,96 @@ class FictionAlley(FanfictionSiteAdapter): div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return '<html/>' - return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div') + + html = soup.findAll('html') + if len(html) > 1: + return html[1].__str__('utf8') + else: + return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div') + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.subjects) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return
self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + def getPrintableUrl(self, url): return url @@ -114,6 +356,15 @@ class FictionAlley(FanfictionSiteAdapter): login = dict(login = 'name', password = 'pass') other = dict(submit = 'Log In', remember='yes') return (login, other) + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + if __name__ == '__main__': diff --git a/ficwad.py b/ficwad.py index 28b71584..133d424a 100644 --- a/ficwad.py +++ b/ficwad.py @@ -12,6 +12,8 @@ import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs import logging +import time +import datetime from adapter import * @@ -32,7 +34,44 @@ class FicWad(FanfictionSiteAdapter): def setPassword(self, password): self.password = password + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + 
self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'PG' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + data = u2.urlopen(self.url).read() soup = bs.BeautifulStoneSoup(data) @@ -40,50 +79,254 @@ class FicWad(FanfictionSiteAdapter): crumbtrail = story.find('h3') # the only h3 ficwad uses. allAhrefs = crumbtrail.findAll('a') # last of crumbtrail - self.storyName = allAhrefs[-1].string.strip() + storyinfo = allAhrefs[-1] + (u0, u1, storyid) = storyinfo['href'].split('/') + if u1 == "story": + # This page does not have the correct information on it.. Need to get the Story Title Page + logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid)) + self.url = 'http://' + self.host + '/' + u1 + '/' + storyid + data = u2.urlopen(self.url).read() + soup = bs.BeautifulStoneSoup(data) + + story = soup.find('div', {'id' : 'story'}) + crumbtrail = story.find('h3') # the only h3 ficwad uses. + allAhrefs = crumbtrail.findAll('a') + # save chapter name from header in case of one-shot. 
- chaptername = story.find('h4').find('a').string.strip() + storyinfo = story.find('h4').find('a') + (u0, u1, self.storyId) = storyinfo['href'].split('/') + self.storyName = storyinfo.string.strip() + self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId + + logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName)) author = soup.find('span', {'class' : 'author'}) self.authorName = str(author.a.string) + (u0, u1,self.authorId) = author.a['href'].split('/') + self.authorURL = 'http://' + self.host + author.a['href'] + logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId)) - select = soup.find('select', { 'name' : 'goto' } ) + description = soup.find('blockquote', {'class' : 'summary'}) + if description is not None: + self.storyDescription = str(description.p.string) + logging.debug('self.storyDescription=%s' % self.storyDescription) + + meta = soup.find('p', {'class' : 'meta'}) + if meta is not None: + s = str(meta).replace('\n',' ').replace('\t','').split(' - ') + logging.debug('meta.s=%s' % s) + for ss in s: + s1 = ss.replace(' ','').split(':') + #logging.debug('meta.s.s1=%s' % s1) + if len(s1) > 1: + s2 = re.split ('<[^>]+>', s1[0]) + #logging.debug('meta.s.s1.s2=%s' % s2) + if len(s2) > 1: + s1[0] = s2[1] + skey = s1[0].strip() + #logging.debug('Checking = %s' % skey) + if skey == 'Category': + soup1 = bs.BeautifulStoneSoup(s1[1]) + allAs = soup1.findAll('a') + for a in allAs: + if self.category == 'Category': + self.category = str(a.string) + logging.debug('self.category=%s' % self.category) + self._addSubject(self.category) + logging.debug('self.subjects=%s' % self.subjects) + elif skey == 'Rating': + self.storyRating = s1[1] + logging.debug('self.storyRating=%s' % self.storyRating) + elif skey == 'Genres': + self.genre = s1[1] + logging.debug('self.genre=%s' % self.genre) + s2 = s1[1].split(', ') + for ss2 in s2: + self._addSubject(ss2) + 
logging.debug('self.subjects=%s' % self.subjects) + elif skey == 'Characters': + s2 = s1[1].split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif skey == 'Chapters': + self.numChapters = s1[1] + logging.debug('self.numChapters=%s' % self.numChapters) + elif skey == 'Warnings': + logging.debug('Warnings=%s' % s1[1]) + elif skey == 'Published': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d"))) + logging.debug('self.storyPublished=%s' % self.storyPublished) + elif skey == 'Updated': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d"))) + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + else: + s3 = re.split ('<[^>]+>', s1[0]) + #logging.debug('meta.s.s1.s3=%s' % s3) + if len(s3) > 1: + s1[0] = s3[0] + s4 = s1[0].split('w') + #logging.debug('meta.s.s1.s4=%s' % s4) + if len(s4) > 1 and s4[1] == 'ords': + self.numWords = s4[0] + logging.debug('self.numWords=%s' % self.numWords) + + + print('Story "%s" by %s' % (self.storyName, self.authorName)) result = [] - if select is None: - # Single chapter storys don't have title in crumbtrail, just 'chapter' title in h4. - self.storyName = chaptername - # no chapters found, try url by itself. - result.append((self.url,self.storyName)) - else: - allOptions = select.findAll('option') - for o in allOptions: - url = 'http://' + self.host + o['value'] - title = o.string - # ficwad includes 'Story Index' in the dropdown of chapters, - # but it's not a real chapter. 
- if title != "Story Index": - result.append((url,title)) + ii = 1 + + storylist = soup.find('ul', {'id' : 'storylist'}) + if storylist is not None: + allH4s = storylist.findAll('h4') + #logging.debug('allH4s=%s' % allH4s) + + if allH4s is not None: + for h4 in allH4s: + chapterinfo = h4.find('a') + #logging.debug('Chapter1=%s' % chapterinfo) + url = 'http://' + self.host + chapterinfo['href'] + title = chapterinfo.string.strip() + #logging.debug('Chapter=%s, %s' % (url, title)) + # ficwad includes 'Story Index' in the dropdown of chapters, + # but it's not a real chapter. + if title != "Story Index": + logging.debug('Chapter[%s]=%s, %s' % (ii, url, title)) + result.append((url,title)) + ii = ii+1 + else: + logging.debug('Skipping Story Index. URL %s' % url) + + if ii == 1: + select = soup.find('select', { 'name' : 'goto' } ) + + if select is None: + result.append((self.url,self.storyName)) + logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName)) + else: + allOptions = select.findAll('option') + for o in allOptions: + url = 'http://' + self.host + o['value'] + title = o.string + # ficwad includes 'Story Index' in the dropdown of chapters, + # but it's not a real chapter. + if title != "Story Index": + logging.debug('Chapter[%s]=%s, %s' % (ii, url, title)) + result.append((url,title)) + ii = ii+1 + else: + logging.debug('Skipping Story Index. URL %s' % url) + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' 
+ self.storyId + logging.debug('self.uuid=%s' % self.uuid) + return result def getStoryName(self): return self.storyName + def getOutputName(self): + return self.outputName + def getAuthorName(self): return self.authorName def getText(self, url): + if url.find('http://') == -1: + url = 'http://' + self.host + '/' + url + data = u2.urlopen(url).read() soup = bs.BeautifulStoneSoup(data) div = soup.find('div', {'id' : 'storytext'}) if None == div: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return '<html/>' return div.__str__('utf8') + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return 
self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + def getPrintableUrl(self, url): return url @@ -98,6 +341,15 @@ class FicWad(FanfictionSiteAdapter): other = dict(submit = 'Log In', remember='yes') return (login, other) + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + + if __name__ == '__main__': url = 'http://www.ficwad.com/story/14536' diff --git a/hpfiction.py b/hpfiction.py index 75cb4597..9f6cd467 100644 --- a/hpfiction.py +++ b/hpfiction.py @@ -15,6 +15,8 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from constants import * from adapter import * @@ -32,8 +34,37 @@ class HPFiction(FanfictionSiteAdapter): self.host = parsedUrl.netloc self.path = parsedUrl.path + logging.debug('self.url=%s' % self.url) + logging.debug('self.host=%s' % self.host) + logging.debug('self.path=%s' % self.path) + self.opener = u2.build_opener(u2.HTTPCookieProcessor()) + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.authorURL = '' + (u1, self.storyId) = 
self.url.split('=') + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.subjects.append ('Harry Potter') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'K' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + logging.debug("Created HPFiction: url=%s" % (self.url)) def _getLoginScript(self): @@ -45,23 +76,116 @@ class HPFiction(FanfictionSiteAdapter): def performLogin(self, url = None): return True + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True + + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): data = self.opener.open(self.url).read() soup = bs.BeautifulSoup(data) links = soup.findAll('a') + def_chapurl = '' + def_chaptitle = '' for a in links: if a['href'].find('psid') != -1: self.storyName = a.string + logging.debug('self.storyName=%s' % self.storyName) elif a['href'].find('viewuser.php') != -1: self.authorName = a.string + self.authorURL = 'http://' + self.host + '/' + a['href'] + (u1, self.authorId) = a['href'].split('=') + logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId)) + elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0: + def_chapurl = 'http://' + self.host + '/viewstory.php' + str(a['href']) + 
def_chaptitle = a.string + logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle)) + + centers = soup.findAll('center') + for center in centers: + tds = center.findAll ('td') + if tds is not None and len(tds) > 0: + for td in tds: + s = re.split ("<[^>]+>", str(td).replace('\n','').replace(' ',' ')) + logging.debug('s=%s' % s) + ii = 0 + ll = len(s) + sss = '' + while ii < ll - 1: + if s[ii] is not None and len(s[ii]) > 0: + if s[ii] == 'Rating:': + self.storyRating = s[ii+1] + logging.debug('self.storyRating=%s' % self.storyRating) + ii = ii + 2 + elif s[ii] == 'Chapters:': + self.numChapters = s[ii+1] + logging.debug('self.numChapters=%s' % self.numChapters) + ii = ii + 2 + elif s[ii] == 'Characters:': + s2 = s[ii+1].split(', ') + for ss2 in s2: + self._addCharacter(ss2) + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + ii = ii + 2 + elif s[ii] == 'Genre(s):': + self.genre = s[ii+1] + logging.debug('self.genre=%s' % self.genre) + s2 = s[ii+1].split(', ') + for ss2 in s2: + self._addSubject(ss2) + logging.debug('self.subjects=%s' % self.subjects) + ii = ii + 2 + elif s[ii] == 'Status:': + if s[ii+1].strip(' ') == "Work In Progress": + self.storyStatus = 'In-Progress' + else: + self.storyStatus = 'Completed' + ii = ii + 2 + elif s[ii] == 'First Published:': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d"))) + logging.debug('self.storyPublished=%s' % self.storyPublished) + ii = ii + 2 + elif s[ii] == 'Last Updated:': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d"))) + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + ii = ii + 2 + elif s[ii] == 'Last Published Chapter:': + ii = ii + 2 + elif s[ii] == 'Pairings:': + ii = ii + 2 + elif s[ii] == 'Warnings:': + ii = ii + 2 + else: + sss = sss + ' ' + s[ii] + ii = ii + 1 + else: + ii = ii + 1 + self.storyDescription = sss + 
logging.debug('self.storyDescription=%s' % self.storyDescription) urls = [] + self.outputName = self.storyName.replace(" ", "_") + '-hp_' + self.storyId + select = soup.find('select', {'name' : 'chapterid'}) if select is None: # no chapters found, try url by itself. - urls.append((self.url,self.storyName)) + if len(def_chapurl) > 0: + urls.append((def_chapurl, def_chaptitle)) + else: + urls.append((self.url,self.storyName)) else: for o in select.findAll('option'): if 'value' in o._getAttrMap(): @@ -69,11 +193,18 @@ class HPFiction(FanfictionSiteAdapter): title = o.string if title != "Story Index": urls.append((url,title)) + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + return urls def getStoryName(self): return self.storyName + def getOutputName(self): + return self.outputName + def getAuthorName(self): return self.authorName @@ -84,9 +215,100 @@ class HPFiction(FanfictionSiteAdapter): divtext = soup.find('div', {'id' : 'fluidtext'}) if None == divtext: logging.error("Error downloading Chapter: %s" % url) - exit(1) + exit(20) return divtext.__str__('utf8') + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return self.language 
+ + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries + + + class FF_UnitTests(unittest.TestCase): def setUp(self): logging.basicConfig(level=logging.DEBUG) diff --git a/output.py b/output.py index 1700bfe7..9ffb1503 100644 --- a/output.py +++ b/output.py @@ -26,6 +26,7 @@ from constants import * import html2text +import datetime class FanficWriter: @@ -41,8 +42,8 @@ class FanficWriter: class TextWriter(FanficWriter): htmlWriter = None - 
def __init__(self, base, name, author, inmemory=False, compress=False): - self.htmlWriter = HTMLWriter(base, name, author, True, False) + def __init__(self, base, adapter, inmemory=False, compress=False): + self.htmlWriter = HTMLWriter(base, adapter, True, False) def writeChapter(self, index, title, text): self.htmlWriter.writeChapter(index, title, text) @@ -57,12 +58,13 @@ class TextWriter(FanficWriter): class HTMLWriter(FanficWriter): body = '' - def __init__(self, base, name, author, inmemory=False, compress=False): + def __init__(self, base, adapter, inmemory=False, compress=False): self.basePath = base - self.storyTitle = removeEntities(name) - self.name = makeAcceptableFilename(name) - self.fileName = self.basePath + '/' + self.name + '.html' - self.authorName = removeEntities(author) + self.storyTitle = removeEntities(adapter.getStoryName()) + self.name = makeAcceptableFilename(adapter.getOutputName()) + self.fileName = self.basePath + '/' + self.name + '.html' + self.authorName = removeEntities(adapter.getAuthorName()) + self.adapter = adapter self.inmemory = inmemory @@ -131,14 +133,14 @@ class EPubFanficWriter(FanficWriter): for f in self.files: self.files[f].close() - def __init__(self, base, name, author, inmemory=False, compress=True): + def __init__(self, base, adapter, inmemory=False, compress=True): self.basePath = base - self.storyTitle = removeEntities(name) - self.name = makeAcceptableFilename(name) + self.storyTitle = removeEntities(adapter.getStoryName()) + self.name = makeAcceptableFilename(adapter.getOutputName()) self.directory = self.basePath + '/' + self.name - self.authorName = removeEntities(author) - + self.authorName = removeEntities(adapter.getAuthorName()) self.inmemory = inmemory + self.adapter = adapter self.files = {} self.chapters = [] @@ -226,17 +228,50 @@ class EPubFanficWriter(FanficWriter): tocFilePath = "OEBPS/toc.ncx" # toc = open(tocFilePath, 'w') # print >> toc, TOC_START % self.storyTitle - self._writeFile(tocFilePath, 
TOC_START % self.storyTitle) + self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle)) + + published = self.adapter.getStoryPublished().strftime("%Y-%m-%d") + createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S") + created = self.adapter.getStoryCreated().strftime("%Y-%m-%d") + updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d") + calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S") + + ### writing content -- title page + titleFilePath = "OEBPS/title_page.xhtml" + self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryDescription())) + ### writing content -- opf file opfFilePath = "OEBPS/content.opf" - + # opf = open(opfFilePath, 'w') - self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn)) + self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, self.adapter.getStoryDescription())) + + i = 0 + subjs = [] + subjs = self.adapter.getSubjects() + for subj in subjs: + self._writeFile(opfFilePath, CONTENT_SUBJECT % subj) + i = i + 1 + if (i <= 0): + self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction") + + self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating())) # print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName) ids = [] - i = 1 + i = 0 + + t = 
"Title Page" + f = "title_page.xhtml" + chapterId = "Title Page" + self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f)) + self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f)) + + ids.append(chapterId) + + i = i + 1 + for t,f in self.chapters: chapterId = "chapter%04d" % i diff --git a/twilighted.py b/twilighted.py index a7e77a53..f7654041 100644 --- a/twilighted.py +++ b/twilighted.py @@ -11,119 +11,360 @@ import urllib2 as u2 import urlparse as up import BeautifulSoup as bs import htmlentitydefs as hdefs +import time +import datetime from adapter import * import twipassword class Twilighted(FanfictionSiteAdapter): - def __init__(self, url): - self.url = url - parsedUrl = up.urlparse(url) - self.host = parsedUrl.netloc - self.path = parsedUrl.path - self.opener = u2.build_opener(u2.HTTPCookieProcessor()) - self.password=twipassword.password - self.login='sigizmund' - logging.debug("Created Twilighted: url=%s" % (self.url)) - - - def requiresLogin(self, url = None): - # potionsandsnitches.net doesn't require login. 
- if self.host == 'potionsandsnitches.net': - return False - else: - return True - - def performLogin(self, url = None): - data = {} - - data['penname'] = self.login - data['password'] = self.password - data['cookiecheck'] = '1' - data['submit'] = 'Submit' - - urlvals = u.urlencode(data) - loginUrl = 'http://' + self.host + self._getLoginScript() - logging.debug("Will now login to URL %s" % loginUrl) - - req = self.opener.open(loginUrl, urlvals) - - d = req.read().decode('utf-8') - - if self.reqLoginData(d) : - return False - else: - return True - - - def setLogin(self, login): - self.login = login - - def setPassword(self, password): - self.password = password - - def extractIndividualUrls(self): - data = self.opener.open(self.url).read() + def __init__(self, url): + self.url = url + parsedUrl = up.urlparse(url) + self.host = parsedUrl.netloc + self.path = parsedUrl.path + self.opener = u2.build_opener(u2.HTTPCookieProcessor()) + self.password=twipassword.password + self.login='sigizmund' + self.storyDescription = 'Fanfiction Story' + self.authorId = '0' + self.authorURL = '' + self.storyId = '0' + self.storyPublished = datetime.date(1970, 01, 31) + self.storyCreated = datetime.datetime.now() + self.storyUpdated = datetime.date(1970, 01, 31) + self.languageId = 'en-UK' + self.language = 'English' + self.subjects = [] + self.subjects.append ('fanfiction') + self.subjects.append ('Twilight') + self.publisher = self.host + self.numChapters = 0 + self.numWords = 0 + self.genre = 'FanFiction' + self.category = 'Category' + self.storyStatus = 'In-Progress' + self.storyRating = 'PG' + self.storyUserRating = '0' + self.storyCharacters = [] + self.storySeries = '' + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' 
+ self.storyId + logging.debug('self.uuid=%s' % self.uuid) - if self.reqLoginData(data): - self.performLogin() - data = self.opener.open(self.url).read() - if self.reqLoginData(data): - return None + logging.debug("Created Twilighted: url=%s" % (self.url)) + + + def requiresLogin(self, url = None): + # potionsandsnitches.net doesn't require login. + if self.host == 'potionsandsnitches.net': + return False + else: + return True + + def performLogin(self, url = None): + data = {} - soup = bs.BeautifulStoneSoup(data) - - title = soup.find('title').string - self.storyName = title.split(' by ')[0].strip() - self.authorName = title.split(' by ')[1].strip() - - select = soup.find('select', { 'name' : 'chapter' } ) - - result = [] - if select is None: - # no chapters found, try url by itself. - result.append((self.url,self.storyName)) - else: - allOptions = select.findAll('option') - for o in allOptions: - url = self.url + "&chapter=%s" % o['value'] - title = o.string - result.append((url,title)) - - return result - - def getStoryName(self): - return self.storyName - - def getAuthorName(self): - return self.authorName - - def getText(self, url): - if url.find('http://') == -1: - url = 'http://' + self.host + '/' + url - - logging.debug('Getting data from: %s' % url) - - data = self.opener.open(url).read() + data['penname'] = self.login + data['password'] = self.password + data['cookiecheck'] = '1' + data['submit'] = 'Submit' - soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES) + urlvals = u.urlencode(data) + loginUrl = 'http://' + self.host + self._getLoginScript() + logging.debug("Will now login to URL %s" % loginUrl) + + req = self.opener.open(loginUrl, urlvals) + + d = req.read().decode('utf-8') + + if self.reqLoginData(d) : + return False + else: + return True - div = soup.find('div', {'id' : 'story'}) - if None == div: - return '<html/>' + def setLogin(self, login): + self.login = login - return div.__str__('utf8') + def 
setPassword(self, password): + self.password = password - def _getLoginScript(self): - return '/user.php?action=login' + def _addSubject(self, subject): + subj = subject.upper() + for s in self.subjects: + if s.upper() == subj: + return False + self.subjects.append(subject) + return True - def reqLoginData(self, data): - if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1: - return True - else: - return False + def _addCharacter(self, character): + chara = character.upper() + for c in self.storyCharacters: + if c.upper() == chara: + return False + self.storyCharacters.append(character) + return True + def extractIndividualUrls(self): + data = self.opener.open(self.url).read() + + if self.reqLoginData(data): + self.performLogin() + data = self.opener.open(self.url).read() + if self.reqLoginData(data): + return None + + soup = bs.BeautifulStoneSoup(data) + + title = soup.find('title').string + self.storyName = title.split(' by ')[0].strip() + self.authorName = title.split(' by ')[1].strip() + self.outputName = self.storyName.replace(" ", "_") + + select = soup.find('select', { 'name' : 'chapter' } ) + + result = [] + if select is None: + # no chapters found, try url by itself. 
+ result.append((self.url,self.storyName)) + else: + allOptions = select.findAll('option') + for o in allOptions: + url = self.url + "&chapter=%s" % o['value'] + title = o.string + result.append((url,title)) + + url = self.url + "&index=1" + data = self.opener.open(url).read() + lines = data.split('\n') + soup = bs.BeautifulStoneSoup(data) + metas = soup.findAll('meta') + for meta in metas: + if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1: + #logging.debug('Meta: %s' % meta) + if 'content' in meta._getAttrMap(): + s1 = bs.BeautifulStoneSoup(meta['content']) + ps = s1.findAll('p') + if len(ps) > 0: + self.storyDescription = ps[0] + logging.debug('self.storyDescription=%s' % (self.storyDescription)) + else: + divs = meta.findAll('div') + #logging.debug('Divs: %s' % divs) + + for div in divs: + #logging.debug('Div: %s' % div) + if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1: + #logging.debug('Div PAGETITLE: %s' % div) + allA = div.findAll('a') + for a in allA: + if 'href' in a._getAttrMap(): + if a['href'].find('viewstory.php?sid=') != -1: + str1 = a.string + (vs, self.storyId) = a['href'].split('=') + logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName)) + self.outputName = self.outputName + "-tw_" + self.storyId + logging.debug('self.outputName=%s' % self.outputName) + if a['href'].find('viewuser.php?uid=') != -1: + str1 = a.string + (vs, self.authorId) = a['href'].split('=') + logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName)) + self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId + logging.debug('self.authorURL=%s' % self.authorURL) + if 'class' in div._getAttrMap() and div['class'].find('content') != -1: + #logging.debug('Div CONTENT: %s' % div) + brs = div.findAll('br') + for br in brs: + buf = unicode(br).encode('utf-8') + strs = re.split ('<[^>]+>', buf) + #logging.debug('BUF: %s' % strs) + ii = 2 + stlen = len(strs) + while 
stlen > ii+1: + if len(strs[ii]) == 0: + ii = ii+1 + continue + if strs[ii] == 'Categories:': + ii = ii+1 + while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1: + if strs[ii] != ' ' and strs[ii] != ', ': + if self.category == 'Category': + self.category = strs[ii].strip(' ') + self._addSubject(strs[ii].strip(' ')) + ii = ii+1 + logging.debug('self.subjects=%s' % self.subjects) + if strs[ii] == 'Characters: ': + ii = ii+1 + while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1: + if strs[ii] != ' ' and strs[ii] != ', ': + self._addCharacter(strs[ii].strip(' ')) + ii = ii+1 + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + elif strs[ii] == 'Completed:': + if strs[ii+1].strip(' ') == "No": + self.storyStatus = 'In-Progress' + else: + self.storyStatus = 'Completed' + ii = ii+2 + logging.debug('self.storyStatus=%s' % self.storyStatus) + elif strs[ii] == 'Rated:': + self.storyRating = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.storyRating=%s' % self.storyRating) + elif strs[ii] == 'Series:': + self.storySeries = strs[ii+1].strip(' ') + if self.storySeries == 'None': + self.storySeries = '' + ii = ii+2 + logging.debug('self.storySeries=%s' % self.storySeries) + elif strs[ii] == 'Chapters: ': + self.numChapters = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.numChapters=%s' % self.numChapters) + elif strs[ii] == 'Word count:': + self.numWords = strs[ii+1].strip(' ') + ii = ii+2 + logging.debug('self.numWords=%s' % self.numWords) + elif strs[ii] == ' Published: ': + self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y"))) + ii = ii+2 + logging.debug('self.storyPublished=%s' % self.storyPublished) + elif strs[ii] == 'Updated:': + self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y"))) + ii = ii+2 + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + else: + 
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1])) + ii = ii+2 + + self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId + logging.debug('self.uuid=%s' % self.uuid) + + return result + + def getStoryName(self): + return self.storyName + + def getOutputName(self): + return self.outputName + + def getAuthorName(self): + return self.authorName + + def getText(self, url): + if url.find('http://') == -1: + url = 'http://' + self.host + '/' + url + + logging.debug('Getting data from: %s' % url) + + data = self.opener.open(url).read() + + soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES) + + div = soup.find('div', {'id' : 'story'}) + + if None == div: + return '<html/>' + + return div.__str__('utf8') + + def _getLoginScript(self): + return '/user.php?action=login' + + def reqLoginData(self, data): + if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1: + return True + else: + return False + + def getStoryURL(self): + logging.debug('self.url=%s' % self.url) + return self.url + + def getAuthorURL(self): + logging.debug('self.authorURL=%s' % self.authorURL) + return self.authorURL + + def getUUID(self): + logging.debug('self.uuid=%s' % self.uuid) + return self.uuid + + def getStoryDescription(self): + logging.debug('self.storyDescription=%s' % self.storyDescription) + return self.storyDescription + + def getStoryPublished(self): + logging.debug('self.storyPublished=%s' % self.storyPublished) + return self.storyPublished + + def getStoryCreated(self): + self.storyCreated = datetime.datetime.now() + logging.debug('self.storyCreated=%s' % self.storyCreated) + return self.storyCreated + + def getStoryUpdated(self): + logging.debug('self.storyUpdated=%s' % self.storyUpdated) + return self.storyUpdated + + def getLanguage(self): + logging.debug('self.language=%s' % self.language) + return 
self.language + + def getLanguageId(self): + logging.debug('self.languageId=%s' % self.languageId) + return self.languageId + + def getSubjects(self): + logging.debug('self.subjects=%s' % self.authorName) + return self.subjects + + def getPublisher(self): + logging.debug('self.publisher=%s' % self.publisher) + return self.publisher + + def getNumChapters(self): + logging.debug('self.numChapters=%s' % self.numChapters) + return self.numChapters + + def getNumWords(self): + logging.debug('self.numWords=%s' % self.numWords) + return self.numWords + + def getAuthorId(self): + logging.debug('self.authorId=%s' % self.authorId) + return self.authorId + + def getStoryId(self): + logging.debug('self.storyId=%s' % self.storyId) + return self.storyId + + def getCategory(self): + logging.debug('self.category=%s' % self.category) + return self.category + + def getGenre(self): + logging.debug('self.genre=%s' % self.genre) + return self.genre + + def getStoryStatus(self): + logging.debug('self.storyStatus=%s' % self.storyStatus) + return self.storyStatus + + def getStoryRating(self): + logging.debug('self.storyRating=%s' % self.storyRating) + return self.storyRating + + def getStoryUserRating(self): + logging.debug('self.storyUserRating=%s' % self.storyUserRating) + return self.storyUserRating + + def getStoryCharacters(self): + logging.debug('self.storyCharacters=%s' % self.storyCharacters) + return self.storyCharacters + + def getStorySeries(self): + logging.debug('self.storySeries=%s' % self.storySeries) + return self.storySeries class Twilighted_UnitTests(unittest.TestCase): def setUp(self):