diff --git a/adapter.py b/adapter.py
index bf37a017..246f4177 100644
--- a/adapter.py
+++ b/adapter.py
@@ -29,11 +29,80 @@ class FanfictionSiteAdapter:
def setPassword(self, password):
pass
- def getStoryName(self):
+ def getStoryURL(self):
+ pass
+
+ def getUUID(self):
+ pass
+
+ def getOutputName(self):
+ pass
+
+ def getAuthorURL(self):
+ pass
+
+ def getAuthorId(self):
pass
def getAuthorName(self):
pass
+ def getStoryId(self):
+ pass
+
+ def getStoryName(self):
+ pass
+
+ def getStoryDescription(self):
+ pass
+
+ def getStoryCreated(self):
+ pass
+
+ def getStoryPublished(self):
+ pass
+
+ def getStoryUpdated(self):
+ pass
+
+ def getStorySeries(self):
+ pass
+
+ def getLanguage(self):
+ pass
+
+ def getLanguageId(self):
+ pass
+
+ def getSubjects(self):
+ pass
+
+ def getCharacters(self):
+ pass
+
+ def getPublisher(self):
+ pass
+
+ def getNumChapters(self):
+ pass
+
+ def getNumWords(self):
+ pass
+
+ def getCategory(self):
+ pass
+
+ def getGenre(self):
+ pass
+
+ def getStoryStatus(self):
+ pass
+
+ def getStoryRating(self):
+ pass
+
+ def getStoryUserRating(self):
+ pass
+
def getPrintableUrl(self, url):
- pass
\ No newline at end of file
+ pass
diff --git a/constants.py b/constants.py
index e01342d9..6ea1f086 100644
--- a/constants.py
+++ b/constants.py
@@ -15,6 +15,9 @@ h6 { text-align: center; }
padding:0px;
}
.center {text-align: center;}
+.cover {text-align: center;}
+.full {width: 100%; }
+.quarter {width: 25%; }
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
@@ -22,6 +25,37 @@ h6 { text-align: center; }
MIMETYPE = '''application/epub+zip'''
+TITLE_PAGE = '''
+%s - %s
+
+
+
+| |
+ |
+
| |
+ |
+
| Category: | %s |
+
| Genre: | %s |
+
| Status: | %s |
+
| Published: | %s |
+
| Updated: | %s |
+
| Packaged: | %s |
+
| Rating Age/User: | %s / %s |
+
| Chapters/Words: | %s / %s |
+
| URL: | |
+
| Summary: |
+
| %s |
+
| |
+ |
+
| |
+ |
+
+
+'''
+
CONTAINER = '''
@@ -30,42 +64,60 @@ CONTAINER = '''
'''
-CONTENT_START = '''
+CONTENT_START = '''
-
+ unique-identifier="fanficdownloader-uuid">
+
+ BookID-Epub-%s
%s
%s
- en-UK
+ fanficdownloader [http://fanficdownloader.googlecode.com]
+ %s
- fanfiction
- sgzmd
- %s
+ %s
+ %s
+ %s
+
+ %s
+'''
+
+CONTENT_END_METADATA = ''' %s
+ %s
+ %s
+ %s
+ FanFiction
+
'''
-CONTENT_ITEM = '''
+CONTENT_SUBJECT = ''' %s
'''
-CONTENT_END_MANIFEST = '''
-
+CONTENT_ITEM = '''
'''
-CONTENT_ITEMREF = '''
+CONTENT_END_MANIFEST = '''
+
'''
-CONTENT_END = '''
+CONTENT_ITEMREF = '''
+'''
+
+CONTENT_END = '''
'''
TOC_START = '''
-
+
@@ -502,3 +554,5 @@ FB2_DESCRIPTION = '''
2.0
'''
+
+HTML_ESC_Definitions = 'HTML_Escape.def'
diff --git a/downaloder.py b/downloader.py
similarity index 91%
rename from downaloder.py
rename to downloader.py
index b8af3abe..f8ca80c6 100644
--- a/downaloder.py
+++ b/downloader.py
@@ -34,6 +34,7 @@ class FanficLoader:
self.inmemory = inmemory
self.compress = compress
self.badLogin = False
+ self.overWrite = True
def getAdapter():
return self.adapter
@@ -48,7 +49,13 @@ class FanficLoader:
raise adapter.LoginRequiredException(self.adapter.url)
urls = self.adapter.extractIndividualUrls()
- self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
+
+ s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format
+ if not self.overWrite and os.path.isfile(s):
+ print >> sys.stderr, "File " + s + " already exists! Skipping!"
+ exit(10)
+
+ self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress)
i = 1
for u,n in urls:
diff --git a/ffnet.py b/ffnet.py
index f3e101fc..7320ec5a 100644
--- a/ffnet.py
+++ b/ffnet.py
@@ -15,6 +15,8 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
+import time
+import datetime
from constants import *
from adapter import *
@@ -40,10 +42,37 @@ class FFNet(FanfictionSiteAdapter):
self.storyName = 'FF.Net story'
self.authorName = 'FF.Net author'
+ self.outputName = 'FF.Net_story'
+ self.storyDescription = 'Fanfiction Story'
+ self.storyCharacters = []
+ self.storySeries = ''
+ self.authorId = '0'
+ self.authorURL = self.path
+ self.storyId = '0'
+ self.storyPublished = datetime.date(1970, 01, 31)
+ self.storyCreated = datetime.datetime.now()
+ self.storyUpdated = datetime.date(1970, 01, 31)
+ self.languageId = 'en-UK'
+ self.language = 'English'
+ self.subjects = []
+ self.subjects.append ('FanFiction')
+ logging.debug('self.subjects=%s' % self.subjects)
+ self.publisher = self.host
+ self.numChapters = 0
+ self.numWords = 0
+ self.genre = 'FanFiction'
+ self.category = 'FF.Net Category'
+ self.storyStatus = 'In-Progress'
+ self.storyRating = 'K'
+ self.storyUserRating = '0'
+ logging.debug('self.path=%s' % self.path)
+
spl = self.path.split('/')
+ logging.debug('spl=%s' % spl)
if len(spl) == 5:
self.path = "/".join(spl[1:-1])
+ self.outputName = spl[4] + '-ffnet_' + spl[2]
if self.path.startswith('/'):
self.path = self.path[1:]
@@ -51,10 +80,14 @@ class FFNet(FanfictionSiteAdapter):
if self.path.endswith('/'):
self.path = self.path[:-1]
+ logging.debug('self.path=%s' % self.path)
+
(s, self.storyId, chapter) = self.path.split('/')
- logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter))
-
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
+
+ logging.debug('self.storyId=%s, chapter=%s, self.outputName=%s' % (self.storyId, chapter, self.outputName))
if not appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
@@ -70,7 +103,70 @@ class FFNet(FanfictionSiteAdapter):
def performLogin(self, url = None):
return True
+
+    def _getVarValue(self, varstr):
+        """Return the right-hand side of a ``var name = value;`` script line.
+
+        Everything after the first '=' is returned, so '=' characters that
+        belong to the value itself are preserved.  One leading space and one
+        trailing ';' are removed; surrounding quotes are left in place for
+        the caller to strip.  Returns '' when the line contains no '='.
+        """
+        # Split on the first '=' only.  Splitting on every '=' and joining
+        # the pieces with '' silently deleted any '=' inside the value.
+        retstr = "".join(varstr.split('=', 1)[1:])
+        if retstr.startswith(' '):
+            retstr = retstr[1:]
+        if retstr.endswith(';'):
+            retstr = retstr[:-1]
+        return retstr
+    def _splitCrossover(self, subject):
+        """Record the fandom(s) named in a category title as subjects.
+
+        A title without "Crossover" is added unchanged.  A title containing
+        "Crossover" adds the subject "Crossover"; if the fandoms are joined
+        by " and ", each fandom is added separately, otherwise the whole
+        title is added.  For example "Harry Potter and Bones Crossover"
+        yields "Crossover", "Harry Potter" and "Bones".  Always returns True.
+        """
+        if "Crossover" not in subject:
+            self._addSubject(subject)
+            return True
+
+        self._addSubject("Crossover")
+        logging.debug('Crossover=%s' % subject)
+        if subject.find(' and ') == -1:
+            self._addSubject(subject)
+            return True
+
+        words = subject.split(' ')
+        logging.debug('words=%s' % words)
+        subj = ''
+        for s in words:
+            # Compare whole words against the separators.  The earlier
+            # substring test (s in "and Crossover") also matched fragments
+            # such as "Cross", "over" or "a", cutting fandom names apart.
+            if s in ('and', 'Crossover'):
+                if len(subj) > 0:
+                    self._addSubject(subj)
+                subj = ''
+            elif len(s) > 0:
+                # Empty words come from doubled spaces; skip them so they
+                # neither split a name nor add stray blanks to it.
+                if len(subj) > 0:
+                    subj = subj + ' '
+                subj = subj + s
+        if len(subj) > 0:
+            self._addSubject(subj)
+        return True
+
+    def _splitGenre(self, subject):
+        """Add every non-empty '/'-separated piece of *subject* as a subject.
+
+        An empty *subject* adds nothing.  Always returns True.
+        """
+        if len(subject) == 0:
+            return True
+        pieces = subject.split('/')
+        logging.debug('words=%s' % pieces)
+        for piece in [p for p in pieces if len(p) > 0]:
+            self._addSubject(piece)
+        return True
+
+    def _addSubject(self, subject):
+        """Append *subject* to self.subjects unless it is already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the subject was appended and False when it was a duplicate.
+        """
+        wanted = subject.upper()
+        for known in self.subjects:
+            if known.upper() == wanted:
+                break
+        else:
+            # Loop finished without a match: this is a new subject.
+            self.subjects.append(subject)
+            return True
+        return False
+
+    def _addCharacter(self, character):
+        """Append *character* to self.storyCharacters unless already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the character was appended and False when it was a duplicate.
+        """
+        needle = character.upper()
+        if any(name.upper() == needle for name in self.storyCharacters):
+            return False
+        self.storyCharacters.append(character)
+        return True
+
def _fetchUrl(self, url):
if not appEngine:
return self.opener.open(url).read().decode('utf-8')
@@ -85,6 +181,8 @@ class FFNet(FanfictionSiteAdapter):
for a in allA:
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
self.authorName = a.string
+ (u1, u2, self.authorId, u3) = a['href'].split('/')
+ logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
urls = []
lines = data.split('\n')
@@ -92,9 +190,38 @@ class FFNet(FanfictionSiteAdapter):
if l.find("»") != -1 and l.find('') != -1:
s2 = bs.BeautifulStoneSoup(l)
self.storyName = str(s2.find('b').string)
+ logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
elif l.find(" 0:
continue
@@ -102,6 +229,8 @@ class FFNet(FanfictionSiteAdapter):
u = l.decode('utf-8')
except UnicodeEncodeError, e:
u = l
+ except:
+ u = l.encode('ascii', 'xmlcharrefreplace')
u = re.sub('&\#[0-9]+;', ' ', u)
s2 = bs.BeautifulSoup(u)
options = s2.findAll('option')
@@ -110,19 +239,69 @@ class FFNet(FanfictionSiteAdapter):
title = o.string
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
urls.append((url,title))
- if len(urls) == 0:
+ elif l.find("var chapters") != -1:
+ self.numChapters = self._getVarValue (l)
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ elif l.find("var words") != -1:
+ self.numWords = self._getVarValue (l)
+ logging.debug('self.numWords=%s' % self.numWords)
+ elif l.find("var categoryid") != -1:
+ categoryid = self._getVarValue (l)
+ logging.debug('categoryid=%s' % categoryid)
+ elif l.find("var cat_title") != -1:
+ self.category = self._getVarValue (l).strip("'")
+ logging.debug('self.category=%s' % self.category)
+ self._splitCrossover(self.category)
+ logging.debug('self.subjects=%s' % self.subjects)
+ elif l.find("var summary") != -1:
+ self.storyDescription = self._getVarValue (l).strip("'")
+ if '&' in self.storyDescription:
+ s = self.storyDescription.split('&')
+ logging.debug('s=%s' % s)
+ self.storyDescription = ''
+ for ss in s:
+ if len(self.storyDescription) > 0:
+ if len(ss) > 4 and 'amp;' in ss[1:4]:
+ self.storyDescription = self.storyDescription + '&' + ss
+ else:
+ self.storyDescription = self.storyDescription + '&' + ss
+ else:
+ self.storyDescription = ss
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+ elif l.find("var datep") != -1:
+ dateps = self._getVarValue (l)
+ self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
+ logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
+ elif l.find("var dateu") != -1:
+ dateus = self._getVarValue (l)
+ self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
+
+ if len(urls) <= 0:
# no chapters found, try url by itself.
urls.append((self.url,self.storyName))
+
+ self.uuid = 'urn:uuid:' + self.host + '-a.' + self.authorId + '-s.' + self.storyId
+ self.authorURL = 'http://' + self.host + '/u/' + self.authorId
+ logging.debug('self.uuid=%s' % self.uuid)
+
+ #logging.debug('urls=%s' % urls)
return urls
def getText(self, url):
+ time.sleep( 2.0 )
data = self._fetchUrl(url)
+ lines = data.split('\n')
+
+ textbuf = ''
+ emit = False
+
olddata = data
try:
data = data.decode('utf8')
except:
data = olddata
-
+
try:
soup = bs.BeautifulStoneSoup(data)
except:
@@ -131,23 +310,121 @@ class FFNet(FanfictionSiteAdapter):
div = soup.find('div', {'id' : 'storytext'})
if None == div:
logging.error("Error downloading Chapter: %s" % url)
- exit(1)
+ exit (20)
return ''
return div.__str__('utf8')
-
+
def setLogin(self, login):
self.login = login
def setPassword(self, password):
self.password = password
- def getStoryName(self):
- return self.storyName
+ def getStoryURL(self):
+ logging.debug('self.url=%s' % self.url)
+ return self.url
+
+ def getUUID(self):
+ logging.debug('self.uuid=%s' % self.uuid)
+ return self.uuid
+
+ def getOutputName(self):
+ logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName))
+ return self.outputName
def getAuthorName(self):
+ logging.debug('self.authorName=%s' % self.authorName)
return self.authorName
+ def getAuthorId(self):
+ logging.debug('self.authorId=%s' % self.authorId)
+ return self.authorId
+
+ def getAuthorURL(self):
+ logging.debug('self.authorURL=%s' % self.authorURL)
+ return self.authorURL
+
+ def getStoryId(self):
+ logging.debug('self.storyId=%s' % self.storyId)
+ return self.storyId
+
+ def getStoryName(self):
+ logging.debug('self.storyName=%s' % self.storyName)
+ return self.storyName
+
+ def getStoryDescription(self):
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+ return self.storyDescription
+
+ def getStoryPublished(self):
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ return self.storyPublished
+
+ def getStoryCreated(self):
+ self.storyCreated = datetime.datetime.now()
+ logging.debug('self.storyCreated=%s' % self.storyCreated)
+ return self.storyCreated
+
+ def getStoryUpdated(self):
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ return self.storyUpdated
+
+ def getLanguage(self):
+ logging.debug('self.language=%s' % self.language)
+ return self.language
+
+ def getLanguageId(self):
+ logging.debug('self.languageId=%s' % self.languageId)
+ return self.languageId
+
+    def getSubjects(self):
+        """Return the list of subject strings collected for the story."""
+        # Log the subjects themselves; this line used to print
+        # self.authorName under the 'self.subjects' label.
+        logging.debug('self.subjects=%s' % self.subjects)
+        return self.subjects
+
+ def getPublisher(self):
+ logging.debug('self.publisher=%s' % self.publisher)
+ return self.publisher
+
+ def getNumChapters(self):
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ return self.numChapters
+
+ def getNumWords(self):
+ logging.debug('self.numWords=%s' % self.numWords)
+ return self.numWords
+
+ def getCategory(self):
+ logging.debug('self.category=%s' % self.category)
+ return self.category
+
+ def getGenre(self):
+ logging.debug('self.genre=%s' % self.genre)
+ return self.genre
+
+ def getStoryStatus(self):
+ logging.debug('self.storyStatus=%s' % self.storyStatus)
+ return self.storyStatus
+
+ def getStoryRating(self):
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ return self.storyRating
+
+ def getStoryUserRating(self):
+ logging.debug('self.storyUserRating=%s' % self.storyUserRating)
+ return self.storyUserRating
+
+ def getPrintableUrl(self, url):
+ pass
+
+ def getStoryCharacters(self):
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ return self.storyCharacters
+
+ def getStorySeries(self):
+ logging.debug('self.storySeries=%s' % self.storySeries)
+ return self.storySeries
+
class FFA_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
diff --git a/fictionalley.py b/fictionalley.py
index 884720fd..20763bf9 100644
--- a/fictionalley.py
+++ b/fictionalley.py
@@ -12,13 +12,20 @@ import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
+import datetime
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
- self.host = up.urlparse(url).netloc
+ parsedUrl = up.urlparse(url)
+ self.host = parsedUrl.netloc
+ self.path = parsedUrl.path
+
+ logging.debug('self.host=%s' % self.host)
+ logging.debug('self.path=%s' % self.path)
+
cookieproc = u2.HTTPCookieProcessor()
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
@@ -35,6 +42,36 @@ class FictionAlley(FanfictionSiteAdapter):
rfc2109=False)
cookieproc.cookiejar.set_cookie(cookie)
self.opener = u2.build_opener(cookieproc)
+
+ ss = self.path.split('/')
+
+ self.storyDescription = 'Fanfiction Story'
+ self.authorId = ''
+ self.authorURL = ''
+ self.storyId = ''
+ if len(ss) > 2 and ss[1] == 'authors':
+ self.authorId = ss[2]
+ self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
+ if len(ss) > 3:
+ self.storyId = ss[3].replace ('.html','')
+ self.storyPublished = datetime.date(1970, 01, 31)
+ self.storyCreated = datetime.datetime.now()
+ self.storyUpdated = datetime.date(1970, 01, 31)
+ self.languageId = 'en-UK'
+ self.language = 'English'
+ self.subjects = []
+ self.subjects.append ('fanfiction')
+ self.publisher = self.host
+ self.numChapters = 0
+ self.numWords = 0
+ self.genre = 'FanFiction'
+ self.category = 'Category'
+ self.storyStatus = 'In-Progress'
+ self.storyRating = 'K'
+ self.storyUserRating = '0'
+ self.storyCharacters = []
+ self.storySeries = ''
+
def requiresLogin(self, url = None):
return False
@@ -48,31 +85,147 @@ class FictionAlley(FanfictionSiteAdapter):
def setPassword(self, password):
self.password = password
+    def _addSubject(self, subject):
+        """Append *subject* to self.subjects unless it is already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the subject was appended and False when it was a duplicate.
+        """
+        wanted = subject.upper()
+        if any(known.upper() == wanted for known in self.subjects):
+            return False
+        self.subjects.append(subject)
+        return True
+
+    def _addCharacter(self, character):
+        """Append *character* to self.storyCharacters unless already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the character was appended and False when it was a duplicate.
+        """
+        needle = character.upper()
+        for name in self.storyCharacters:
+            if name.upper() == needle:
+                break
+        else:
+            # Loop finished without a match: this is a new character.
+            self.storyCharacters.append(character)
+            return True
+        return False
+
+ def _processChapterHeaders(self, div):
+ brs = div.findAll ('br')
+ for br in brs:
+ keystr=''
+ valstr=''
+ if len(br.contents) > 2:
+ keystr = br.contents[1]
+ if keystr is not None:
+ strs = re.split ("<[^>]+>", str(keystr))
+ keystr=''
+ for s in strs:
+ keystr = keystr + s
+ valstr = br.contents[2].strip(' ')
+ if keystr is not None:
+ if keystr == 'Rating:':
+ self.storyRating = valstr
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ elif keystr == 'Genre:':
+ self.genre = valstr
+ logging.debug('self.genre=%s' % self.genre)
+ s2 = valstr.split(', ')
+ for ss2 in s2:
+ self._addSubject(ss2)
+ logging.debug('self.subjects=%s' % self.subjects)
+ elif keystr == 'Main Character(s):':
+ s2 = valstr.split(', ')
+ for ss2 in s2:
+ self._addCharacter(ss2)
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ elif keystr == 'Summary:':
+ self.storyDescription = valstr
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+
+
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
+
+ # There is some useful information in the headers of the first chapter page.
+ data = data.replace('','')
soup = bs.BeautifulStoneSoup(data)
# Get title from , remove before '-'.
title = soup.find('title').string
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
+ self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId
- links = soup.findAll('a', { 'class' : 'chapterlink' } )
+ links = soup.findAll('li')
+ # If it is decided that we really do care about number of words.. It's only available on the author's page..
+ #d0 = self.opener.open(self.authorURL).read()
+ #soupA = bs.BeautifulStoneSoup(d0)
+ #dls = soupA.findAll('dl')
+ #logging.debug('dls=%s' % dls)
+
+ self.numChapters = 0;
result = []
if len(links) == 0:
+ # Be aware that this means that the user has entered the {STORY}01.html URL.
+ # We will not have valid Published and Updated dates. The user should enter
+ # the {STORY}.html URL instead. We should force that instead of this.
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
result.append((self.url,self.storyName))
+ #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
+ self.numChapters = self.numChapters + 1;
+ div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
+ if div is not None:
+ self._processChapterHeaders(div)
else:
author = soup.find('h1', {'class' : 'title'})
self.authorName = author.a.string
- for a in links:
- url = a['href']
- title = a.string
- result.append((url,title))
+ summary = soup.find('div', {'class' : 'summary'})
+ ss = summary.contents
+ if len(ss) > 1:
+ ss1 = ss[0].split(': ')
+ if len(ss1) > 1 and ss1[0] == 'Rating':
+ self.storyRating = ss1[1]
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ self.storyDescription = str(ss[1]).replace("
","").replace("","").replace('\n','')
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+
+ for li in links:
+ a = li.find('a', {'class' : 'chapterlink'})
+ s = li.contents
+ if a is not None:
+ url = a['href']
+ title = a.string
+ result.append((url,title))
+ #logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
+ if self.numChapters == 0:
+ # fictionalley uses full URLs in chapter list.
+ d1 = self.opener.open(url).read()
+
+ # find & and
+ # replaced with matching div pair for easier parsing.
+ # Yes, it's an evil kludge, but what can ya do? Using
+ # something other than div prevents soup from pairing
+ # our div with poor html inside the story text.
+ d1 = d1.replace('','')
+ sop = bs.BeautifulStoneSoup(d1)
+
+ div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
+ if div is not None:
+ self._processChapterHeaders(div)
+
+ self.numChapters = self.numChapters + 1
+ if len(s) > 1:
+ datestr=''
+ ss2 = s[1].replace('\n','').replace('(','').split(' ')
+ if len(ss2) > 2 and ss2[0] == 'Posted:':
+ datestr = ss2[1] + ' ' + ss2[2]
+ tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
+ if self.numChapters == 1:
+ self.storyPublished = tmpdate
+ self.storyUpdated = tmpdate
+ logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
+ else:
+ logging.debug('li chapterlink not found! li=%s' % li)
- #print('Story "%s" by %s' % (self.storyName, self.authorName))
+
+ print('Story "%s" by %s' % (self.storyName, self.authorName))
+
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
return result
@@ -82,6 +235,9 @@ class FictionAlley(FanfictionSiteAdapter):
def getAuthorName(self):
return self.authorName
+ def getOutputName(self):
+ return self.outputName
+
def getText(self, url):
# fictionalley uses full URLs in chapter list.
data = self.opener.open(url).read()
@@ -97,10 +253,96 @@ class FictionAlley(FanfictionSiteAdapter):
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
if None == div:
logging.error("Error downloading Chapter: %s" % url)
- exit(1)
+ exit(20)
return ''
- return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
+
+ html = soup.findAll('html')
+ if len(html) > 1:
+ return html[1].__str__('utf8')
+ else:
+ return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
+ def getStoryURL(self):
+ logging.debug('self.url=%s' % self.url)
+ return self.url
+
+ def getAuthorURL(self):
+ logging.debug('self.authorURL=%s' % self.authorURL)
+ return self.authorURL
+
+ def getUUID(self):
+ logging.debug('self.uuid=%s' % self.uuid)
+ return self.uuid
+
+ def getAuthorId(self):
+ logging.debug('self.authorId=%s' % self.authorId)
+ return self.authorId
+
+ def getStoryId(self):
+ logging.debug('self.storyId=%s' % self.storyId)
+ return self.storyId
+
+ def getStoryDescription(self):
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+ return self.storyDescription
+
+ def getStoryPublished(self):
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ return self.storyPublished
+
+ def getStoryCreated(self):
+ self.storyCreated = datetime.datetime.now()
+ logging.debug('self.storyCreated=%s' % self.storyCreated)
+ return self.storyCreated
+
+ def getStoryUpdated(self):
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ return self.storyUpdated
+
+ def getLanguage(self):
+ logging.debug('self.language=%s' % self.language)
+ return self.language
+
+ def getLanguageId(self):
+ logging.debug('self.languageId=%s' % self.languageId)
+ return self.languageId
+
+    def getSubjects(self):
+        """Return the list of subject strings collected for the story."""
+        # Log the subjects themselves.  This line used to read
+        # self.authorName, which is not set in __init__, so the getter
+        # could raise AttributeError before the story page was parsed.
+        logging.debug('self.subjects=%s' % self.subjects)
+        return self.subjects
+
+ def getPublisher(self):
+ logging.debug('self.publisher=%s' % self.publisher)
+ return self.publisher
+
+ def getNumChapters(self):
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ return self.numChapters
+
+ def getNumWords(self):
+ logging.debug('self.numWords=%s' % self.numWords)
+ return self.numWords
+
+ def getCategory(self):
+ logging.debug('self.category=%s' % self.category)
+ return self.category
+
+ def getGenre(self):
+ logging.debug('self.genre=%s' % self.genre)
+ return self.genre
+
+ def getStoryStatus(self):
+ logging.debug('self.storyStatus=%s' % self.storyStatus)
+ return self.storyStatus
+
+ def getStoryRating(self):
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ return self.storyRating
+
+ def getStoryUserRating(self):
+ logging.debug('self.storyUserRating=%s' % self.storyUserRating)
+ return self.storyUserRating
+
def getPrintableUrl(self, url):
return url
@@ -114,6 +356,15 @@ class FictionAlley(FanfictionSiteAdapter):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
+
+ def getStoryCharacters(self):
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ return self.storyCharacters
+
+ def getStorySeries(self):
+ logging.debug('self.storySeries=%s' % self.storySeries)
+ return self.storySeries
+
if __name__ == '__main__':
diff --git a/ficwad.py b/ficwad.py
index 28b71584..133d424a 100644
--- a/ficwad.py
+++ b/ficwad.py
@@ -12,6 +12,8 @@ import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
+import time
+import datetime
from adapter import *
@@ -32,7 +34,44 @@ class FicWad(FanfictionSiteAdapter):
def setPassword(self, password):
self.password = password
+    def _addSubject(self, subject):
+        """Append *subject* to self.subjects unless it is already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the subject was appended and False when it was a duplicate.
+        """
+        wanted = subject.upper()
+        for known in self.subjects:
+            if known.upper() == wanted:
+                break
+        else:
+            # Loop finished without a match: this is a new subject.
+            self.subjects.append(subject)
+            return True
+        return False
+
+    def _addCharacter(self, character):
+        """Append *character* to self.storyCharacters unless already listed.
+
+        Entries are compared case-insensitively via upper().  Returns True
+        when the character was appended and False when it was a duplicate.
+        """
+        needle = character.upper()
+        if any(name.upper() == needle for name in self.storyCharacters):
+            return False
+        self.storyCharacters.append(character)
+        return True
+
def extractIndividualUrls(self):
+ self.storyDescription = 'Fanfiction Story'
+ self.authorId = '0'
+ self.storyId = '0'
+ self.storyPublished = datetime.date(1970, 01, 31)
+ self.storyCreated = datetime.datetime.now()
+ self.storyUpdated = datetime.date(1970, 01, 31)
+ self.languageId = 'en-UK'
+ self.language = 'English'
+ self.subjects = []
+ self.subjects.append ('fanfiction')
+ self.publisher = self.host
+ self.numChapters = 0
+ self.numWords = 0
+ self.genre = 'FanFiction'
+ self.category = 'Category'
+ self.storyStatus = 'In-Progress'
+ self.storyRating = 'PG'
+ self.storyUserRating = '0'
+ self.storyCharacters = []
+ self.storySeries = ''
+
data = u2.urlopen(self.url).read()
soup = bs.BeautifulStoneSoup(data)
@@ -40,50 +79,254 @@ class FicWad(FanfictionSiteAdapter):
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# last of crumbtrail
- self.storyName = allAhrefs[-1].string.strip()
+ storyinfo = allAhrefs[-1]
+ (u0, u1, storyid) = storyinfo['href'].split('/')
+ if u1 == "story":
+ # This page does not have the correct information on it.. Need to get the Story Title Page
+ logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
+ self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
+ data = u2.urlopen(self.url).read()
+ soup = bs.BeautifulStoneSoup(data)
+
+ story = soup.find('div', {'id' : 'story'})
+ crumbtrail = story.find('h3') # the only h3 ficwad uses.
+ allAhrefs = crumbtrail.findAll('a')
+
# save chapter name from header in case of one-shot.
- chaptername = story.find('h4').find('a').string.strip()
+ storyinfo = story.find('h4').find('a')
+ (u0, u1, self.storyId) = storyinfo['href'].split('/')
+ self.storyName = storyinfo.string.strip()
+ self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
+
+ logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))
author = soup.find('span', {'class' : 'author'})
self.authorName = str(author.a.string)
+ (u0, u1,self.authorId) = author.a['href'].split('/')
+ self.authorURL = 'http://' + self.host + author.a['href']
+ logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
- select = soup.find('select', { 'name' : 'goto' } )
+ description = soup.find('blockquote', {'class' : 'summary'})
+ if description is not None:
+ self.storyDescription = str(description.p.string)
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+
+ meta = soup.find('p', {'class' : 'meta'})
+ if meta is not None:
+ s = str(meta).replace('\n',' ').replace('\t','').split(' - ')
+ logging.debug('meta.s=%s' % s)
+ for ss in s:
+ s1 = ss.replace(' ','').split(':')
+ #logging.debug('meta.s.s1=%s' % s1)
+ if len(s1) > 1:
+ s2 = re.split ('<[^>]+>', s1[0])
+ #logging.debug('meta.s.s1.s2=%s' % s2)
+ if len(s2) > 1:
+ s1[0] = s2[1]
+ skey = s1[0].strip()
+ #logging.debug('Checking = %s' % skey)
+ if skey == 'Category':
+ soup1 = bs.BeautifulStoneSoup(s1[1])
+ allAs = soup1.findAll('a')
+ for a in allAs:
+ if self.category == 'Category':
+ self.category = str(a.string)
+ logging.debug('self.category=%s' % self.category)
+ self._addSubject(self.category)
+ logging.debug('self.subjects=%s' % self.subjects)
+ elif skey == 'Rating':
+ self.storyRating = s1[1]
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ elif skey == 'Genres':
+ self.genre = s1[1]
+ logging.debug('self.genre=%s' % self.genre)
+ s2 = s1[1].split(', ')
+ for ss2 in s2:
+ self._addSubject(ss2)
+ logging.debug('self.subjects=%s' % self.subjects)
+ elif skey == 'Characters':
+ s2 = s1[1].split(', ')
+ for ss2 in s2:
+ self._addCharacter(ss2)
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ elif skey == 'Chapters':
+ self.numChapters = s1[1]
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ elif skey == 'Warnings':
+ logging.debug('Warnings=%s' % s1[1])
+ elif skey == 'Published':
+ self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ elif skey == 'Updated':
+ self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ else:
+ s3 = re.split ('<[^>]+>', s1[0])
+ #logging.debug('meta.s.s1.s3=%s' % s3)
+ if len(s3) > 1:
+ s1[0] = s3[0]
+ s4 = s1[0].split('w')
+ #logging.debug('meta.s.s1.s4=%s' % s4)
+ if len(s4) > 1 and s4[1] == 'ords':
+ self.numWords = s4[0]
+ logging.debug('self.numWords=%s' % self.numWords)
+
+
+ print('Story "%s" by %s' % (self.storyName, self.authorName))
result = []
- if select is None:
- # Single chapter storys don't have title in crumbtrail, just 'chapter' title in h4.
- self.storyName = chaptername
- # no chapters found, try url by itself.
- result.append((self.url,self.storyName))
- else:
- allOptions = select.findAll('option')
- for o in allOptions:
- url = 'http://' + self.host + o['value']
- title = o.string
- # ficwad includes 'Story Index' in the dropdown of chapters,
- # but it's not a real chapter.
- if title != "Story Index":
- result.append((url,title))
+ ii = 1
+
+ storylist = soup.find('ul', {'id' : 'storylist'})
+ if storylist is not None:
+ allH4s = storylist.findAll('h4')
+ #logging.debug('allH4s=%s' % allH4s)
+
+ if allH4s is not None:
+ for h4 in allH4s:
+ chapterinfo = h4.find('a')
+ #logging.debug('Chapter1=%s' % chapterinfo)
+ url = 'http://' + self.host + chapterinfo['href']
+ title = chapterinfo.string.strip()
+ #logging.debug('Chapter=%s, %s' % (url, title))
+ # ficwad includes 'Story Index' in the dropdown of chapters,
+ # but it's not a real chapter.
+ if title != "Story Index":
+ logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
+ result.append((url,title))
+ ii = ii+1
+ else:
+ logging.debug('Skipping Story Index. URL %s' % url)
+
+ if ii == 1:
+ select = soup.find('select', { 'name' : 'goto' } )
+
+ if select is None:
+ result.append((self.url,self.storyName))
+ logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
+ else:
+ allOptions = select.findAll('option')
+ for o in allOptions:
+ url = 'http://' + self.host + o['value']
+ title = o.string
+ # ficwad includes 'Story Index' in the dropdown of chapters,
+ # but it's not a real chapter.
+ if title != "Story Index":
+ logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
+ result.append((url,title))
+ ii = ii+1
+ else:
+ logging.debug('Skipping Story Index. URL %s' % url)
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
+
return result
def getStoryName(self):
return self.storyName
+ def getOutputName(self):
+ return self.outputName
+
def getAuthorName(self):
return self.authorName
def getText(self, url):
+ if url.find('http://') == -1:
+ url = 'http://' + self.host + '/' + url
+
data = u2.urlopen(url).read()
soup = bs.BeautifulStoneSoup(data)
div = soup.find('div', {'id' : 'storytext'})
if None == div:
logging.error("Error downloading Chapter: %s" % url)
- exit(1)
+ exit(20)
return ''
return div.__str__('utf8')
+ def getStoryURL(self):
+ logging.debug('self.url=%s' % self.url)
+ return self.url
+
+ def getAuthorURL(self):
+ logging.debug('self.authorURL=%s' % self.authorURL)
+ return self.authorURL
+
+ def getUUID(self):
+ logging.debug('self.uuid=%s' % self.uuid)
+ return self.uuid
+
+ def getAuthorId(self):
+ logging.debug('self.authorId=%s' % self.authorId)
+ return self.authorId
+
+ def getStoryId(self):
+ logging.debug('self.storyId=%s' % self.storyId)
+ return self.storyId
+
+ def getStoryDescription(self):
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+ return self.storyDescription
+
+ def getStoryPublished(self):
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ return self.storyPublished
+
+ def getStoryCreated(self):
+ self.storyCreated = datetime.datetime.now()
+ logging.debug('self.storyCreated=%s' % self.storyCreated)
+ return self.storyCreated
+
+ def getStoryUpdated(self):
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ return self.storyUpdated
+
+ def getLanguage(self):
+ logging.debug('self.language=%s' % self.language)
+ return self.language
+
+ def getLanguageId(self):
+ logging.debug('self.languageId=%s' % self.languageId)
+ return self.languageId
+
+ def getSubjects(self):  # Log and return the adapter's subjects collection.
+ logging.debug('self.subjects=%s' % self.subjects)
+ return self.subjects
+
+ def getPublisher(self):
+ logging.debug('self.publisher=%s' % self.publisher)
+ return self.publisher
+
+ def getNumChapters(self):
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ return self.numChapters
+
+ def getNumWords(self):
+ logging.debug('self.numWords=%s' % self.numWords)
+ return self.numWords
+
+ def getCategory(self):
+ logging.debug('self.category=%s' % self.category)
+ return self.category
+
+ def getGenre(self):
+ logging.debug('self.genre=%s' % self.genre)
+ return self.genre
+
+ def getStoryStatus(self):
+ logging.debug('self.storyStatus=%s' % self.storyStatus)
+ return self.storyStatus
+
+ def getStoryRating(self):
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ return self.storyRating
+
+ def getStoryUserRating(self):
+ logging.debug('self.storyUserRating=%s' % self.storyUserRating)
+ return self.storyUserRating
+
def getPrintableUrl(self, url):
return url
@@ -98,6 +341,15 @@ class FicWad(FanfictionSiteAdapter):
other = dict(submit = 'Log In', remember='yes')
return (login, other)
+ def getStoryCharacters(self):
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ return self.storyCharacters
+
+ def getStorySeries(self):
+ logging.debug('self.storySeries=%s' % self.storySeries)
+ return self.storySeries
+
+
if __name__ == '__main__':
url = 'http://www.ficwad.com/story/14536'
diff --git a/hpfiction.py b/hpfiction.py
index 75cb4597..9f6cd467 100644
--- a/hpfiction.py
+++ b/hpfiction.py
@@ -15,6 +15,8 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
+import time
+import datetime
from constants import *
from adapter import *
@@ -32,8 +34,37 @@ class HPFiction(FanfictionSiteAdapter):
self.host = parsedUrl.netloc
self.path = parsedUrl.path
+ logging.debug('self.url=%s' % self.url)
+ logging.debug('self.host=%s' % self.host)
+ logging.debug('self.path=%s' % self.path)
+
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
+ self.storyDescription = 'Fanfiction Story'
+ self.authorId = '0'
+ self.authorURL = ''
+ (u1, self.storyId) = self.url.split('=')
+ self.storyPublished = datetime.date(1970, 01, 31)
+ self.storyCreated = datetime.datetime.now()
+ self.storyUpdated = datetime.date(1970, 01, 31)
+ self.languageId = 'en-UK'
+ self.language = 'English'
+ self.subjects = []
+ self.subjects.append ('fanfiction')
+ self.subjects.append ('Harry Potter')
+ self.publisher = self.host
+ self.numChapters = 0
+ self.numWords = 0
+ self.genre = 'FanFiction'
+ self.category = 'Category'
+ self.storyStatus = 'In-Progress'
+ self.storyRating = 'K'
+ self.storyUserRating = '0'
+ self.storyCharacters = []
+ self.storySeries = ''
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
+
logging.debug("Created HPFiction: url=%s" % (self.url))
def _getLoginScript(self):
@@ -45,23 +76,116 @@ class HPFiction(FanfictionSiteAdapter):
def performLogin(self, url = None):
return True
+ def _addSubject(self, subject):
+ subj = subject.upper()
+ for s in self.subjects:
+ if s.upper() == subj:
+ return False
+ self.subjects.append(subject)
+ return True
+
+ def _addCharacter(self, character):
+ chara = character.upper()
+ for c in self.storyCharacters:
+ if c.upper() == chara:
+ return False
+ self.storyCharacters.append(character)
+ return True
+
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
soup = bs.BeautifulSoup(data)
links = soup.findAll('a')
+ def_chapurl = ''
+ def_chaptitle = ''
for a in links:
if a['href'].find('psid') != -1:
self.storyName = a.string
+ logging.debug('self.storyName=%s' % self.storyName)
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
+ self.authorURL = 'http://' + self.host + '/' + a['href']
+ (u1, self.authorId) = a['href'].split('=')
+ logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
+ elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
+ def_chapurl = 'http://' + self.host + '/viewstory.php' + str(a['href'])
+ def_chaptitle = a.string
+ logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
+
+ centers = soup.findAll('center')
+ for center in centers:
+ tds = center.findAll ('td')
+ if tds is not None and len(tds) > 0:
+ for td in tds:
+ s = re.split ("<[^>]+>", str(td).replace('\n','').replace(' ',' '))
+ logging.debug('s=%s' % s)
+ ii = 0
+ ll = len(s)
+ sss = ''
+ while ii < ll - 1:
+ if s[ii] is not None and len(s[ii]) > 0:
+ if s[ii] == 'Rating:':
+ self.storyRating = s[ii+1]
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ ii = ii + 2
+ elif s[ii] == 'Chapters:':
+ self.numChapters = s[ii+1]
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ ii = ii + 2
+ elif s[ii] == 'Characters:':
+ s2 = s[ii+1].split(', ')
+ for ss2 in s2:
+ self._addCharacter(ss2)
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ ii = ii + 2
+ elif s[ii] == 'Genre(s):':
+ self.genre = s[ii+1]
+ logging.debug('self.genre=%s' % self.genre)
+ s2 = s[ii+1].split(', ')
+ for ss2 in s2:
+ self._addSubject(ss2)
+ logging.debug('self.subjects=%s' % self.subjects)
+ ii = ii + 2
+ elif s[ii] == 'Status:':
+ if s[ii+1].strip(' ') == "Work In Progress":
+ self.storyStatus = 'In-Progress'
+ else:
+ self.storyStatus = 'Completed'
+ ii = ii + 2
+ elif s[ii] == 'First Published:':
+ self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ ii = ii + 2
+ elif s[ii] == 'Last Updated:':
+ self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ ii = ii + 2
+ elif s[ii] == 'Last Published Chapter:':
+ ii = ii + 2
+ elif s[ii] == 'Pairings:':
+ ii = ii + 2
+ elif s[ii] == 'Warnings:':
+ ii = ii + 2
+ else:
+ sss = sss + ' ' + s[ii]
+ ii = ii + 1
+ else:
+ ii = ii + 1
+ self.storyDescription = sss
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
urls = []
+ self.outputName = self.storyName.replace(" ", "_") + '-hp_' + self.storyId
+
select = soup.find('select', {'name' : 'chapterid'})
if select is None:
# no chapters found, try url by itself.
- urls.append((self.url,self.storyName))
+ if len(def_chapurl) > 0:
+ urls.append((def_chapurl, def_chaptitle))
+ else:
+ urls.append((self.url,self.storyName))
else:
for o in select.findAll('option'):
if 'value' in o._getAttrMap():
@@ -69,11 +193,18 @@ class HPFiction(FanfictionSiteAdapter):
title = o.string
if title != "Story Index":
urls.append((url,title))
+
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
+
return urls
def getStoryName(self):
return self.storyName
+ def getOutputName(self):
+ return self.outputName
+
def getAuthorName(self):
return self.authorName
@@ -84,9 +215,100 @@ class HPFiction(FanfictionSiteAdapter):
divtext = soup.find('div', {'id' : 'fluidtext'})
if None == divtext:
logging.error("Error downloading Chapter: %s" % url)
- exit(1)
+ exit(20)
return divtext.__str__('utf8')
+ def getAuthorId(self):
+ logging.debug('self.authorId=%s' % self.authorId)
+ return self.authorId
+
+ def getStoryId(self):
+ logging.debug('self.storyId=%s' % self.storyId)
+ return self.storyId
+
+ def getStoryDescription(self):
+ logging.debug('self.storyDescription=%s' % self.storyDescription)
+ return self.storyDescription
+
+ def getStoryPublished(self):
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ return self.storyPublished
+
+ def getStoryCreated(self):
+ self.storyCreated = datetime.datetime.now()
+ logging.debug('self.storyCreated=%s' % self.storyCreated)
+ return self.storyCreated
+
+ def getStoryUpdated(self):
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ return self.storyUpdated
+
+ def getLanguage(self):
+ logging.debug('self.language=%s' % self.language)
+ return self.language
+
+ def getLanguageId(self):
+ logging.debug('self.languageId=%s' % self.languageId)
+ return self.languageId
+
+ def getSubjects(self):  # Log and return the adapter's subjects collection.
+ logging.debug('self.subjects=%s' % self.subjects)
+ return self.subjects
+
+ def getPublisher(self):
+ logging.debug('self.publisher=%s' % self.publisher)
+ return self.publisher
+
+ def getNumChapters(self):
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ return self.numChapters
+
+ def getNumWords(self):
+ logging.debug('self.numWords=%s' % self.numWords)
+ return self.numWords
+
+ def getStoryURL(self):
+ logging.debug('self.url=%s' % self.url)
+ return self.url
+
+ def getAuthorURL(self):
+ logging.debug('self.authorURL=%s' % self.authorURL)
+ return self.authorURL
+
+ def getUUID(self):
+ logging.debug('self.uuid=%s' % self.uuid)
+ return self.uuid
+
+ def getCategory(self):
+ logging.debug('self.category=%s' % self.category)
+ return self.category
+
+ def getGenre(self):
+ logging.debug('self.genre=%s' % self.genre)
+ return self.genre
+
+ def getStoryStatus(self):
+ logging.debug('self.storyStatus=%s' % self.storyStatus)
+ return self.storyStatus
+
+ def getStoryRating(self):
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ return self.storyRating
+
+ def getStoryUserRating(self):
+ logging.debug('self.storyUserRating=%s' % self.storyUserRating)
+ return self.storyUserRating
+
+ def getStoryCharacters(self):
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ return self.storyCharacters
+
+ def getStorySeries(self):
+ logging.debug('self.storySeries=%s' % self.storySeries)
+ return self.storySeries
+
+
+
class FF_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
diff --git a/output.py b/output.py
index 1700bfe7..9ffb1503 100644
--- a/output.py
+++ b/output.py
@@ -26,6 +26,7 @@ from constants import *
import html2text
+import datetime
class FanficWriter:
@@ -41,8 +42,8 @@ class FanficWriter:
class TextWriter(FanficWriter):
htmlWriter = None
- def __init__(self, base, name, author, inmemory=False, compress=False):
- self.htmlWriter = HTMLWriter(base, name, author, True, False)
+ def __init__(self, base, adapter, inmemory=False, compress=False):
+ self.htmlWriter = HTMLWriter(base, adapter, True, False)
def writeChapter(self, index, title, text):
self.htmlWriter.writeChapter(index, title, text)
@@ -57,12 +58,13 @@ class TextWriter(FanficWriter):
class HTMLWriter(FanficWriter):
body = ''
- def __init__(self, base, name, author, inmemory=False, compress=False):
+ def __init__(self, base, adapter, inmemory=False, compress=False):
self.basePath = base
- self.storyTitle = removeEntities(name)
- self.name = makeAcceptableFilename(name)
- self.fileName = self.basePath + '/' + self.name + '.html'
- self.authorName = removeEntities(author)
+ self.storyTitle = removeEntities(adapter.getStoryName())
+ self.name = makeAcceptableFilename(adapter.getOutputName())
+ self.fileName = self.basePath + '/' + self.name + '.html'
+ self.authorName = removeEntities(adapter.getAuthorName())
+ self.adapter = adapter
self.inmemory = inmemory
@@ -131,14 +133,14 @@ class EPubFanficWriter(FanficWriter):
for f in self.files:
self.files[f].close()
- def __init__(self, base, name, author, inmemory=False, compress=True):
+ def __init__(self, base, adapter, inmemory=False, compress=True):
self.basePath = base
- self.storyTitle = removeEntities(name)
- self.name = makeAcceptableFilename(name)
+ self.storyTitle = removeEntities(adapter.getStoryName())
+ self.name = makeAcceptableFilename(adapter.getOutputName())
self.directory = self.basePath + '/' + self.name
- self.authorName = removeEntities(author)
-
+ self.authorName = removeEntities(adapter.getAuthorName())
self.inmemory = inmemory
+ self.adapter = adapter
self.files = {}
self.chapters = []
@@ -226,17 +228,50 @@ class EPubFanficWriter(FanficWriter):
tocFilePath = "OEBPS/toc.ncx"
# toc = open(tocFilePath, 'w')
# print >> toc, TOC_START % self.storyTitle
- self._writeFile(tocFilePath, TOC_START % self.storyTitle)
+ self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
+
+ published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
+ createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
+ created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
+ updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
+ calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
+
+ ### writing content -- title page
+ titleFilePath = "OEBPS/title_page.xhtml"
+ self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryDescription()))
+
### writing content -- opf file
opfFilePath = "OEBPS/content.opf"
-
+
# opf = open(opfFilePath, 'w')
- self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))
+ self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, self.adapter.getStoryDescription()))
+
+ i = 0
+ subjs = []
+ subjs = self.adapter.getSubjects()
+ for subj in subjs:
+ self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
+ i = i + 1
+ if (i <= 0):
+ self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
+
+ self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
ids = []
- i = 1
+ i = 0
+
+ t = "Title Page"
+ f = "title_page.xhtml"
+ chapterId = "Title Page"
+ self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
+ self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
+
+ ids.append(chapterId)
+
+ i = i + 1
+
for t,f in self.chapters:
chapterId = "chapter%04d" % i
diff --git a/twilighted.py b/twilighted.py
index a7e77a53..f7654041 100644
--- a/twilighted.py
+++ b/twilighted.py
@@ -11,119 +11,360 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
+import time
+import datetime
from adapter import *
import twipassword
class Twilighted(FanfictionSiteAdapter):
- def __init__(self, url):
- self.url = url
- parsedUrl = up.urlparse(url)
- self.host = parsedUrl.netloc
- self.path = parsedUrl.path
- self.opener = u2.build_opener(u2.HTTPCookieProcessor())
- self.password=twipassword.password
- self.login='sigizmund'
- logging.debug("Created Twilighted: url=%s" % (self.url))
-
-
- def requiresLogin(self, url = None):
- # potionsandsnitches.net doesn't require login.
- if self.host == 'potionsandsnitches.net':
- return False
- else:
- return True
-
- def performLogin(self, url = None):
- data = {}
-
- data['penname'] = self.login
- data['password'] = self.password
- data['cookiecheck'] = '1'
- data['submit'] = 'Submit'
-
- urlvals = u.urlencode(data)
- loginUrl = 'http://' + self.host + self._getLoginScript()
- logging.debug("Will now login to URL %s" % loginUrl)
-
- req = self.opener.open(loginUrl, urlvals)
-
- d = req.read().decode('utf-8')
-
- if self.reqLoginData(d) :
- return False
- else:
- return True
-
-
- def setLogin(self, login):
- self.login = login
-
- def setPassword(self, password):
- self.password = password
-
- def extractIndividualUrls(self):
- data = self.opener.open(self.url).read()
+ def __init__(self, url):
+ self.url = url
+ parsedUrl = up.urlparse(url)
+ self.host = parsedUrl.netloc
+ self.path = parsedUrl.path
+ self.opener = u2.build_opener(u2.HTTPCookieProcessor())
+ self.password=twipassword.password
+ self.login='sigizmund'
+ self.storyDescription = 'Fanfiction Story'
+ self.authorId = '0'
+ self.authorURL = ''
+ self.storyId = '0'
+ self.storyPublished = datetime.date(1970, 01, 31)
+ self.storyCreated = datetime.datetime.now()
+ self.storyUpdated = datetime.date(1970, 01, 31)
+ self.languageId = 'en-UK'
+ self.language = 'English'
+ self.subjects = []
+ self.subjects.append ('fanfiction')
+ self.subjects.append ('Twilight')
+ self.publisher = self.host
+ self.numChapters = 0
+ self.numWords = 0
+ self.genre = 'FanFiction'
+ self.category = 'Category'
+ self.storyStatus = 'In-Progress'
+ self.storyRating = 'PG'
+ self.storyUserRating = '0'
+ self.storyCharacters = []
+ self.storySeries = ''
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
- if self.reqLoginData(data):
- self.performLogin()
- data = self.opener.open(self.url).read()
- if self.reqLoginData(data):
- return None
+ logging.debug("Created Twilighted: url=%s" % (self.url))
+
+
+ def requiresLogin(self, url = None):
+ # potionsandsnitches.net doesn't require login.
+ if self.host == 'potionsandsnitches.net':
+ return False
+ else:
+ return True
+
+ def performLogin(self, url = None):
+ data = {}
- soup = bs.BeautifulStoneSoup(data)
-
- title = soup.find('title').string
- self.storyName = title.split(' by ')[0].strip()
- self.authorName = title.split(' by ')[1].strip()
-
- select = soup.find('select', { 'name' : 'chapter' } )
-
- result = []
- if select is None:
- # no chapters found, try url by itself.
- result.append((self.url,self.storyName))
- else:
- allOptions = select.findAll('option')
- for o in allOptions:
- url = self.url + "&chapter=%s" % o['value']
- title = o.string
- result.append((url,title))
-
- return result
-
- def getStoryName(self):
- return self.storyName
-
- def getAuthorName(self):
- return self.authorName
-
- def getText(self, url):
- if url.find('http://') == -1:
- url = 'http://' + self.host + '/' + url
-
- logging.debug('Getting data from: %s' % url)
-
- data = self.opener.open(url).read()
+ data['penname'] = self.login
+ data['password'] = self.password
+ data['cookiecheck'] = '1'
+ data['submit'] = 'Submit'
- soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
+ urlvals = u.urlencode(data)
+ loginUrl = 'http://' + self.host + self._getLoginScript()
+ logging.debug("Will now login to URL %s" % loginUrl)
+
+ req = self.opener.open(loginUrl, urlvals)
+
+ d = req.read().decode('utf-8')
+
+ if self.reqLoginData(d) :
+ return False
+ else:
+ return True
- div = soup.find('div', {'id' : 'story'})
- if None == div:
- return ''
+ def setLogin(self, login):
+ self.login = login
- return div.__str__('utf8')
+ def setPassword(self, password):
+ self.password = password
- def _getLoginScript(self):
- return '/user.php?action=login'
+ def _addSubject(self, subject):
+ subj = subject.upper()
+ for s in self.subjects:
+ if s.upper() == subj:
+ return False
+ self.subjects.append(subject)
+ return True
- def reqLoginData(self, data):
- if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
- return True
- else:
- return False
+ def _addCharacter(self, character):
+ chara = character.upper()
+ for c in self.storyCharacters:
+ if c.upper() == chara:
+ return False
+ self.storyCharacters.append(character)
+ return True
+ def extractIndividualUrls(self):  # Fetch the story page (logging in if asked), collect (url, title) chapter pairs and scrape metadata from the index page.
+ data = self.opener.open(self.url).read()
+
+ if self.reqLoginData(data):
+ self.performLogin()
+ data = self.opener.open(self.url).read()
+ if self.reqLoginData(data):
+ return None
+
+ soup = bs.BeautifulStoneSoup(data)
+
+ title = soup.find('title').string
+ self.storyName = title.split(' by ')[0].strip()
+ self.authorName = title.split(' by ')[1].strip()
+ self.outputName = self.storyName.replace(" ", "_")
+
+ select = soup.find('select', { 'name' : 'chapter' } )
+
+ result = []
+ if select is None:
+ # no chapters found, try url by itself.
+ result.append((self.url,self.storyName))
+ else:
+ allOptions = select.findAll('option')
+ for o in allOptions:
+ url = self.url + "&chapter=%s" % o['value']
+ title = o.string
+ result.append((url,title))
+
+ url = self.url + "&index=1"
+ data = self.opener.open(url).read()
+ lines = data.split('\n')
+ soup = bs.BeautifulStoneSoup(data)
+ metas = soup.findAll('meta')
+ for meta in metas:
+ if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
+ #logging.debug('Meta: %s' % meta)
+ if 'content' in meta._getAttrMap():
+ s1 = bs.BeautifulStoneSoup(meta['content'])
+ ps = s1.findAll('p')
+ if len(ps) > 0:
+ self.storyDescription = ps[0]
+ logging.debug('self.storyDescription=%s' % (self.storyDescription))
+ else:
+ divs = meta.findAll('div')
+ #logging.debug('Divs: %s' % divs)
+
+ for div in divs:
+ #logging.debug('Div: %s' % div)
+ if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
+ #logging.debug('Div PAGETITLE: %s' % div)
+ allA = div.findAll('a')
+ for a in allA:
+ if 'href' in a._getAttrMap():
+ if a['href'].find('viewstory.php?sid=') != -1:
+ str1 = a.string
+ (vs, self.storyId) = a['href'].split('=')
+ logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
+ self.outputName = self.outputName + "-tw_" + self.storyId
+ logging.debug('self.outputName=%s' % self.outputName)
+ if a['href'].find('viewuser.php?uid=') != -1:
+ str1 = a.string
+ (vs, self.authorId) = a['href'].split('=')
+ logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
+ self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
+ logging.debug('self.authorURL=%s' % self.authorURL)
+ if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
+ #logging.debug('Div CONTENT: %s' % div)
+ brs = div.findAll('br')
+ for br in brs:
+ buf = unicode(br).encode('utf-8')
+ strs = re.split ('<[^>]+>', buf)
+ #logging.debug('BUF: %s' % strs)
+ ii = 2
+ stlen = len(strs)
+ while stlen > ii+1:
+ if len(strs[ii]) == 0:
+ ii = ii+1
+ continue
+ if strs[ii] == 'Categories:':
+ ii = ii+1
+ while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
+ if strs[ii] != ' ' and strs[ii] != ', ':
+ if self.category == 'Category':
+ self.category = strs[ii].strip(' ')
+ self._addSubject(strs[ii].strip(' '))
+ ii = ii+1
+ logging.debug('self.subjects=%s' % self.subjects)
+ elif strs[ii] == 'Characters: ':  # elif: after the Categories scan moves ii, go back through the while bound check rather than indexing strs[ii] unchecked
+ ii = ii+1
+ while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
+ if strs[ii] != ' ' and strs[ii] != ', ':
+ self._addCharacter(strs[ii].strip(' '))
+ ii = ii+1
+ logging.debug('self.storyCharacters=%s' % self.storyCharacters)
+ elif strs[ii] == 'Completed:':
+ if strs[ii+1].strip(' ') == "No":
+ self.storyStatus = 'In-Progress'
+ else:
+ self.storyStatus = 'Completed'
+ ii = ii+2
+ logging.debug('self.storyStatus=%s' % self.storyStatus)
+ elif strs[ii] == 'Rated:':
+ self.storyRating = strs[ii+1].strip(' ')
+ ii = ii+2
+ logging.debug('self.storyRating=%s' % self.storyRating)
+ elif strs[ii] == 'Series:':
+ self.storySeries = strs[ii+1].strip(' ')
+ if self.storySeries == 'None':
+ self.storySeries = ''
+ ii = ii+2
+ logging.debug('self.storySeries=%s' % self.storySeries)
+ elif strs[ii] == 'Chapters: ':
+ self.numChapters = strs[ii+1].strip(' ')
+ ii = ii+2
+ logging.debug('self.numChapters=%s' % self.numChapters)
+ elif strs[ii] == 'Word count:':
+ self.numWords = strs[ii+1].strip(' ')
+ ii = ii+2
+ logging.debug('self.numWords=%s' % self.numWords)
+ elif strs[ii] == ' Published: ':
+ self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
+ ii = ii+2
+ logging.debug('self.storyPublished=%s' % self.storyPublished)
+ elif strs[ii] == 'Updated:':
+ self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
+ ii = ii+2
+ logging.debug('self.storyUpdated=%s' % self.storyUpdated)
+ else:
+ logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
+ ii = ii+2
+
+ self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
+ logging.debug('self.uuid=%s' % self.uuid)
+
+ return result
+
+ def getStoryName(self):
+ return self.storyName
+
+ def getOutputName(self):
+ return self.outputName
+
+ def getAuthorName(self):
+ return self.authorName
+
+ def getText(self, url):
+ if url.find('http://') == -1:
+ url = 'http://' + self.host + '/' + url
+
+ logging.debug('Getting data from: %s' % url)
+
+ data = self.opener.open(url).read()
+
+ soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
+
+ div = soup.find('div', {'id' : 'story'})
+
+ if None == div:
+ return '