Add a title page to the resulting EPUB file. This required scraping more information from the web pages in order to populate the new fields. This commit also changes the way output.py uses the adapters: the writer is now passed the adapter itself and calls the adapter's functions to retrieve the scraped information. This will make it easier down the road to add more information, or even pictures.

This commit is contained in:
wsuetholz 2010-11-09 16:35:46 -06:00
parent 778deaea00
commit 379efc34f1
9 changed files with 1577 additions and 169 deletions

View file

@ -29,11 +29,80 @@ class FanfictionSiteAdapter:
def setPassword(self, password):
pass
def getStoryName(self):
def getStoryURL(self):
pass
def getUUID(self):
pass
def getOutputName(self):
pass
def getAuthorURL(self):
pass
def getAuthorId(self):
pass
def getAuthorName(self):
pass
def getStoryId(self):
pass
def getStoryName(self):
pass
def getStoryDescription(self):
pass
def getStoryCreated(self):
pass
def getStoryPublished(self):
pass
def getStoryUpdated(self):
pass
def getStorySeries(self):
pass
def getLanguage(self):
pass
def getLanguageId(self):
pass
def getSubjects(self):
pass
def getCharacters(self):
pass
def getPublisher(self):
pass
def getNumChapters(self):
pass
def getNumWords(self):
pass
def getCategory(self):
pass
def getGenre(self):
pass
def getStoryStatus(self):
pass
def getStoryRating(self):
pass
def getStoryUserRating(self):
pass
def getPrintableUrl(self, url):
pass
pass

View file

@ -15,6 +15,9 @@ h6 { text-align: center; }
padding:0px;
}
.center {text-align: center;}
.cover {text-align: center;}
.full {width: 100%; }
.quarter {width: 25%; }
.smcap {font-variant: small-caps;}
.u {text-decoration: underline;}
.bold {font-weight: bold;}
@ -22,6 +25,37 @@ h6 { text-align: center; }
MIMETYPE = '''application/epub+zip'''
TITLE_PAGE = '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head>
<body><div class="cover">
<h1 id="cfs_0"><a id="StoryLink" href="%s">%s</a></h1>
<h2 id="cfs_1">by <a id="AuthorLink" href="%s">%s</a></h2>
</div><div style="text-align:center">
<table class="full">
<colgroup span="2"></colgroup>
<tr><td> </td>
<td> </td>
</tr><tr><td> </td>
<td> </td>
</tr><tr><td><b>Category:</b></td><td>%s</td>
</tr><tr><td><b>Genre:</b></td><td>%s</td>
</tr><tr><td><b>Status:</b></td><td>%s</td>
</tr><tr><td><b>Published:</b></td><td>%s</td>
</tr><tr><td><b>Updated:</b></td><td>%s</td>
</tr><tr><td><b>Packaged:</b></td><td>%s</td>
</tr><tr><td><b>Rating Age/User:</b></td><td>%s / %s</td>
</tr><tr><td><b>Chapters/Words:</b></td><td>%s / %s</td>
</tr><tr><td><b>URL:</b></td><td><h3 id="url0"><a id="StoryURL" href="%s">%s</a></h3></td>
</tr><tr><td><b>Summary:</b></td>
</tr><tr><td colspan="2">%s</td>
</tr><tr><td> </td>
<td> </td>
</tr><tr><td> </td>
<td> </td>
</tr></table></div>
<div class="full" id="pb_0"/></body></html>
'''
CONTAINER = '''<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
@ -30,42 +64,60 @@ CONTAINER = '''<?xml version="1.0"?>
</container>
'''
CONTENT_START = '''<?xml version="1.0"?>
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookID">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
unique-identifier="fanficdownloader-uuid">
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:opf="http://www.idpf.org/2007/opf"
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
<dc:title>%s</dc:title>
<dc:creator opf:role="aut">%s</dc:creator>
<dc:language>en-UK</dc:language>
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
<dc:language>%s</dc:language>
<dc:rights></dc:rights>
<dc:subject>fanfiction</dc:subject>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookID">%s</dc:identifier>
<dc:date opf:event="publication">%s</dc:date>
<dc:date opf:event="creation">%s</dc:date>
<dc:date opf:event="modification">%s</dc:date>
<meta name="calibre:timestamp" content="%s"/>
<dc:description>%s</dc:description>
'''
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
<dc:identifier id="BookId">%s</dc:identifier>
<dc:identifier opf:scheme="URL">%s</dc:identifier>
<dc:source>%s</dc:source>
<dc:type>FanFiction</dc:type>
<meta name="calibre:rating" content="%s"/>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="style" href="stylesheet.css" media-type="text/css" />
'''
CONTENT_ITEM = '''<item id="%s" href="%s" media-type="application/xhtml+xml" />
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
'''
CONTENT_END_MANIFEST = '''</manifest>
<spine toc="ncx">
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
'''
CONTENT_ITEMREF = '''<itemref idref="%s" />
CONTENT_END_MANIFEST = ''' </manifest>
<spine toc="ncx">
'''
CONTENT_END = '''</spine>
CONTENT_ITEMREF = ''' <itemref idref="%s" />
'''
CONTENT_END = ''' </spine>
</package>
'''
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="1"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
@ -502,3 +554,5 @@ FB2_DESCRIPTION = '''<description>
<version>2.0</version>
</document-info>
</description>'''
HTML_ESC_Definitions = 'HTML_Escape.def'

View file

@ -34,6 +34,7 @@ class FanficLoader:
self.inmemory = inmemory
self.compress = compress
self.badLogin = False
self.overWrite = True
def getAdapter():
return self.adapter
@ -48,7 +49,13 @@ class FanficLoader:
raise adapter.LoginRequiredException(self.adapter.url)
urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format
if not self.overWrite and os.path.isfile(s):
print >> sys.stderr, "File " + s + " already exists! Skipping!"
exit(10)
self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress)
i = 1
for u,n in urls:

293
ffnet.py
View file

@ -15,6 +15,8 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
@ -40,10 +42,37 @@ class FFNet(FanfictionSiteAdapter):
self.storyName = 'FF.Net story'
self.authorName = 'FF.Net author'
self.outputName = 'FF.Net_story'
self.storyDescription = 'Fanfiction Story'
self.storyCharacters = []
self.storySeries = ''
self.authorId = '0'
self.authorURL = self.path
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('FanFiction')
logging.debug('self.subjects=%s' % self.subjects)
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'FF.Net Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
logging.debug('self.path=%s' % self.path)
spl = self.path.split('/')
logging.debug('spl=%s' % spl)
if len(spl) == 5:
self.path = "/".join(spl[1:-1])
self.outputName = spl[4] + '-ffnet_' + spl[2]
if self.path.startswith('/'):
self.path = self.path[1:]
@ -51,10 +80,14 @@ class FFNet(FanfictionSiteAdapter):
if self.path.endswith('/'):
self.path = self.path[:-1]
logging.debug('self.path=%s' % self.path)
(s, self.storyId, chapter) = self.path.split('/')
logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter))
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
logging.debug('self.storyId=%s, chapter=%s, self.outputName=%s' % (self.storyId, chapter, self.outputName))
if not appEngine:
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
else:
@ -70,7 +103,70 @@ class FFNet(FanfictionSiteAdapter):
def performLogin(self, url = None):
return True
def _getVarValue(self, varstr):
#logging.debug('_getVarValue varstr=%s' % varstr)
vals = varstr.split('=')
#logging.debug('vals=%s' % vals)
retstr="".join(vals[+1:])
#logging.debug('retstr=%s' % retstr)
if retstr.startswith(' '):
retstr = retstr[1:]
if retstr.endswith(';'):
retstr = retstr[:-1]
return retstr
def _splitCrossover(self, subject):
if "Crossover" in subject:
self._addSubject ("Crossover")
logging.debug('Crossover=%s' % subject)
if subject.find(' and ') != -1:
words = subject.split(' ')
logging.debug('words=%s' % words)
subj = ''
for s in words:
if s in "and Crossover":
if len(subj) > 0:
self._addSubject(subj)
subj = ''
else:
if len(subj) > 0:
subj = subj + ' '
subj = subj + s
if len(subj) > 0:
self._addSubject(subj)
else:
self._addSubject(subject)
else:
self._addSubject(subject)
return True
def _splitGenre(self, subject):
if len(subject) > 0:
words = subject.split('/')
logging.debug('words=%s' % words)
for subj in words:
if len(subj) > 0:
self._addSubject(subj)
return True
def _addSubject(self, subject):
subj = subject.upper()
for s in self.subjects:
if s.upper() == subj:
return False
self.subjects.append(subject)
return True
def _addCharacter(self, character):
chara = character.upper()
for c in self.storyCharacters:
if c.upper() == chara:
return False
self.storyCharacters.append(character)
return True
def _fetchUrl(self, url):
if not appEngine:
return self.opener.open(url).read().decode('utf-8')
@ -85,6 +181,8 @@ class FFNet(FanfictionSiteAdapter):
for a in allA:
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
self.authorName = a.string
(u1, u2, self.authorId, u3) = a['href'].split('/')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
urls = []
lines = data.split('\n')
@ -92,9 +190,38 @@ class FFNet(FanfictionSiteAdapter):
if l.find("&#187;") != -1 and l.find('<b>') != -1:
s2 = bs.BeautifulStoneSoup(l)
self.storyName = str(s2.find('b').string)
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
elif l.find("<a href='/u/") != -1:
s2 = bs.BeautifulStoneSoup(l)
self.authorName = str(s2.a.string)
(u1, u2, self.authorId, u3) = s2.a['href'].split('/')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
elif l.find("Rated: <a href=") != -1:
s2 = bs.BeautifulStoneSoup(l)
self.storyRating = str(s2.a.string).strip()
logging.debug('self.storyRating=%s' % self.storyRating)
logging.debug('s2.a=%s' % s2.a)
s3 = l.split('-')
logging.debug('s3=%s' % s3)
if len(s3) > 0:
if s3[1].find("Reviews: <a href=") != -1:
continue
self.language = s3[1].strip()
logging.debug('self.language=%s' % self.language)
if len(s3) > 1:
if s3[2].find("Reviews: <a href=") != -1:
continue
self.genre = s3[2].strip()
if "&" in self.genre:
self.genre = ''
continue
logging.debug('self.genre=%s' % self.genre)
self._splitGenre(self.genre)
logging.debug('self.subjects=%s' % self.subjects)
if "Complete" in l:
self.storyStatus = 'Completed'
else:
self.storyStatus = 'In-Progress'
elif l.find("<SELECT title='chapter navigation'") != -1:
if len(urls) > 0:
continue
@ -102,6 +229,8 @@ class FFNet(FanfictionSiteAdapter):
u = l.decode('utf-8')
except UnicodeEncodeError, e:
u = l
except:
u = l.encode('ascii', 'xmlcharrefreplace')
u = re.sub('&\#[0-9]+;', ' ', u)
s2 = bs.BeautifulSoup(u)
options = s2.findAll('option')
@ -110,19 +239,69 @@ class FFNet(FanfictionSiteAdapter):
title = o.string
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
urls.append((url,title))
if len(urls) == 0:
elif l.find("var chapters") != -1:
self.numChapters = self._getVarValue (l)
logging.debug('self.numChapters=%s' % self.numChapters)
elif l.find("var words") != -1:
self.numWords = self._getVarValue (l)
logging.debug('self.numWords=%s' % self.numWords)
elif l.find("var categoryid") != -1:
categoryid = self._getVarValue (l)
logging.debug('categoryid=%s' % categoryid)
elif l.find("var cat_title") != -1:
self.category = self._getVarValue (l).strip("'")
logging.debug('self.category=%s' % self.category)
self._splitCrossover(self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif l.find("var summary") != -1:
self.storyDescription = self._getVarValue (l).strip("'")
if '&' in self.storyDescription:
s = self.storyDescription.split('&')
logging.debug('s=%s' % s)
self.storyDescription = ''
for ss in s:
if len(self.storyDescription) > 0:
if len(ss) > 4 and 'amp;' in ss[1:4]:
self.storyDescription = self.storyDescription + '&' + ss
else:
self.storyDescription = self.storyDescription + '&amp;' + ss
else:
self.storyDescription = ss
logging.debug('self.storyDescription=%s' % self.storyDescription)
elif l.find("var datep") != -1:
dateps = self._getVarValue (l)
self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
elif l.find("var dateu") != -1:
dateus = self._getVarValue (l)
self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
if len(urls) <= 0:
# no chapters found, try url by itself.
urls.append((self.url,self.storyName))
self.uuid = 'urn:uuid:' + self.host + '-a.' + self.authorId + '-s.' + self.storyId
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
logging.debug('self.uuid=%s' % self.uuid)
#logging.debug('urls=%s' % urls)
return urls
def getText(self, url):
time.sleep( 2.0 )
data = self._fetchUrl(url)
lines = data.split('\n')
textbuf = ''
emit = False
olddata = data
try:
data = data.decode('utf8')
except:
data = olddata
try:
soup = bs.BeautifulStoneSoup(data)
except:
@ -131,23 +310,121 @@ class FFNet(FanfictionSiteAdapter):
div = soup.find('div', {'id' : 'storytext'})
if None == div:
logging.error("Error downloading Chapter: %s" % url)
exit(1)
exit (20)
return '<html/>'
return div.__str__('utf8')
def setLogin(self, login):
self.login = login
def setPassword(self, password):
self.password = password
def getStoryName(self):
return self.storyName
def getStoryURL(self):
logging.debug('self.url=%s' % self.url)
return self.url
def getUUID(self):
logging.debug('self.uuid=%s' % self.uuid)
return self.uuid
def getOutputName(self):
logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName))
return self.outputName
def getAuthorName(self):
logging.debug('self.authorName=%s' % self.authorName)
return self.authorName
def getAuthorId(self):
logging.debug('self.authorId=%s' % self.authorId)
return self.authorId
def getAuthorURL(self):
logging.debug('self.authorURL=%s' % self.authorURL)
return self.authorURL
def getStoryId(self):
logging.debug('self.storyId=%s' % self.storyId)
return self.storyId
def getStoryName(self):
logging.debug('self.storyName=%s' % self.storyName)
return self.storyName
def getStoryDescription(self):
logging.debug('self.storyDescription=%s' % self.storyDescription)
return self.storyDescription
def getStoryPublished(self):
logging.debug('self.storyPublished=%s' % self.storyPublished)
return self.storyPublished
def getStoryCreated(self):
self.storyCreated = datetime.datetime.now()
logging.debug('self.storyCreated=%s' % self.storyCreated)
return self.storyCreated
def getStoryUpdated(self):
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
return self.storyUpdated
def getLanguage(self):
logging.debug('self.language=%s' % self.language)
return self.language
def getLanguageId(self):
logging.debug('self.languageId=%s' % self.languageId)
return self.languageId
def getSubjects(self):
logging.debug('self.subjects=%s' % self.authorName)
return self.subjects
def getPublisher(self):
logging.debug('self.publisher=%s' % self.publisher)
return self.publisher
def getNumChapters(self):
logging.debug('self.numChapters=%s' % self.numChapters)
return self.numChapters
def getNumWords(self):
logging.debug('self.numWords=%s' % self.numWords)
return self.numWords
def getCategory(self):
logging.debug('self.category=%s' % self.category)
return self.category
def getGenre(self):
logging.debug('self.genre=%s' % self.genre)
return self.genre
def getStoryStatus(self):
logging.debug('self.storyStatus=%s' % self.storyStatus)
return self.storyStatus
def getStoryRating(self):
logging.debug('self.storyRating=%s' % self.storyRating)
return self.storyRating
def getStoryUserRating(self):
logging.debug('self.storyUserRating=%s' % self.storyUserRating)
return self.storyUserRating
def getPrintableUrl(self, url):
pass
def getStoryCharacters(self):
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
return self.storyCharacters
def getStorySeries(self):
logging.debug('self.storySeries=%s' % self.storySeries)
return self.storySeries
class FFA_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)

View file

@ -12,13 +12,20 @@ import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time as time
import datetime
from adapter import *
class FictionAlley(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
self.host = up.urlparse(url).netloc
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
logging.debug('self.host=%s' % self.host)
logging.debug('self.path=%s' % self.path)
cookieproc = u2.HTTPCookieProcessor()
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
@ -35,6 +42,36 @@ class FictionAlley(FanfictionSiteAdapter):
rfc2109=False)
cookieproc.cookiejar.set_cookie(cookie)
self.opener = u2.build_opener(cookieproc)
ss = self.path.split('/')
self.storyDescription = 'Fanfiction Story'
self.authorId = ''
self.authorURL = ''
self.storyId = ''
if len(ss) > 2 and ss[1] == 'authors':
self.authorId = ss[2]
self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
if len(ss) > 3:
self.storyId = ss[3].replace ('.html','')
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
def requiresLogin(self, url = None):
return False
@ -48,31 +85,147 @@ class FictionAlley(FanfictionSiteAdapter):
def setPassword(self, password):
self.password = password
def _addSubject(self, subject):
subj = subject.upper()
for s in self.subjects:
if s.upper() == subj:
return False
self.subjects.append(subject)
return True
def _addCharacter(self, character):
chara = character.upper()
for c in self.storyCharacters:
if c.upper() == chara:
return False
self.storyCharacters.append(character)
return True
# Scan the <br> elements found under ``div`` for "Key: value" pairs and copy
# the recognised ones (rating, genre, main characters, summary) onto this
# adapter.  Nothing is returned; unrecognised keys are ignored.
# NOTE(review): this relies on the parser nesting the text that follows a <br>
# inside that element (``br.contents``) -- confirm against the soup class the
# caller uses to build ``div``.
def _processChapterHeaders(self, div):
brs = div.findAll ('br')
for br in brs:
keystr=''
valstr=''
# Only elements with more than two children are read as a pair:
# child 1 supplies the key and child 2 supplies the value.
if len(br.contents) > 2:
keystr = br.contents[1]
if keystr is not None:
# Remove markup from the key: split its string form on tags and
# concatenate the remaining text pieces.
strs = re.split ("<[^>]+>", str(keystr))
keystr=''
for s in strs:
keystr = keystr + s
# The value has surrounding spaces stripped.
# NOTE(review): presumably child 2 is a text node; verify, since .strip is called on it directly.
valstr = br.contents[2].strip(' ')
if keystr is not None:
# Keys are matched exactly, including the trailing colon.
if keystr == 'Rating:':
self.storyRating = valstr
logging.debug('self.storyRating=%s' % self.storyRating)
elif keystr == 'Genre:':
# The raw genre text is kept as-is, and each ', '-separated entry is
# also registered as a subject.
self.genre = valstr
logging.debug('self.genre=%s' % self.genre)
s2 = valstr.split(', ')
for ss2 in s2:
self._addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif keystr == 'Main Character(s):':
# Each ', '-separated entry is registered as a story character.
s2 = valstr.split(', ')
for ss2 in s2:
self._addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif keystr == 'Summary:':
self.storyDescription = valstr
logging.debug('self.storyDescription=%s' % self.storyDescription)
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
# There is some useful information in the headers of the first chapter page.
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
soup = bs.BeautifulStoneSoup(data)
# Get title from <title>, remove before '-'.
title = soup.find('title').string
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId
links = soup.findAll('a', { 'class' : 'chapterlink' } )
links = soup.findAll('li')
# If it is decided that we really do care about number of words.. It's only available on the author's page..
#d0 = self.opener.open(self.authorURL).read()
#soupA = bs.BeautifulStoneSoup(d0)
#dls = soupA.findAll('dl')
#logging.debug('dls=%s' % dls)
self.numChapters = 0;
result = []
if len(links) == 0:
# Be aware that this means that the user has entered the {STORY}01.html URL.
# We will not have valid Published and Updated dates. User should enter
# the {STORY}.html instead. We should force that instead of this.
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
result.append((self.url,self.storyName))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
self.numChapters = self.numChapters + 1;
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
else:
author = soup.find('h1', {'class' : 'title'})
self.authorName = author.a.string
for a in links:
url = a['href']
title = a.string
result.append((url,title))
summary = soup.find('div', {'class' : 'summary'})
ss = summary.contents
if len(ss) > 1:
ss1 = ss[0].split(': ')
if len(ss1) > 1 and ss1[0] == 'Rating':
self.storyRating = ss1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
self.storyDescription = str(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
logging.debug('self.storyDescription=%s' % self.storyDescription)
for li in links:
a = li.find('a', {'class' : 'chapterlink'})
s = li.contents
if a is not None:
url = a['href']
title = a.string
result.append((url,title))
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
if self.numChapters == 0:
# fictionalley uses full URLs in chapter list.
d1 = self.opener.open(url).read()
# find <!-- headerstart --> & <!-- headerend --> and
# replaced with matching div pair for easier parsing.
# Yes, it's an evil kludge, but what can ya do? Using
# something other than div prevents soup from pairing
# our div with poor html inside the story text.
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
sop = bs.BeautifulStoneSoup(d1)
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
if div is not None:
self._processChapterHeaders(div)
self.numChapters = self.numChapters + 1
if len(s) > 1:
datestr=''
ss2 = s[1].replace('\n','').replace('(','').split(' ')
if len(ss2) > 2 and ss2[0] == 'Posted:':
datestr = ss2[1] + ' ' + ss2[2]
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
if self.numChapters == 1:
self.storyPublished = tmpdate
self.storyUpdated = tmpdate
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
else:
logging.debug('li chapterlink not found! li=%s' % li)
#print('Story "%s" by %s' % (self.storyName, self.authorName))
print('Story "%s" by %s' % (self.storyName, self.authorName))
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
return result
@ -82,6 +235,9 @@ class FictionAlley(FanfictionSiteAdapter):
def getAuthorName(self):
return self.authorName
def getOutputName(self):
return self.outputName
def getText(self, url):
# fictionalley uses full URLs in chapter list.
data = self.opener.open(url).read()
@ -97,10 +253,96 @@ class FictionAlley(FanfictionSiteAdapter):
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
if None == div:
logging.error("Error downloading Chapter: %s" % url)
exit(1)
exit(20)
return '<html/>'
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
html = soup.findAll('html')
if len(html) > 1:
return html[1].__str__('utf8')
else:
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
def getStoryURL(self):
logging.debug('self.url=%s' % self.url)
return self.url
def getAuthorURL(self):
logging.debug('self.authorURL=%s' % self.authorURL)
return self.authorURL
def getUUID(self):
logging.debug('self.uuid=%s' % self.uuid)
return self.uuid
def getAuthorId(self):
logging.debug('self.authorId=%s' % self.authorId)
return self.authorId
def getStoryId(self):
logging.debug('self.storyId=%s' % self.storyId)
return self.storyId
def getStoryDescription(self):
logging.debug('self.storyDescription=%s' % self.storyDescription)
return self.storyDescription
def getStoryPublished(self):
logging.debug('self.storyPublished=%s' % self.storyPublished)
return self.storyPublished
def getStoryCreated(self):
self.storyCreated = datetime.datetime.now()
logging.debug('self.storyCreated=%s' % self.storyCreated)
return self.storyCreated
def getStoryUpdated(self):
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
return self.storyUpdated
def getLanguage(self):
logging.debug('self.language=%s' % self.language)
return self.language
def getLanguageId(self):
logging.debug('self.languageId=%s' % self.languageId)
return self.languageId
def getSubjects(self):
logging.debug('self.subjects=%s' % self.authorName)
return self.subjects
def getPublisher(self):
logging.debug('self.publisher=%s' % self.publisher)
return self.publisher
def getNumChapters(self):
logging.debug('self.numChapters=%s' % self.numChapters)
return self.numChapters
def getNumWords(self):
logging.debug('self.numWords=%s' % self.numWords)
return self.numWords
def getCategory(self):
logging.debug('self.category=%s' % self.category)
return self.category
def getGenre(self):
logging.debug('self.genre=%s' % self.genre)
return self.genre
def getStoryStatus(self):
logging.debug('self.storyStatus=%s' % self.storyStatus)
return self.storyStatus
def getStoryRating(self):
logging.debug('self.storyRating=%s' % self.storyRating)
return self.storyRating
def getStoryUserRating(self):
logging.debug('self.storyUserRating=%s' % self.storyUserRating)
return self.storyUserRating
def getPrintableUrl(self, url):
return url
@ -114,6 +356,15 @@ class FictionAlley(FanfictionSiteAdapter):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def getStoryCharacters(self):
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
return self.storyCharacters
def getStorySeries(self):
logging.debug('self.storySeries=%s' % self.storySeries)
return self.storySeries
if __name__ == '__main__':

288
ficwad.py
View file

@ -12,6 +12,8 @@ import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
import time
import datetime
from adapter import *
@ -32,7 +34,44 @@ class FicWad(FanfictionSiteAdapter):
def setPassword(self, password):
self.password = password
def _addSubject(self, subject):
subj = subject.upper()
for s in self.subjects:
if s.upper() == subj:
return False
self.subjects.append(subject)
return True
def _addCharacter(self, character):
chara = character.upper()
for c in self.storyCharacters:
if c.upper() == chara:
return False
self.storyCharacters.append(character)
return True
def extractIndividualUrls(self):
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
data = u2.urlopen(self.url).read()
soup = bs.BeautifulStoneSoup(data)
@ -40,50 +79,254 @@ class FicWad(FanfictionSiteAdapter):
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# last of crumbtrail
self.storyName = allAhrefs[-1].string.strip()
storyinfo = allAhrefs[-1]
(u0, u1, storyid) = storyinfo['href'].split('/')
if u1 == "story":
# This page does not have the correct information on it.. Need to get the Story Title Page
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
data = u2.urlopen(self.url).read()
soup = bs.BeautifulStoneSoup(data)
story = soup.find('div', {'id' : 'story'})
crumbtrail = story.find('h3') # the only h3 ficwad uses.
allAhrefs = crumbtrail.findAll('a')
# save chapter name from header in case of one-shot.
chaptername = story.find('h4').find('a').string.strip()
storyinfo = story.find('h4').find('a')
(u0, u1, self.storyId) = storyinfo['href'].split('/')
self.storyName = storyinfo.string.strip()
self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))
author = soup.find('span', {'class' : 'author'})
self.authorName = str(author.a.string)
(u0, u1,self.authorId) = author.a['href'].split('/')
self.authorURL = 'http://' + self.host + author.a['href']
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
select = soup.find('select', { 'name' : 'goto' } )
description = soup.find('blockquote', {'class' : 'summary'})
if description is not None:
self.storyDescription = str(description.p.string)
logging.debug('self.storyDescription=%s' % self.storyDescription)
meta = soup.find('p', {'class' : 'meta'})
if meta is not None:
s = str(meta).replace('\n',' ').replace('\t','').split(' - ')
logging.debug('meta.s=%s' % s)
for ss in s:
s1 = ss.replace('&nbsp;','').split(':')
#logging.debug('meta.s.s1=%s' % s1)
if len(s1) > 1:
s2 = re.split ('<[^>]+>', s1[0])
#logging.debug('meta.s.s1.s2=%s' % s2)
if len(s2) > 1:
s1[0] = s2[1]
skey = s1[0].strip()
#logging.debug('Checking = %s' % skey)
if skey == 'Category':
soup1 = bs.BeautifulStoneSoup(s1[1])
allAs = soup1.findAll('a')
for a in allAs:
if self.category == 'Category':
self.category = str(a.string)
logging.debug('self.category=%s' % self.category)
self._addSubject(self.category)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Rating':
self.storyRating = s1[1]
logging.debug('self.storyRating=%s' % self.storyRating)
elif skey == 'Genres':
self.genre = s1[1]
logging.debug('self.genre=%s' % self.genre)
s2 = s1[1].split(', ')
for ss2 in s2:
self._addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
elif skey == 'Characters':
s2 = s1[1].split(', ')
for ss2 in s2:
self._addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif skey == 'Chapters':
self.numChapters = s1[1]
logging.debug('self.numChapters=%s' % self.numChapters)
elif skey == 'Warnings':
logging.debug('Warnings=%s' % s1[1])
elif skey == 'Published':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif skey == 'Updated':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
s3 = re.split ('<[^>]+>', s1[0])
#logging.debug('meta.s.s1.s3=%s' % s3)
if len(s3) > 1:
s1[0] = s3[0]
s4 = s1[0].split('w')
#logging.debug('meta.s.s1.s4=%s' % s4)
if len(s4) > 1 and s4[1] == 'ords':
self.numWords = s4[0]
logging.debug('self.numWords=%s' % self.numWords)
print('Story "%s" by %s' % (self.storyName, self.authorName))
result = []
if select is None:
# Single chapter storys don't have title in crumbtrail, just 'chapter' title in h4.
self.storyName = chaptername
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = 'http://' + self.host + o['value']
title = o.string
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
result.append((url,title))
ii = 1
storylist = soup.find('ul', {'id' : 'storylist'})
if storylist is not None:
allH4s = storylist.findAll('h4')
#logging.debug('allH4s=%s' % allH4s)
if allH4s is not None:
for h4 in allH4s:
chapterinfo = h4.find('a')
#logging.debug('Chapter1=%s' % chapterinfo)
url = 'http://' + self.host + chapterinfo['href']
title = chapterinfo.string.strip()
#logging.debug('Chapter=%s, %s' % (url, title))
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
if ii == 1:
select = soup.find('select', { 'name' : 'goto' } )
if select is None:
result.append((self.url,self.storyName))
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = 'http://' + self.host + o['value']
title = o.string
# ficwad includes 'Story Index' in the dropdown of chapters,
# but it's not a real chapter.
if title != "Story Index":
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
result.append((url,title))
ii = ii+1
else:
logging.debug('Skipping Story Index. URL %s' % url)
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
return result
def getStoryName(self):
    """Return the story title held in self.storyName."""
    story_name = self.storyName
    return story_name
def getOutputName(self):
    """Return the output file base name held in self.outputName."""
    output_name = self.outputName
    return output_name
def getAuthorName(self):
    """Return the author name held in self.authorName."""
    author_name = self.authorName
    return author_name
def getText(self, url):
    """Download a chapter page and return its story text element as UTF-8.

    A url that does not contain 'http://' is treated as relative and is
    prefixed with 'http://' + self.host + '/'.  The page is fetched with
    u2.urlopen, parsed with BeautifulStoneSoup, and the <div id="storytext">
    element is returned via its __str__('utf8').

    When the div is missing, an error is logged and the process exits.
    """
    if url.find('http://') == -1:
        url = 'http://' + self.host + '/' + url
    data = u2.urlopen(url).read()
    soup = bs.BeautifulStoneSoup(data)
    div = soup.find('div', {'id' : 'storytext'})
    if div is None:
        logging.error("Error downloading Chapter: %s" % url)
        # NOTE(review): the original listed exit(20) and return '<html/>'
        # after this call; both were unreachable and have been dropped.
        # Confirm which exit status is actually intended.
        exit(1)
    return div.__str__('utf8')
def getStoryURL(self):
    """Return the story URL (self.url), logging it at debug level first."""
    story_url = self.url
    logging.debug('self.url=%s' % story_url)
    return story_url
def getAuthorURL(self):
    """Return the author page URL (self.authorURL), logging it at debug level first."""
    author_url = self.authorURL
    logging.debug('self.authorURL=%s' % author_url)
    return author_url
def getUUID(self):
    """Return the identifier stored in self.uuid, logging it at debug level first."""
    identifier = self.uuid
    logging.debug('self.uuid=%s' % identifier)
    return identifier
def getAuthorId(self):
    """Return self.authorId, logging it at debug level first."""
    author_id = self.authorId
    logging.debug('self.authorId=%s' % author_id)
    return author_id
def getStoryId(self):
    """Return self.storyId, logging it at debug level first."""
    story_id = self.storyId
    logging.debug('self.storyId=%s' % story_id)
    return story_id
def getStoryDescription(self):
    """Return self.storyDescription, logging it at debug level first."""
    description = self.storyDescription
    logging.debug('self.storyDescription=%s' % description)
    return description
def getStoryPublished(self):
    """Return self.storyPublished, logging it at debug level first."""
    published = self.storyPublished
    logging.debug('self.storyPublished=%s' % published)
    return published
def getStoryCreated(self):
    """Reset self.storyCreated to the current local time and return it.

    Each call overwrites the stored value with datetime.datetime.now(),
    logs it at debug level, and returns that same datetime object.
    """
    now = datetime.datetime.now()
    self.storyCreated = now
    logging.debug('self.storyCreated=%s' % now)
    return now
def getStoryUpdated(self):
    """Return self.storyUpdated, logging it at debug level first."""
    updated = self.storyUpdated
    logging.debug('self.storyUpdated=%s' % updated)
    return updated
def getLanguage(self):
    """Return self.language, logging it at debug level first."""
    language = self.language
    logging.debug('self.language=%s' % language)
    return language
def getLanguageId(self):
    """Return self.languageId, logging it at debug level first."""
    language_id = self.languageId
    logging.debug('self.languageId=%s' % language_id)
    return language_id
def getSubjects(self):
    """Return self.subjects, logging it at debug level first.

    The debug line formats the subjects themselves; it previously formatted
    self.authorName under the 'self.subjects=' label, which mislabelled the
    log and raised AttributeError when authorName had not been set yet.
    """
    subjects = self.subjects
    # Wrap in a 1-tuple so any container type formats as a single %s value.
    logging.debug('self.subjects=%s' % (subjects,))
    return subjects
def getPublisher(self):
    """Return self.publisher, logging it at debug level first."""
    publisher = self.publisher
    logging.debug('self.publisher=%s' % publisher)
    return publisher
def getNumChapters(self):
    """Return self.numChapters, logging it at debug level first."""
    chapter_count = self.numChapters
    logging.debug('self.numChapters=%s' % chapter_count)
    return chapter_count
def getNumWords(self):
    """Return self.numWords, logging it at debug level first."""
    word_count = self.numWords
    logging.debug('self.numWords=%s' % word_count)
    return word_count
def getCategory(self):
    """Return self.category, logging it at debug level first."""
    category = self.category
    logging.debug('self.category=%s' % category)
    return category
def getGenre(self):
    """Return self.genre, logging it at debug level first."""
    genre = self.genre
    logging.debug('self.genre=%s' % genre)
    return genre
def getStoryStatus(self):
    """Return self.storyStatus, logging it at debug level first."""
    status = self.storyStatus
    logging.debug('self.storyStatus=%s' % status)
    return status
def getStoryRating(self):
    """Return self.storyRating, logging it at debug level first."""
    rating = self.storyRating
    logging.debug('self.storyRating=%s' % rating)
    return rating
def getStoryUserRating(self):
    """Return self.storyUserRating, logging it at debug level first."""
    user_rating = self.storyUserRating
    logging.debug('self.storyUserRating=%s' % user_rating)
    return user_rating
def getPrintableUrl(self, url):
    """Return the printable form of url; this adapter hands it back unchanged."""
    printable = url
    return printable
@ -98,6 +341,15 @@ class FicWad(FanfictionSiteAdapter):
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def getStoryCharacters(self):
    """Return self.storyCharacters, logging it at debug level first."""
    characters = self.storyCharacters
    logging.debug('self.storyCharacters=%s' % characters)
    return characters
def getStorySeries(self):
    """Return self.storySeries, logging it at debug level first."""
    series = self.storySeries
    logging.debug('self.storySeries=%s' % series)
    return series
if __name__ == '__main__':
url = 'http://www.ficwad.com/story/14536'

View file

@ -15,6 +15,8 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from constants import *
from adapter import *
@ -32,8 +34,37 @@ class HPFiction(FanfictionSiteAdapter):
self.host = parsedUrl.netloc
self.path = parsedUrl.path
logging.debug('self.url=%s' % self.url)
logging.debug('self.host=%s' % self.host)
logging.debug('self.path=%s' % self.path)
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
(u1, self.storyId) = self.url.split('=')
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Harry Potter')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'K'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
logging.debug("Created HPFiction: url=%s" % (self.url))
def _getLoginScript(self):
@ -45,23 +76,116 @@ class HPFiction(FanfictionSiteAdapter):
def performLogin(self, url = None):
    """Report a successful login; this adapter performs no request and ignores url."""
    logged_in = True
    return logged_in
def _addSubject(self, subject):
    """Append subject to self.subjects unless it is already present.

    The comparison is case-insensitive (both sides are upper-cased).
    Returns True when the subject was appended, False for a duplicate.
    """
    wanted = subject.upper()
    if any(existing.upper() == wanted for existing in self.subjects):
        return False
    self.subjects.append(subject)
    return True
def _addCharacter(self, character):
    """Append character to self.storyCharacters unless it is already present.

    The comparison is case-insensitive (both sides are upper-cased).
    Returns True when the character was appended, False for a duplicate.
    """
    wanted = character.upper()
    if any(existing.upper() == wanted for existing in self.storyCharacters):
        return False
    self.storyCharacters.append(character)
    return True
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
soup = bs.BeautifulSoup(data)
links = soup.findAll('a')
def_chapurl = ''
def_chaptitle = ''
for a in links:
if a['href'].find('psid') != -1:
self.storyName = a.string
logging.debug('self.storyName=%s' % self.storyName)
elif a['href'].find('viewuser.php') != -1:
self.authorName = a.string
self.authorURL = 'http://' + self.host + '/' + a['href']
(u1, self.authorId) = a['href'].split('=')
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
def_chapurl = 'http://' + self.host + '/viewstory.php' + str(a['href'])
def_chaptitle = a.string
logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
centers = soup.findAll('center')
for center in centers:
tds = center.findAll ('td')
if tds is not None and len(tds) > 0:
for td in tds:
s = re.split ("<[^>]+>", str(td).replace('\n','').replace('&nbsp;',' '))
logging.debug('s=%s' % s)
ii = 0
ll = len(s)
sss = ''
while ii < ll - 1:
if s[ii] is not None and len(s[ii]) > 0:
if s[ii] == 'Rating:':
self.storyRating = s[ii+1]
logging.debug('self.storyRating=%s' % self.storyRating)
ii = ii + 2
elif s[ii] == 'Chapters:':
self.numChapters = s[ii+1]
logging.debug('self.numChapters=%s' % self.numChapters)
ii = ii + 2
elif s[ii] == 'Characters:':
s2 = s[ii+1].split(', ')
for ss2 in s2:
self._addCharacter(ss2)
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
ii = ii + 2
elif s[ii] == 'Genre(s):':
self.genre = s[ii+1]
logging.debug('self.genre=%s' % self.genre)
s2 = s[ii+1].split(', ')
for ss2 in s2:
self._addSubject(ss2)
logging.debug('self.subjects=%s' % self.subjects)
ii = ii + 2
elif s[ii] == 'Status:':
if s[ii+1].strip(' ') == "Work In Progress":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii + 2
elif s[ii] == 'First Published:':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyPublished=%s' % self.storyPublished)
ii = ii + 2
elif s[ii] == 'Last Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
ii = ii + 2
elif s[ii] == 'Last Published Chapter:':
ii = ii + 2
elif s[ii] == 'Pairings:':
ii = ii + 2
elif s[ii] == 'Warnings:':
ii = ii + 2
else:
sss = sss + ' ' + s[ii]
ii = ii + 1
else:
ii = ii + 1
self.storyDescription = sss
logging.debug('self.storyDescription=%s' % self.storyDescription)
urls = []
self.outputName = self.storyName.replace(" ", "_") + '-hp_' + self.storyId
select = soup.find('select', {'name' : 'chapterid'})
if select is None:
# no chapters found, try url by itself.
urls.append((self.url,self.storyName))
if len(def_chapurl) > 0:
urls.append((def_chapurl, def_chaptitle))
else:
urls.append((self.url,self.storyName))
else:
for o in select.findAll('option'):
if 'value' in o._getAttrMap():
@ -69,11 +193,18 @@ class HPFiction(FanfictionSiteAdapter):
title = o.string
if title != "Story Index":
urls.append((url,title))
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
return urls
def getStoryName(self):
    """Return the story title held in self.storyName."""
    title = self.storyName
    return title
def getOutputName(self):
    """Return the output file base name held in self.outputName."""
    base_name = self.outputName
    return base_name
def getAuthorName(self):
    """Return the author name held in self.authorName."""
    author = self.authorName
    return author
@ -84,9 +215,100 @@ class HPFiction(FanfictionSiteAdapter):
divtext = soup.find('div', {'id' : 'fluidtext'})
if None == divtext:
logging.error("Error downloading Chapter: %s" % url)
exit(1)
exit(20)
return divtext.__str__('utf8')
def getAuthorId(self):
    """Return self.authorId, logging it at debug level first."""
    author_id = self.authorId
    logging.debug('self.authorId=%s' % author_id)
    return author_id
def getStoryId(self):
    """Return self.storyId, logging it at debug level first."""
    story_id = self.storyId
    logging.debug('self.storyId=%s' % story_id)
    return story_id
def getStoryDescription(self):
    """Return self.storyDescription, logging it at debug level first."""
    description = self.storyDescription
    logging.debug('self.storyDescription=%s' % description)
    return description
def getStoryPublished(self):
    """Return self.storyPublished, logging it at debug level first."""
    published = self.storyPublished
    logging.debug('self.storyPublished=%s' % published)
    return published
def getStoryCreated(self):
    """Reset self.storyCreated to the current local time and return it.

    Each call overwrites the stored value with datetime.datetime.now(),
    logs it at debug level, and returns that same datetime object.
    """
    now = datetime.datetime.now()
    self.storyCreated = now
    logging.debug('self.storyCreated=%s' % now)
    return now
def getStoryUpdated(self):
    """Return self.storyUpdated, logging it at debug level first."""
    updated = self.storyUpdated
    logging.debug('self.storyUpdated=%s' % updated)
    return updated
def getLanguage(self):
    """Return self.language, logging it at debug level first."""
    language = self.language
    logging.debug('self.language=%s' % language)
    return language
def getLanguageId(self):
    """Return self.languageId, logging it at debug level first."""
    language_id = self.languageId
    logging.debug('self.languageId=%s' % language_id)
    return language_id
def getSubjects(self):
    """Return self.subjects, logging it at debug level first.

    The debug line formats the subjects themselves; it previously formatted
    self.authorName under the 'self.subjects=' label, which mislabelled the
    log and raised AttributeError when authorName had not been set yet.
    """
    subjects = self.subjects
    # Wrap in a 1-tuple so any container type formats as a single %s value.
    logging.debug('self.subjects=%s' % (subjects,))
    return subjects
def getPublisher(self):
    """Return self.publisher, logging it at debug level first."""
    publisher = self.publisher
    logging.debug('self.publisher=%s' % publisher)
    return publisher
def getNumChapters(self):
    """Return self.numChapters, logging it at debug level first."""
    chapter_count = self.numChapters
    logging.debug('self.numChapters=%s' % chapter_count)
    return chapter_count
def getNumWords(self):
    """Return self.numWords, logging it at debug level first."""
    word_count = self.numWords
    logging.debug('self.numWords=%s' % word_count)
    return word_count
def getStoryURL(self):
    """Return the story URL (self.url), logging it at debug level first."""
    story_url = self.url
    logging.debug('self.url=%s' % story_url)
    return story_url
def getAuthorURL(self):
    """Return the author page URL (self.authorURL), logging it at debug level first."""
    author_url = self.authorURL
    logging.debug('self.authorURL=%s' % author_url)
    return author_url
def getUUID(self):
    """Return the identifier stored in self.uuid, logging it at debug level first."""
    identifier = self.uuid
    logging.debug('self.uuid=%s' % identifier)
    return identifier
def getCategory(self):
    """Return self.category, logging it at debug level first."""
    category = self.category
    logging.debug('self.category=%s' % category)
    return category
def getGenre(self):
    """Return self.genre, logging it at debug level first."""
    genre = self.genre
    logging.debug('self.genre=%s' % genre)
    return genre
def getStoryStatus(self):
    """Return self.storyStatus, logging it at debug level first."""
    status = self.storyStatus
    logging.debug('self.storyStatus=%s' % status)
    return status
def getStoryRating(self):
    """Return self.storyRating, logging it at debug level first."""
    rating = self.storyRating
    logging.debug('self.storyRating=%s' % rating)
    return rating
def getStoryUserRating(self):
    """Return self.storyUserRating, logging it at debug level first."""
    user_rating = self.storyUserRating
    logging.debug('self.storyUserRating=%s' % user_rating)
    return user_rating
def getStoryCharacters(self):
    """Return self.storyCharacters, logging it at debug level first."""
    characters = self.storyCharacters
    logging.debug('self.storyCharacters=%s' % characters)
    return characters
def getStorySeries(self):
    """Return self.storySeries, logging it at debug level first."""
    series = self.storySeries
    logging.debug('self.storySeries=%s' % series)
    return series
class FF_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)

View file

@ -26,6 +26,7 @@ from constants import *
import html2text
import datetime
class FanficWriter:
@ -41,8 +42,8 @@ class FanficWriter:
class TextWriter(FanficWriter):
htmlWriter = None
def __init__(self, base, name, author, inmemory=False, compress=False):
self.htmlWriter = HTMLWriter(base, name, author, True, False)
def __init__(self, base, adapter, inmemory=False, compress=False):
self.htmlWriter = HTMLWriter(base, adapter, True, False)
def writeChapter(self, index, title, text):
self.htmlWriter.writeChapter(index, title, text)
@ -57,12 +58,13 @@ class TextWriter(FanficWriter):
class HTMLWriter(FanficWriter):
body = ''
def __init__(self, base, name, author, inmemory=False, compress=False):
def __init__(self, base, adapter, inmemory=False, compress=False):
self.basePath = base
self.storyTitle = removeEntities(name)
self.name = makeAcceptableFilename(name)
self.fileName = self.basePath + '/' + self.name + '.html'
self.authorName = removeEntities(author)
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.fileName = self.basePath + '/' + self.name + '.html'
self.authorName = removeEntities(adapter.getAuthorName())
self.adapter = adapter
self.inmemory = inmemory
@ -131,14 +133,14 @@ class EPubFanficWriter(FanficWriter):
for f in self.files:
self.files[f].close()
def __init__(self, base, name, author, inmemory=False, compress=True):
def __init__(self, base, adapter, inmemory=False, compress=True):
self.basePath = base
self.storyTitle = removeEntities(name)
self.name = makeAcceptableFilename(name)
self.storyTitle = removeEntities(adapter.getStoryName())
self.name = makeAcceptableFilename(adapter.getOutputName())
self.directory = self.basePath + '/' + self.name
self.authorName = removeEntities(author)
self.authorName = removeEntities(adapter.getAuthorName())
self.inmemory = inmemory
self.adapter = adapter
self.files = {}
self.chapters = []
@ -226,17 +228,50 @@ class EPubFanficWriter(FanficWriter):
tocFilePath = "OEBPS/toc.ncx"
# toc = open(tocFilePath, 'w')
# print >> toc, TOC_START % self.storyTitle
self._writeFile(tocFilePath, TOC_START % self.storyTitle)
self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
### writing content -- title page
titleFilePath = "OEBPS/title_page.xhtml"
self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryDescription()))
### writing content -- opf file
opfFilePath = "OEBPS/content.opf"
# opf = open(opfFilePath, 'w')
self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, self.adapter.getStoryDescription()))
i = 0
subjs = []
subjs = self.adapter.getSubjects()
for subj in subjs:
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
i = i + 1
if (i <= 0):
self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
ids = []
i = 1
i = 0
t = "Title Page"
f = "title_page.xhtml"
chapterId = "Title Page"
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
ids.append(chapterId)
i = i + 1
for t,f in self.chapters:
chapterId = "chapter%04d" % i

View file

@ -11,119 +11,360 @@ import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
import twipassword
class Twilighted(FanfictionSiteAdapter):
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.password=twipassword.password
self.login='sigizmund'
logging.debug("Created Twilighted: url=%s" % (self.url))
def requiresLogin(self, url = None):
# potionsandsnitches.net doesn't require login.
if self.host == 'potionsandsnitches.net':
return False
else:
return True
def performLogin(self, url = None):
data = {}
data['penname'] = self.login
data['password'] = self.password
data['cookiecheck'] = '1'
data['submit'] = 'Submit'
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
d = req.read().decode('utf-8')
if self.reqLoginData(d) :
return False
else:
return True
def setLogin(self, login):
self.login = login
def setPassword(self, password):
self.password = password
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.password=twipassword.password
self.login='sigizmund'
self.storyDescription = 'Fanfiction Story'
self.authorId = '0'
self.authorURL = ''
self.storyId = '0'
self.storyPublished = datetime.date(1970, 01, 31)
self.storyCreated = datetime.datetime.now()
self.storyUpdated = datetime.date(1970, 01, 31)
self.languageId = 'en-UK'
self.language = 'English'
self.subjects = []
self.subjects.append ('fanfiction')
self.subjects.append ('Twilight')
self.publisher = self.host
self.numChapters = 0
self.numWords = 0
self.genre = 'FanFiction'
self.category = 'Category'
self.storyStatus = 'In-Progress'
self.storyRating = 'PG'
self.storyUserRating = '0'
self.storyCharacters = []
self.storySeries = ''
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
if self.reqLoginData(data):
self.performLogin()
data = self.opener.open(self.url).read()
if self.reqLoginData(data):
return None
logging.debug("Created Twilighted: url=%s" % (self.url))
def requiresLogin(self, url = None):
    """Tell whether the site at self.host needs a login; url is not consulted.

    potionsandsnitches.net doesn't require login; every other host does.
    """
    return self.host != 'potionsandsnitches.net'
def performLogin(self, url = None):
data = {}
soup = bs.BeautifulStoneSoup(data)
title = soup.find('title').string
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
return result
def getStoryName(self):
return self.storyName
def getAuthorName(self):
return self.authorName
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
logging.debug('Getting data from: %s' % url)
data = self.opener.open(url).read()
data['penname'] = self.login
data['password'] = self.password
data['cookiecheck'] = '1'
data['submit'] = 'Submit'
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
d = req.read().decode('utf-8')
if self.reqLoginData(d) :
return False
else:
return True
div = soup.find('div', {'id' : 'story'})
if None == div:
return '<html/>'
def setLogin(self, login):
    """Store login in self.login, replacing any previous value."""
    setattr(self, 'login', login)
return div.__str__('utf8')
def setPassword(self, password):
    """Store password in self.password, replacing any previous value."""
    setattr(self, 'password', password)
def _getLoginScript(self):
return '/user.php?action=login'
def _addSubject(self, subject):
    """Append subject to self.subjects unless it is already present.

    The comparison is case-insensitive (both sides are upper-cased).
    Returns True when the subject was appended, False for a duplicate.
    """
    wanted = subject.upper()
    if any(existing.upper() == wanted for existing in self.subjects):
        return False
    self.subjects.append(subject)
    return True
def reqLoginData(self, data):
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
return True
else:
return False
def _addCharacter(self, character):
    """Append character to self.storyCharacters unless it is already present.

    The comparison is case-insensitive (both sides are upper-cased).
    Returns True when the character was appended, False for a duplicate.
    """
    wanted = character.upper()
    if any(existing.upper() == wanted for existing in self.storyCharacters):
        return False
    self.storyCharacters.append(character)
    return True
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
if self.reqLoginData(data):
self.performLogin()
data = self.opener.open(self.url).read()
if self.reqLoginData(data):
return None
soup = bs.BeautifulStoneSoup(data)
title = soup.find('title').string
self.storyName = title.split(' by ')[0].strip()
self.authorName = title.split(' by ')[1].strip()
self.outputName = self.storyName.replace(" ", "_")
select = soup.find('select', { 'name' : 'chapter' } )
result = []
if select is None:
# no chapters found, try url by itself.
result.append((self.url,self.storyName))
else:
allOptions = select.findAll('option')
for o in allOptions:
url = self.url + "&chapter=%s" % o['value']
title = o.string
result.append((url,title))
url = self.url + "&index=1"
data = self.opener.open(url).read()
lines = data.split('\n')
soup = bs.BeautifulStoneSoup(data)
metas = soup.findAll('meta')
for meta in metas:
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
#logging.debug('Meta: %s' % meta)
if 'content' in meta._getAttrMap():
s1 = bs.BeautifulStoneSoup(meta['content'])
ps = s1.findAll('p')
if len(ps) > 0:
self.storyDescription = ps[0]
logging.debug('self.storyDescription=%s' % (self.storyDescription))
else:
divs = meta.findAll('div')
#logging.debug('Divs: %s' % divs)
for div in divs:
#logging.debug('Div: %s' % div)
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
#logging.debug('Div PAGETITLE: %s' % div)
allA = div.findAll('a')
for a in allA:
if 'href' in a._getAttrMap():
if a['href'].find('viewstory.php?sid=') != -1:
str1 = a.string
(vs, self.storyId) = a['href'].split('=')
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
self.outputName = self.outputName + "-tw_" + self.storyId
logging.debug('self.outputName=%s' % self.outputName)
if a['href'].find('viewuser.php?uid=') != -1:
str1 = a.string
(vs, self.authorId) = a['href'].split('=')
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
logging.debug('self.authorURL=%s' % self.authorURL)
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
#logging.debug('Div CONTENT: %s' % div)
brs = div.findAll('br')
for br in brs:
buf = unicode(br).encode('utf-8')
strs = re.split ('<[^>]+>', buf)
#logging.debug('BUF: %s' % strs)
ii = 2
stlen = len(strs)
while stlen > ii+1:
if len(strs[ii]) == 0:
ii = ii+1
continue
if strs[ii] == 'Categories:':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
if self.category == 'Category':
self.category = strs[ii].strip(' ')
self._addSubject(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.subjects=%s' % self.subjects)
if strs[ii] == 'Characters: ':
ii = ii+1
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
if strs[ii] != ' ' and strs[ii] != ', ':
self._addCharacter(strs[ii].strip(' '))
ii = ii+1
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
elif strs[ii] == 'Completed:':
if strs[ii+1].strip(' ') == "No":
self.storyStatus = 'In-Progress'
else:
self.storyStatus = 'Completed'
ii = ii+2
logging.debug('self.storyStatus=%s' % self.storyStatus)
elif strs[ii] == 'Rated:':
self.storyRating = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.storyRating=%s' % self.storyRating)
elif strs[ii] == 'Series:':
self.storySeries = strs[ii+1].strip(' ')
if self.storySeries == 'None':
self.storySeries = ''
ii = ii+2
logging.debug('self.storySeries=%s' % self.storySeries)
elif strs[ii] == 'Chapters: ':
self.numChapters = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numChapters=%s' % self.numChapters)
elif strs[ii] == 'Word count:':
self.numWords = strs[ii+1].strip(' ')
ii = ii+2
logging.debug('self.numWords=%s' % self.numWords)
elif strs[ii] == ' Published: ':
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyPublished=%s' % self.storyPublished)
elif strs[ii] == 'Updated:':
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
ii = ii+2
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
else:
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
ii = ii+2
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
logging.debug('self.uuid=%s' % self.uuid)
return result
def getStoryName(self):
    """Return the story title held in self.storyName."""
    title = self.storyName
    return title
def getOutputName(self):
    """Return the output file base name held in self.outputName."""
    base_name = self.outputName
    return base_name
def getAuthorName(self):
    """Return the author name held in self.authorName."""
    author = self.authorName
    return author
def getText(self, url):
    """Fetch a chapter page and return its story element serialised as UTF-8.

    A url that does not contain 'http://' is treated as relative and is
    prefixed with 'http://' + self.host + '/'.  The page is read through
    self.opener and parsed with BeautifulStoneSoup (HTML entities converted).
    Returns '<html/>' when the page has no <div id="story">.
    """
    is_relative = url.find('http://') == -1
    if is_relative:
        url = 'http://' + self.host + '/' + url
    logging.debug('Getting data from: %s' % url)
    page = self.opener.open(url).read()
    soup = bs.BeautifulStoneSoup(page, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
    story_div = soup.find('div', {'id' : 'story'})
    if story_div is not None:
        return story_div.__str__('utf8')
    return '<html/>'
def _getLoginScript(self):
    """Return the site-relative path of the login script."""
    login_path = '/user.php?action=login'
    return login_path
def reqLoginData(self, data):
    """Return True when the page text in data contains a login-required marker.

    Two messages are looked for: the "registered users only" prompt and the
    "no such account" error.  Returns False when neither occurs.
    """
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for marker in markers:
        if data.find(marker) != -1:
            return True
    return False
def getStoryURL(self):
    """Return the story URL (self.url), logging it at debug level first."""
    story_url = self.url
    logging.debug('self.url=%s' % story_url)
    return story_url
def getAuthorURL(self):
    """Return the author page URL (self.authorURL), logging it at debug level first."""
    author_url = self.authorURL
    logging.debug('self.authorURL=%s' % author_url)
    return author_url
def getUUID(self):
    """Return the identifier stored in self.uuid, logging it at debug level first."""
    identifier = self.uuid
    logging.debug('self.uuid=%s' % identifier)
    return identifier
def getStoryDescription(self):
    """Return self.storyDescription, logging it at debug level first."""
    description = self.storyDescription
    logging.debug('self.storyDescription=%s' % description)
    return description
def getStoryPublished(self):
    """Return self.storyPublished, logging it at debug level first."""
    published = self.storyPublished
    logging.debug('self.storyPublished=%s' % published)
    return published
def getStoryCreated(self):
    """Reset self.storyCreated to the current local time and return it.

    Each call overwrites the stored value with datetime.datetime.now(),
    logs it at debug level, and returns that same datetime object.
    """
    now = datetime.datetime.now()
    self.storyCreated = now
    logging.debug('self.storyCreated=%s' % now)
    return now
def getStoryUpdated(self):
    """Return self.storyUpdated, logging it at debug level first."""
    updated = self.storyUpdated
    logging.debug('self.storyUpdated=%s' % updated)
    return updated
def getLanguage(self):
    """Return self.language, logging it at debug level first."""
    language = self.language
    logging.debug('self.language=%s' % language)
    return language
def getLanguageId(self):
    """Return self.languageId, logging it at debug level first."""
    language_id = self.languageId
    logging.debug('self.languageId=%s' % language_id)
    return language_id
def getSubjects(self):
    """Return self.subjects, logging it at debug level first.

    The debug line formats the subjects themselves; it previously formatted
    self.authorName under the 'self.subjects=' label, which mislabelled the
    log and raised AttributeError when authorName had not been set yet.
    """
    subjects = self.subjects
    # Wrap in a 1-tuple so any container type formats as a single %s value.
    logging.debug('self.subjects=%s' % (subjects,))
    return subjects
def getPublisher(self):
    """Return self.publisher, logging it at debug level first."""
    publisher = self.publisher
    logging.debug('self.publisher=%s' % publisher)
    return publisher
def getNumChapters(self):
    """Return self.numChapters, logging it at debug level first."""
    chapter_count = self.numChapters
    logging.debug('self.numChapters=%s' % chapter_count)
    return chapter_count
def getNumWords(self):
    """Return self.numWords, logging it at debug level first."""
    word_count = self.numWords
    logging.debug('self.numWords=%s' % word_count)
    return word_count
def getAuthorId(self):
    """Return self.authorId, logging it at debug level first."""
    author_id = self.authorId
    logging.debug('self.authorId=%s' % author_id)
    return author_id
def getStoryId(self):
    """Return self.storyId, logging it at debug level first."""
    story_id = self.storyId
    logging.debug('self.storyId=%s' % story_id)
    return story_id
def getCategory(self):
    """Return self.category, logging it at debug level first."""
    category = self.category
    logging.debug('self.category=%s' % category)
    return category
def getGenre(self):
    """Return self.genre, logging it at debug level first."""
    genre = self.genre
    logging.debug('self.genre=%s' % genre)
    return genre
def getStoryStatus(self):
    """Return self.storyStatus, logging it at debug level first."""
    status = self.storyStatus
    logging.debug('self.storyStatus=%s' % status)
    return status
def getStoryRating(self):
    """Return self.storyRating, logging it at debug level first."""
    rating = self.storyRating
    logging.debug('self.storyRating=%s' % rating)
    return rating
def getStoryUserRating(self):
    """Return self.storyUserRating, logging it at debug level first."""
    user_rating = self.storyUserRating
    logging.debug('self.storyUserRating=%s' % user_rating)
    return user_rating
def getStoryCharacters(self):
    """Return self.storyCharacters, logging it at debug level first."""
    characters = self.storyCharacters
    logging.debug('self.storyCharacters=%s' % characters)
    return characters
def getStorySeries(self):
    """Return self.storySeries, logging it at debug level first."""
    series = self.storySeries
    logging.debug('self.storySeries=%s' % series)
    return series
class Twilighted_UnitTests(unittest.TestCase):
def setUp(self):