mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
Add a title page to the resulting EPUB file. This required scraping more information from the web pages in order to populate the new fields. This commit also changes the way output.py uses the adapters: it is now passed the adapter itself and calls functions on the adapter to retrieve the scraped information. This will make it easier down the road to add more information, or even pictures.
This commit is contained in:
parent
778deaea00
commit
379efc34f1
9 changed files with 1577 additions and 169 deletions
73
adapter.py
73
adapter.py
|
|
@ -29,11 +29,80 @@ class FanfictionSiteAdapter:
|
|||
def setPassword(self, password):
|
||||
pass
|
||||
|
||||
def getStoryName(self):
|
||||
def getStoryURL(self):
|
||||
pass
|
||||
|
||||
def getUUID(self):
|
||||
pass
|
||||
|
||||
def getOutputName(self):
|
||||
pass
|
||||
|
||||
def getAuthorURL(self):
|
||||
pass
|
||||
|
||||
def getAuthorId(self):
|
||||
pass
|
||||
|
||||
def getAuthorName(self):
|
||||
pass
|
||||
|
||||
def getStoryId(self):
|
||||
pass
|
||||
|
||||
def getStoryName(self):
|
||||
pass
|
||||
|
||||
def getStoryDescription(self):
|
||||
pass
|
||||
|
||||
def getStoryCreated(self):
|
||||
pass
|
||||
|
||||
def getStoryPublished(self):
|
||||
pass
|
||||
|
||||
def getStoryUpdated(self):
|
||||
pass
|
||||
|
||||
def getStorySeries(self):
|
||||
pass
|
||||
|
||||
def getLanguage(self):
|
||||
pass
|
||||
|
||||
def getLanguageId(self):
|
||||
pass
|
||||
|
||||
def getSubjects(self):
|
||||
pass
|
||||
|
||||
def getCharacters(self):
|
||||
pass
|
||||
|
||||
def getPublisher(self):
|
||||
pass
|
||||
|
||||
def getNumChapters(self):
|
||||
pass
|
||||
|
||||
def getNumWords(self):
|
||||
pass
|
||||
|
||||
def getCategory(self):
|
||||
pass
|
||||
|
||||
def getGenre(self):
|
||||
pass
|
||||
|
||||
def getStoryStatus(self):
|
||||
pass
|
||||
|
||||
def getStoryRating(self):
|
||||
pass
|
||||
|
||||
def getStoryUserRating(self):
|
||||
pass
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
pass
|
||||
pass
|
||||
|
|
|
|||
82
constants.py
82
constants.py
|
|
@ -15,6 +15,9 @@ h6 { text-align: center; }
|
|||
padding:0px;
|
||||
}
|
||||
.center {text-align: center;}
|
||||
.cover {text-align: center;}
|
||||
.full {width: 100%; }
|
||||
.quarter {width: 25%; }
|
||||
.smcap {font-variant: small-caps;}
|
||||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
|
|
@ -22,6 +25,37 @@ h6 { text-align: center; }
|
|||
|
||||
MIMETYPE = '''application/epub+zip'''
|
||||
|
||||
TITLE_PAGE = '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="http://www.w3.org/1999/xlink"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
||||
<title>%s - %s</title><link href="stylesheet.css" type="text/css" charset="UTF-8" rel="stylesheet"/></head>
|
||||
<body><div class="cover">
|
||||
<h1 id="cfs_0"><a id="StoryLink" href="%s">%s</a></h1>
|
||||
<h2 id="cfs_1">by <a id="AuthorLink" href="%s">%s</a></h2>
|
||||
</div><div style="text-align:center">
|
||||
<table class="full">
|
||||
<colgroup span="2"></colgroup>
|
||||
<tr><td> </td>
|
||||
<td> </td>
|
||||
</tr><tr><td> </td>
|
||||
<td> </td>
|
||||
</tr><tr><td><b>Category:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Genre:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Status:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Published:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Updated:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Packaged:</b></td><td>%s</td>
|
||||
</tr><tr><td><b>Rating Age/User:</b></td><td>%s / %s</td>
|
||||
</tr><tr><td><b>Chapters/Words:</b></td><td>%s / %s</td>
|
||||
</tr><tr><td><b>URL:</b></td><td><h3 id="url0"><a id="StoryURL" href="%s">%s</a></h3></td>
|
||||
</tr><tr><td><b>Summary:</b></td>
|
||||
</tr><tr><td colspan="2">%s</td>
|
||||
</tr><tr><td> </td>
|
||||
<td> </td>
|
||||
</tr><tr><td> </td>
|
||||
<td> </td>
|
||||
</tr></table></div>
|
||||
<div class="full" id="pb_0"/></body></html>
|
||||
'''
|
||||
|
||||
CONTAINER = '''<?xml version="1.0"?>
|
||||
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
|
||||
<rootfiles>
|
||||
|
|
@ -30,42 +64,60 @@ CONTAINER = '''<?xml version="1.0"?>
|
|||
</container>
|
||||
'''
|
||||
|
||||
CONTENT_START = '''<?xml version="1.0"?>
|
||||
CONTENT_START = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
|
||||
unique-identifier="BookID">
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:opf="http://www.idpf.org/2007/opf">
|
||||
unique-identifier="fanficdownloader-uuid">
|
||||
<metadata xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dcterms="http://purl.org/dc/terms/"
|
||||
xmlns:opf="http://www.idpf.org/2007/opf"
|
||||
xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
|
||||
<dc:identifier id="fanficdownloader-uuid">BookID-Epub-%s</dc:identifier>
|
||||
<dc:title>%s</dc:title>
|
||||
<dc:creator opf:role="aut">%s</dc:creator>
|
||||
<dc:language>en-UK</dc:language>
|
||||
<dc:contributor opf:role="bkp">fanficdownloader [http://fanficdownloader.googlecode.com]</dc:contributor>
|
||||
<dc:language>%s</dc:language>
|
||||
<dc:rights></dc:rights>
|
||||
<dc:subject>fanfiction</dc:subject>
|
||||
<dc:publisher>sgzmd</dc:publisher>
|
||||
<dc:identifier id="BookID">%s</dc:identifier>
|
||||
<dc:date opf:event="publication">%s</dc:date>
|
||||
<dc:date opf:event="creation">%s</dc:date>
|
||||
<dc:date opf:event="modification">%s</dc:date>
|
||||
<meta name="calibre:timestamp" content="%s"/>
|
||||
<dc:description>%s</dc:description>
|
||||
'''
|
||||
|
||||
CONTENT_END_METADATA = ''' <dc:publisher>%s</dc:publisher>
|
||||
<dc:identifier id="BookId">%s</dc:identifier>
|
||||
<dc:identifier opf:scheme="URL">%s</dc:identifier>
|
||||
<dc:source>%s</dc:source>
|
||||
<dc:type>FanFiction</dc:type>
|
||||
<meta name="calibre:rating" content="%s"/>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
|
||||
<item id="style" href="stylesheet.css" media-type="text/css" />
|
||||
'''
|
||||
|
||||
CONTENT_ITEM = '''<item id="%s" href="%s" media-type="application/xhtml+xml" />
|
||||
CONTENT_SUBJECT = ''' <dc:subject>%s</dc:subject>
|
||||
'''
|
||||
|
||||
CONTENT_END_MANIFEST = '''</manifest>
|
||||
<spine toc="ncx">
|
||||
CONTENT_ITEM = ''' <item id="%s" href="%s" media-type="application/xhtml+xml" />
|
||||
'''
|
||||
|
||||
CONTENT_ITEMREF = '''<itemref idref="%s" />
|
||||
CONTENT_END_MANIFEST = ''' </manifest>
|
||||
<spine toc="ncx">
|
||||
'''
|
||||
|
||||
CONTENT_END = '''</spine>
|
||||
CONTENT_ITEMREF = ''' <itemref idref="%s" />
|
||||
'''
|
||||
|
||||
CONTENT_END = ''' </spine>
|
||||
</package>
|
||||
'''
|
||||
|
||||
TOC_START = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
|
||||
<head>
|
||||
<meta name="dtb:uid" content="sigizmund.com062820072147132"/>
|
||||
<meta name="dtb:uid" content="%s"/>
|
||||
<meta name="dtb:depth" content="1"/>
|
||||
<meta name="dtb:totalPageCount" content="0"/>
|
||||
<meta name="dtb:maxPageNumber" content="0"/>
|
||||
|
|
@ -502,3 +554,5 @@ FB2_DESCRIPTION = '''<description>
|
|||
<version>2.0</version>
|
||||
</document-info>
|
||||
</description>'''
|
||||
|
||||
HTML_ESC_Definitions = 'HTML_Escape.def'
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ class FanficLoader:
|
|||
self.inmemory = inmemory
|
||||
self.compress = compress
|
||||
self.badLogin = False
|
||||
self.overWrite = True
|
||||
|
||||
def getAdapter(self):
    """Return the adapter stored on this loader as ``self.adapter``.

    The previous signature took no parameters, so calling the method on an
    instance raised ``TypeError`` and the body had no ``self`` to read from.
    """
    return self.adapter
|
||||
|
|
@ -48,7 +49,13 @@ class FanficLoader:
|
|||
raise adapter.LoginRequiredException(self.adapter.url)
|
||||
|
||||
urls = self.adapter.extractIndividualUrls()
|
||||
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
|
||||
|
||||
s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format
|
||||
if not self.overWrite and os.path.isfile(s):
|
||||
print >> sys.stderr, "File " + s + " already exists! Skipping!"
|
||||
exit(10)
|
||||
|
||||
self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress)
|
||||
|
||||
i = 1
|
||||
for u,n in urls:
|
||||
293
ffnet.py
293
ffnet.py
|
|
@ -15,6 +15,8 @@ import urllib2 as u2
|
|||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
|
@ -40,10 +42,37 @@ class FFNet(FanfictionSiteAdapter):
|
|||
|
||||
self.storyName = 'FF.Net story'
|
||||
self.authorName = 'FF.Net author'
|
||||
self.outputName = 'FF.Net_story'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.authorId = '0'
|
||||
self.authorURL = self.path
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('FanFiction')
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'FF.Net Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
spl = self.path.split('/')
|
||||
logging.debug('spl=%s' % spl)
|
||||
if len(spl) == 5:
|
||||
self.path = "/".join(spl[1:-1])
|
||||
self.outputName = spl[4] + '-ffnet_' + spl[2]
|
||||
|
||||
if self.path.startswith('/'):
|
||||
self.path = self.path[1:]
|
||||
|
|
@ -51,10 +80,14 @@ class FFNet(FanfictionSiteAdapter):
|
|||
if self.path.endswith('/'):
|
||||
self.path = self.path[:-1]
|
||||
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
(s, self.storyId, chapter) = self.path.split('/')
|
||||
|
||||
logging.debug('self.storyId=%s, chapter=%s' % (self.storyId, chapter))
|
||||
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
logging.debug('self.storyId=%s, chapter=%s, self.outputName=%s' % (self.storyId, chapter, self.outputName))
|
||||
if not appEngine:
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
else:
|
||||
|
|
@ -70,7 +103,70 @@ class FFNet(FanfictionSiteAdapter):
|
|||
|
||||
def performLogin(self, url = None):
|
||||
return True
|
||||
|
||||
def _getVarValue(self, varstr):
    """Return the text to the right of the first ``=`` in *varstr*.

    One leading space and one trailing ``;`` are removed from the result,
    so ``"var chapters = 12;"`` yields ``"12"``.  Surrounding quotes are
    left in place for the caller to strip.  A string without ``=`` yields
    ``""``.
    """
    # Split on the first '=' only.  The earlier implementation split on
    # every '=' and re-joined the pieces with '', which silently deleted
    # any '=' characters that were part of the value itself.
    retstr = varstr.partition('=')[2]
    if retstr.startswith(' '):
        retstr = retstr[1:]
    if retstr.endswith(';'):
        retstr = retstr[:-1]
    return retstr
|
||||
|
||||
def _splitCrossover(self, subject):
    """Record the name(s) found in a category string as subjects.

    If *subject* does not contain ``Crossover`` it is added unchanged.
    Otherwise the subject ``Crossover`` is added first; then, when the text
    also contains ``" and "``, it is cut into individual names at the words
    ``and`` and ``Crossover`` and each name is added.  A crossover string
    without ``" and "`` is added whole.

    Subjects are registered through ``self._addSubject``.  Always returns
    ``True``.
    """
    if "Crossover" not in subject:
        self._addSubject(subject)
        return True

    self._addSubject("Crossover")
    logging.debug('Crossover=%s' % subject)
    if ' and ' not in subject:
        self._addSubject(subject)
        return True

    words = subject.split(' ')
    logging.debug('words=%s' % words)
    nameParts = []
    for word in words:
        # Compare against the separator words themselves.  The earlier
        # substring test (``word in "and Crossover"``) also matched
        # fragments such as "Cross", "over", "a" and "", so those words
        # were dropped from the names and split them in the wrong place.
        if word in ("and", "Crossover"):
            if nameParts:
                self._addSubject(' '.join(nameParts))
                nameParts = []
        elif word:
            # Empty strings come from repeated spaces; skip them.
            nameParts.append(word)
    if nameParts:
        self._addSubject(' '.join(nameParts))
    return True
|
||||
|
||||
def _splitGenre(self, subject):
|
||||
if len(subject) > 0:
|
||||
words = subject.split('/')
|
||||
logging.debug('words=%s' % words)
|
||||
for subj in words:
|
||||
if len(subj) > 0:
|
||||
self._addSubject(subj)
|
||||
return True
|
||||
|
||||
def _addSubject(self, subject):
|
||||
subj = subject.upper()
|
||||
for s in self.subjects:
|
||||
if s.upper() == subj:
|
||||
return False
|
||||
|
||||
self.subjects.append(subject)
|
||||
return True
|
||||
|
||||
def _addCharacter(self, character):
|
||||
chara = character.upper()
|
||||
for c in self.storyCharacters:
|
||||
if c.upper() == chara:
|
||||
return False
|
||||
self.storyCharacters.append(character)
|
||||
return True
|
||||
|
||||
def _fetchUrl(self, url):
|
||||
if not appEngine:
|
||||
return self.opener.open(url).read().decode('utf-8')
|
||||
|
|
@ -85,6 +181,8 @@ class FFNet(FanfictionSiteAdapter):
|
|||
for a in allA:
|
||||
if 'href' in a._getAttrMap() and a['href'].find('/u/') != -1:
|
||||
self.authorName = a.string
|
||||
(u1, u2, self.authorId, u3) = a['href'].split('/')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
|
||||
urls = []
|
||||
lines = data.split('\n')
|
||||
|
|
@ -92,9 +190,38 @@ class FFNet(FanfictionSiteAdapter):
|
|||
if l.find("»") != -1 and l.find('<b>') != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyName = str(s2.find('b').string)
|
||||
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
|
||||
elif l.find("<a href='/u/") != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.authorName = str(s2.a.string)
|
||||
(u1, u2, self.authorId, u3) = s2.a['href'].split('/')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
elif l.find("Rated: <a href=") != -1:
|
||||
s2 = bs.BeautifulStoneSoup(l)
|
||||
self.storyRating = str(s2.a.string).strip()
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
logging.debug('s2.a=%s' % s2.a)
|
||||
s3 = l.split('-')
|
||||
logging.debug('s3=%s' % s3)
|
||||
if len(s3) > 0:
|
||||
if s3[1].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.language = s3[1].strip()
|
||||
logging.debug('self.language=%s' % self.language)
|
||||
if len(s3) > 1:
|
||||
if s3[2].find("Reviews: <a href=") != -1:
|
||||
continue
|
||||
self.genre = s3[2].strip()
|
||||
if "&" in self.genre:
|
||||
self.genre = ''
|
||||
continue
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
self._splitGenre(self.genre)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
if "Complete" in l:
|
||||
self.storyStatus = 'Completed'
|
||||
else:
|
||||
self.storyStatus = 'In-Progress'
|
||||
elif l.find("<SELECT title='chapter navigation'") != -1:
|
||||
if len(urls) > 0:
|
||||
continue
|
||||
|
|
@ -102,6 +229,8 @@ class FFNet(FanfictionSiteAdapter):
|
|||
u = l.decode('utf-8')
|
||||
except UnicodeEncodeError, e:
|
||||
u = l
|
||||
except:
|
||||
u = l.encode('ascii', 'xmlcharrefreplace')
|
||||
u = re.sub('&\#[0-9]+;', ' ', u)
|
||||
s2 = bs.BeautifulSoup(u)
|
||||
options = s2.findAll('option')
|
||||
|
|
@ -110,19 +239,69 @@ class FFNet(FanfictionSiteAdapter):
|
|||
title = o.string
|
||||
logging.debug('URL = `%s`, Title = `%s`' % (url, title))
|
||||
urls.append((url,title))
|
||||
if len(urls) == 0:
|
||||
elif l.find("var chapters") != -1:
|
||||
self.numChapters = self._getVarValue (l)
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif l.find("var words") != -1:
|
||||
self.numWords = self._getVarValue (l)
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif l.find("var categoryid") != -1:
|
||||
categoryid = self._getVarValue (l)
|
||||
logging.debug('categoryid=%s' % categoryid)
|
||||
elif l.find("var cat_title") != -1:
|
||||
self.category = self._getVarValue (l).strip("'")
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self._splitCrossover(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif l.find("var summary") != -1:
|
||||
self.storyDescription = self._getVarValue (l).strip("'")
|
||||
if '&' in self.storyDescription:
|
||||
s = self.storyDescription.split('&')
|
||||
logging.debug('s=%s' % s)
|
||||
self.storyDescription = ''
|
||||
for ss in s:
|
||||
if len(self.storyDescription) > 0:
|
||||
if len(ss) > 4 and 'amp;' in ss[1:4]:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = self.storyDescription + '&' + ss
|
||||
else:
|
||||
self.storyDescription = ss
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
elif l.find("var datep") != -1:
|
||||
dateps = self._getVarValue (l)
|
||||
self.storyPublished = datetime.datetime(*time.strptime ( dateps, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
elif l.find("var dateu") != -1:
|
||||
dateus = self._getVarValue (l)
|
||||
self.storyUpdated = datetime.datetime(*time.strptime ( dateus, "'%m-%d-%y'" )[0:5])
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated.strftime("%Y-%m-%dT%I:%M:%S"))
|
||||
|
||||
if len(urls) <= 0:
|
||||
# no chapters found, try url by itself.
|
||||
urls.append((self.url,self.storyName))
|
||||
|
||||
self.uuid = 'urn:uuid:' + self.host + '-a.' + self.authorId + '-s.' + self.storyId
|
||||
self.authorURL = 'http://' + self.host + '/u/' + self.authorId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
#logging.debug('urls=%s' % urls)
|
||||
return urls
|
||||
|
||||
def getText(self, url):
|
||||
time.sleep( 2.0 )
|
||||
data = self._fetchUrl(url)
|
||||
lines = data.split('\n')
|
||||
|
||||
textbuf = ''
|
||||
emit = False
|
||||
|
||||
olddata = data
|
||||
try:
|
||||
data = data.decode('utf8')
|
||||
except:
|
||||
data = olddata
|
||||
|
||||
|
||||
try:
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
except:
|
||||
|
|
@ -131,23 +310,121 @@ class FFNet(FanfictionSiteAdapter):
|
|||
div = soup.find('div', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
exit (20)
|
||||
return '<html/>'
|
||||
|
||||
return div.__str__('utf8')
|
||||
|
||||
|
||||
def setLogin(self, login):
|
||||
self.login = login
|
||||
|
||||
def setPassword(self, password):
|
||||
self.password = password
|
||||
|
||||
def getStoryName(self):
|
||||
return self.storyName
|
||||
def getStoryURL(self):
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
return self.url
|
||||
|
||||
def getUUID(self):
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
return self.uuid
|
||||
|
||||
def getOutputName(self):
|
||||
logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName))
|
||||
return self.outputName
|
||||
|
||||
def getAuthorName(self):
|
||||
logging.debug('self.authorName=%s' % self.authorName)
|
||||
return self.authorName
|
||||
|
||||
def getAuthorId(self):
|
||||
logging.debug('self.authorId=%s' % self.authorId)
|
||||
return self.authorId
|
||||
|
||||
def getAuthorURL(self):
|
||||
logging.debug('self.authorURL=%s' % self.authorURL)
|
||||
return self.authorURL
|
||||
|
||||
def getStoryId(self):
|
||||
logging.debug('self.storyId=%s' % self.storyId)
|
||||
return self.storyId
|
||||
|
||||
def getStoryName(self):
|
||||
logging.debug('self.storyName=%s' % self.storyName)
|
||||
return self.storyName
|
||||
|
||||
def getStoryDescription(self):
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
return self.storyDescription
|
||||
|
||||
def getStoryPublished(self):
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
return self.storyPublished
|
||||
|
||||
def getStoryCreated(self):
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
logging.debug('self.storyCreated=%s' % self.storyCreated)
|
||||
return self.storyCreated
|
||||
|
||||
def getStoryUpdated(self):
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
return self.storyUpdated
|
||||
|
||||
def getLanguage(self):
|
||||
logging.debug('self.language=%s' % self.language)
|
||||
return self.language
|
||||
|
||||
def getLanguageId(self):
|
||||
logging.debug('self.languageId=%s' % self.languageId)
|
||||
return self.languageId
|
||||
|
||||
def getSubjects(self):
    """Return the list held in ``self.subjects``."""
    # Log the subjects themselves.  This line used to format
    # self.authorName under the 'self.subjects' label, which was misleading.
    logging.debug('self.subjects=%s' % self.subjects)
    return self.subjects
|
||||
|
||||
def getPublisher(self):
|
||||
logging.debug('self.publisher=%s' % self.publisher)
|
||||
return self.publisher
|
||||
|
||||
def getNumChapters(self):
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
return self.numChapters
|
||||
|
||||
def getNumWords(self):
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
return self.numWords
|
||||
|
||||
def getCategory(self):
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
return self.category
|
||||
|
||||
def getGenre(self):
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
return self.genre
|
||||
|
||||
def getStoryStatus(self):
|
||||
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
||||
return self.storyStatus
|
||||
|
||||
def getStoryRating(self):
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
return self.storyRating
|
||||
|
||||
def getStoryUserRating(self):
|
||||
logging.debug('self.storyUserRating=%s' % self.storyUserRating)
|
||||
return self.storyUserRating
|
||||
|
||||
def getPrintableUrl(self, url):
|
||||
pass
|
||||
|
||||
def getStoryCharacters(self):
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
return self.storyCharacters
|
||||
|
||||
def getStorySeries(self):
|
||||
logging.debug('self.storySeries=%s' % self.storySeries)
|
||||
return self.storySeries
|
||||
|
||||
class FFA_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
|
|
|||
269
fictionalley.py
269
fictionalley.py
|
|
@ -12,13 +12,20 @@ import urlparse as up
|
|||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time as time
|
||||
import datetime
|
||||
from adapter import *
|
||||
|
||||
|
||||
class FictionAlley(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.host = up.urlparse(url).netloc
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
logging.debug('self.host=%s' % self.host)
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
cookieproc = u2.HTTPCookieProcessor()
|
||||
|
||||
# FictionAlley wants a cookie to prove you're old enough to read R+ rated stuff.
|
||||
|
|
@ -35,6 +42,36 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
rfc2109=False)
|
||||
cookieproc.cookiejar.set_cookie(cookie)
|
||||
self.opener = u2.build_opener(cookieproc)
|
||||
|
||||
ss = self.path.split('/')
|
||||
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = ''
|
||||
self.authorURL = ''
|
||||
self.storyId = ''
|
||||
if len(ss) > 2 and ss[1] == 'authors':
|
||||
self.authorId = ss[2]
|
||||
self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
|
||||
if len(ss) > 3:
|
||||
self.storyId = ss[3].replace ('.html','')
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
|
||||
|
||||
def requiresLogin(self, url = None):
|
||||
return False
|
||||
|
|
@ -48,31 +85,147 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
def setPassword(self, password):
|
||||
self.password = password
|
||||
|
||||
def _addSubject(self, subject):
|
||||
subj = subject.upper()
|
||||
for s in self.subjects:
|
||||
if s.upper() == subj:
|
||||
return False
|
||||
self.subjects.append(subject)
|
||||
return True
|
||||
|
||||
def _addCharacter(self, character):
|
||||
chara = character.upper()
|
||||
for c in self.storyCharacters:
|
||||
if c.upper() == chara:
|
||||
return False
|
||||
self.storyCharacters.append(character)
|
||||
return True
|
||||
|
||||
def _processChapterHeaders(self, div):
|
||||
brs = div.findAll ('br')
|
||||
for br in brs:
|
||||
keystr=''
|
||||
valstr=''
|
||||
if len(br.contents) > 2:
|
||||
keystr = br.contents[1]
|
||||
if keystr is not None:
|
||||
strs = re.split ("<[^>]+>", str(keystr))
|
||||
keystr=''
|
||||
for s in strs:
|
||||
keystr = keystr + s
|
||||
valstr = br.contents[2].strip(' ')
|
||||
if keystr is not None:
|
||||
if keystr == 'Rating:':
|
||||
self.storyRating = valstr
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif keystr == 'Genre:':
|
||||
self.genre = valstr
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = valstr.split(', ')
|
||||
for ss2 in s2:
|
||||
self._addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif keystr == 'Main Character(s):':
|
||||
s2 = valstr.split(', ')
|
||||
for ss2 in s2:
|
||||
self._addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif keystr == 'Summary:':
|
||||
self.storyDescription = valstr
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = self.opener.open(self.url).read()
|
||||
|
||||
# There is some useful information in the headers of the first chapter page.
|
||||
data = data.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
# Get title from <title>, remove before '-'.
|
||||
title = soup.find('title').string
|
||||
self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)","")
|
||||
self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId
|
||||
|
||||
links = soup.findAll('a', { 'class' : 'chapterlink' } )
|
||||
links = soup.findAll('li')
|
||||
|
||||
# If it is decided that we really do care about number of words.. It's only available on the author's page..
|
||||
#d0 = self.opener.open(self.authorURL).read()
|
||||
#soupA = bs.BeautifulStoneSoup(d0)
|
||||
#dls = soupA.findAll('dl')
|
||||
#logging.debug('dls=%s' % dls)
|
||||
|
||||
self.numChapters = 0;
|
||||
result = []
|
||||
if len(links) == 0:
|
||||
# Be aware that this means that the user has entered the {STORY}01.html
|
||||
# We will not have valid Published and Updated dates. User should enter
|
||||
# the {STORY}.html instead. We should force that instead of this.
|
||||
breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
|
||||
self.authorName = breadcrumbs.a.string.replace("'s Fics","")
|
||||
result.append((self.url,self.storyName))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,self.url,self.storyName))
|
||||
self.numChapters = self.numChapters + 1;
|
||||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
else:
|
||||
author = soup.find('h1', {'class' : 'title'})
|
||||
self.authorName = author.a.string
|
||||
|
||||
for a in links:
|
||||
url = a['href']
|
||||
title = a.string
|
||||
result.append((url,title))
|
||||
summary = soup.find('div', {'class' : 'summary'})
|
||||
ss = summary.contents
|
||||
if len(ss) > 1:
|
||||
ss1 = ss[0].split(': ')
|
||||
if len(ss1) > 1 and ss1[0] == 'Rating':
|
||||
self.storyRating = ss1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
self.storyDescription = str(ss[1]).replace("<br>","").replace("</br>","").replace('\n','')
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
for li in links:
|
||||
a = li.find('a', {'class' : 'chapterlink'})
|
||||
s = li.contents
|
||||
if a is not None:
|
||||
url = a['href']
|
||||
title = a.string
|
||||
result.append((url,title))
|
||||
#logging.debug('chapter[%s]=%s, %s' % (self.numChapters+1,url,title))
|
||||
if self.numChapters == 0:
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
d1 = self.opener.open(url).read()
|
||||
|
||||
# find <!-- headerstart --> & <!-- headerend --> and
|
||||
# replaced with matching div pair for easier parsing.
|
||||
# Yes, it's an evil kludge, but what can ya do? Using
|
||||
# something other than div prevents soup from pairing
|
||||
# our div with poor html inside the story text.
|
||||
d1 = d1.replace('<!-- headerstart -->','<crazytagstringnobodywouldstumbleonaccidently id="storyheaders">').replace('<!-- headerend -->','</crazytagstringnobodywouldstumbleonaccidently>')
|
||||
sop = bs.BeautifulStoneSoup(d1)
|
||||
|
||||
div = sop.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storyheaders'})
|
||||
if div is not None:
|
||||
self._processChapterHeaders(div)
|
||||
|
||||
self.numChapters = self.numChapters + 1
|
||||
if len(s) > 1:
|
||||
datestr=''
|
||||
ss2 = s[1].replace('\n','').replace('(','').split(' ')
|
||||
if len(ss2) > 2 and ss2[0] == 'Posted:':
|
||||
datestr = ss2[1] + ' ' + ss2[2]
|
||||
tmpdate = datetime.datetime.fromtimestamp(time.mktime(time.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")))
|
||||
if self.numChapters == 1:
|
||||
self.storyPublished = tmpdate
|
||||
self.storyUpdated = tmpdate
|
||||
logging.debug('self.storyPublished=%s, self.storyUpdated=%s' % (self.storyPublished, self.storyUpdated))
|
||||
else:
|
||||
logging.debug('li chapterlink not found! li=%s' % li)
|
||||
|
||||
#print('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
print('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
return result
|
||||
|
||||
|
|
@ -82,6 +235,9 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
def getAuthorName(self):
|
||||
return self.authorName
|
||||
|
||||
def getOutputName(self):
|
||||
return self.outputName
|
||||
|
||||
def getText(self, url):
|
||||
# fictionalley uses full URLs in chapter list.
|
||||
data = self.opener.open(url).read()
|
||||
|
|
@ -97,10 +253,96 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
div = soup.find('crazytagstringnobodywouldstumbleonaccidently', {'id' : 'storytext'})
|
||||
if None == div:
|
||||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
exit(20)
|
||||
return '<html/>'
|
||||
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
|
||||
|
||||
html = soup.findAll('html')
|
||||
if len(html) > 1:
|
||||
return html[1].__str__('utf8')
|
||||
else:
|
||||
return div.__str__('utf8').replace('crazytagstringnobodywouldstumbleonaccidently','div')
|
||||
|
||||
def getStoryURL(self):
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
return self.url
|
||||
|
||||
def getAuthorURL(self):
|
||||
logging.debug('self.authorURL=%s' % self.authorURL)
|
||||
return self.authorURL
|
||||
|
||||
def getUUID(self):
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
return self.uuid
|
||||
|
||||
def getAuthorId(self):
|
||||
logging.debug('self.authorId=%s' % self.authorId)
|
||||
return self.authorId
|
||||
|
||||
def getStoryId(self):
    """Return the story id stored in ``self.storyId``, logging it at debug level."""
    logging.debug('self.storyId=%s', self.storyId)
    return self.storyId
|
||||
|
||||
def getStoryDescription(self):
    """Return the summary stored in ``self.storyDescription``, logging it at debug level."""
    logging.debug('self.storyDescription=%s', self.storyDescription)
    return self.storyDescription
|
||||
|
||||
def getStoryPublished(self):
    """Return the publication date stored in ``self.storyPublished``, logging it at debug level."""
    logging.debug('self.storyPublished=%s', self.storyPublished)
    return self.storyPublished
|
||||
|
||||
def getStoryCreated(self):
    """Stamp ``self.storyCreated`` with the current local time and return it.

    Every call overwrites the stored value with ``datetime.datetime.now()``,
    so two calls may return different timestamps.
    """
    self.storyCreated = datetime.datetime.now()
    logging.debug('self.storyCreated=%s', self.storyCreated)
    return self.storyCreated
|
||||
|
||||
def getStoryUpdated(self):
    """Return the last-update date stored in ``self.storyUpdated``, logging it at debug level."""
    logging.debug('self.storyUpdated=%s', self.storyUpdated)
    return self.storyUpdated
|
||||
|
||||
def getLanguage(self):
    """Return the language name stored in ``self.language``, logging it at debug level."""
    logging.debug('self.language=%s', self.language)
    return self.language
|
||||
|
||||
def getLanguageId(self):
    """Return the language code stored in ``self.languageId``, logging it at debug level."""
    logging.debug('self.languageId=%s', self.languageId)
    return self.languageId
|
||||
|
||||
def getSubjects(self):
    """Return the subject list stored in ``self.subjects``, logging it at debug level."""
    # Log the subjects themselves; the message label says 'self.subjects',
    # but the author name used to be printed here instead.
    logging.debug('self.subjects=%s', self.subjects)
    return self.subjects
|
||||
|
||||
def getPublisher(self):
    """Return the publisher stored in ``self.publisher``, logging it at debug level."""
    logging.debug('self.publisher=%s', self.publisher)
    return self.publisher
|
||||
|
||||
def getNumChapters(self):
    """Return the chapter count stored in ``self.numChapters``, logging it at debug level."""
    logging.debug('self.numChapters=%s', self.numChapters)
    return self.numChapters
|
||||
|
||||
def getNumWords(self):
    """Return the word count stored in ``self.numWords``, logging it at debug level."""
    logging.debug('self.numWords=%s', self.numWords)
    return self.numWords
|
||||
|
||||
def getCategory(self):
    """Return the category stored in ``self.category``, logging it at debug level."""
    logging.debug('self.category=%s', self.category)
    return self.category
|
||||
|
||||
def getGenre(self):
    """Return the genre stored in ``self.genre``, logging it at debug level."""
    logging.debug('self.genre=%s', self.genre)
    return self.genre
|
||||
|
||||
def getStoryStatus(self):
    """Return the status stored in ``self.storyStatus``, logging it at debug level."""
    logging.debug('self.storyStatus=%s', self.storyStatus)
    return self.storyStatus
|
||||
|
||||
def getStoryRating(self):
    """Return the rating stored in ``self.storyRating``, logging it at debug level."""
    logging.debug('self.storyRating=%s', self.storyRating)
    return self.storyRating
|
||||
|
||||
def getStoryUserRating(self):
    """Return the user rating stored in ``self.storyUserRating``, logging it at debug level."""
    logging.debug('self.storyUserRating=%s', self.storyUserRating)
    return self.storyUserRating
|
||||
|
||||
def getPrintableUrl(self, url):
    """Return *url* unchanged."""
    return url
|
||||
|
||||
|
|
@ -114,6 +356,15 @@ class FictionAlley(FanfictionSiteAdapter):
|
|||
login = dict(login = 'name', password = 'pass')
|
||||
other = dict(submit = 'Log In', remember='yes')
|
||||
return (login, other)
|
||||
|
||||
def getStoryCharacters(self):
    """Return the character list stored in ``self.storyCharacters``, logging it at debug level."""
    logging.debug('self.storyCharacters=%s', self.storyCharacters)
    return self.storyCharacters
|
||||
|
||||
def getStorySeries(self):
    """Return the series name stored in ``self.storySeries``, logging it at debug level."""
    logging.debug('self.storySeries=%s', self.storySeries)
    return self.storySeries
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
288
ficwad.py
288
ficwad.py
|
|
@ -12,6 +12,8 @@ import urlparse as up
|
|||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import logging
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
|
||||
|
|
@ -32,7 +34,44 @@ class FicWad(FanfictionSiteAdapter):
|
|||
def setPassword(self, password):
    """Store *password* in ``self.password``."""
    self.password = password
|
||||
|
||||
def _addSubject(self, subject):
    """Append *subject* to ``self.subjects`` unless it is already listed.

    The comparison ignores case. Returns True when the subject was appended
    and False when a case-insensitively equal entry was already present.
    """
    wanted = subject.upper()
    if any(existing.upper() == wanted for existing in self.subjects):
        return False
    self.subjects.append(subject)
    return True
|
||||
|
||||
def _addCharacter(self, character):
    """Append *character* to ``self.storyCharacters`` unless it is already listed.

    The comparison ignores case. Returns True when the character was appended
    and False when a case-insensitively equal entry was already present.
    """
    wanted = character.upper()
    if any(existing.upper() == wanted for existing in self.storyCharacters):
        return False
    self.storyCharacters.append(character)
    return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
|
||||
data = u2.urlopen(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
|
|
@ -40,50 +79,254 @@ class FicWad(FanfictionSiteAdapter):
|
|||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
# last of crumbtrail
|
||||
self.storyName = allAhrefs[-1].string.strip()
|
||||
storyinfo = allAhrefs[-1]
|
||||
(u0, u1, storyid) = storyinfo['href'].split('/')
|
||||
if u1 == "story":
|
||||
# This page does not have the correct information on it.. Need to get the Story Title Page
|
||||
logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
|
||||
self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
|
||||
data = u2.urlopen(self.url).read()
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
story = soup.find('div', {'id' : 'story'})
|
||||
crumbtrail = story.find('h3') # the only h3 ficwad uses.
|
||||
allAhrefs = crumbtrail.findAll('a')
|
||||
|
||||
# save chapter name from header in case of one-shot.
|
||||
chaptername = story.find('h4').find('a').string.strip()
|
||||
storyinfo = story.find('h4').find('a')
|
||||
(u0, u1, self.storyId) = storyinfo['href'].split('/')
|
||||
self.storyName = storyinfo.string.strip()
|
||||
self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
|
||||
|
||||
logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))
|
||||
|
||||
author = soup.find('span', {'class' : 'author'})
|
||||
self.authorName = str(author.a.string)
|
||||
(u0, u1,self.authorId) = author.a['href'].split('/')
|
||||
self.authorURL = 'http://' + self.host + author.a['href']
|
||||
logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))
|
||||
|
||||
select = soup.find('select', { 'name' : 'goto' } )
|
||||
description = soup.find('blockquote', {'class' : 'summary'})
|
||||
if description is not None:
|
||||
self.storyDescription = str(description.p.string)
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
meta = soup.find('p', {'class' : 'meta'})
|
||||
if meta is not None:
|
||||
s = str(meta).replace('\n',' ').replace('\t','').split(' - ')
|
||||
logging.debug('meta.s=%s' % s)
|
||||
for ss in s:
|
||||
s1 = ss.replace(' ','').split(':')
|
||||
#logging.debug('meta.s.s1=%s' % s1)
|
||||
if len(s1) > 1:
|
||||
s2 = re.split ('<[^>]+>', s1[0])
|
||||
#logging.debug('meta.s.s1.s2=%s' % s2)
|
||||
if len(s2) > 1:
|
||||
s1[0] = s2[1]
|
||||
skey = s1[0].strip()
|
||||
#logging.debug('Checking = %s' % skey)
|
||||
if skey == 'Category':
|
||||
soup1 = bs.BeautifulStoneSoup(s1[1])
|
||||
allAs = soup1.findAll('a')
|
||||
for a in allAs:
|
||||
if self.category == 'Category':
|
||||
self.category = str(a.string)
|
||||
logging.debug('self.category=%s' % self.category)
|
||||
self._addSubject(self.category)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Rating':
|
||||
self.storyRating = s1[1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif skey == 'Genres':
|
||||
self.genre = s1[1]
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self._addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
elif skey == 'Characters':
|
||||
s2 = s1[1].split(', ')
|
||||
for ss2 in s2:
|
||||
self._addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif skey == 'Chapters':
|
||||
self.numChapters = s1[1]
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif skey == 'Warnings':
|
||||
logging.debug('Warnings=%s' % s1[1])
|
||||
elif skey == 'Published':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif skey == 'Updated':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
s3 = re.split ('<[^>]+>', s1[0])
|
||||
#logging.debug('meta.s.s1.s3=%s' % s3)
|
||||
if len(s3) > 1:
|
||||
s1[0] = s3[0]
|
||||
s4 = s1[0].split('w')
|
||||
#logging.debug('meta.s.s1.s4=%s' % s4)
|
||||
if len(s4) > 1 and s4[1] == 'ords':
|
||||
self.numWords = s4[0]
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
|
||||
|
||||
print('Story "%s" by %s' % (self.storyName, self.authorName))
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# Single chapter storys don't have title in crumbtrail, just 'chapter' title in h4.
|
||||
self.storyName = chaptername
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = 'http://' + self.host + o['value']
|
||||
title = o.string
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
result.append((url,title))
|
||||
ii = 1
|
||||
|
||||
storylist = soup.find('ul', {'id' : 'storylist'})
|
||||
if storylist is not None:
|
||||
allH4s = storylist.findAll('h4')
|
||||
#logging.debug('allH4s=%s' % allH4s)
|
||||
|
||||
if allH4s is not None:
|
||||
for h4 in allH4s:
|
||||
chapterinfo = h4.find('a')
|
||||
#logging.debug('Chapter1=%s' % chapterinfo)
|
||||
url = 'http://' + self.host + chapterinfo['href']
|
||||
title = chapterinfo.string.strip()
|
||||
#logging.debug('Chapter=%s, %s' % (url, title))
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
if ii == 1:
|
||||
select = soup.find('select', { 'name' : 'goto' } )
|
||||
|
||||
if select is None:
|
||||
result.append((self.url,self.storyName))
|
||||
logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = 'http://' + self.host + o['value']
|
||||
title = o.string
|
||||
# ficwad includes 'Story Index' in the dropdown of chapters,
|
||||
# but it's not a real chapter.
|
||||
if title != "Story Index":
|
||||
logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
|
||||
result.append((url,title))
|
||||
ii = ii+1
|
||||
else:
|
||||
logging.debug('Skipping Story Index. URL %s' % url)
|
||||
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
return result
|
||||
|
||||
def getStoryName(self):
    """Return the story title stored in ``self.storyName``."""
    return self.storyName
|
||||
|
||||
def getOutputName(self):
    """Return the output name stored in ``self.outputName``."""
    return self.outputName
|
||||
|
||||
def getAuthorName(self):
    """Return the author's name stored in ``self.authorName``."""
    return self.authorName
|
||||
|
||||
def getText(self, url):
    """Download one chapter page and return its story text.

    url -- chapter address; when it does not contain 'http://' it is
           prefixed with 'http://' + self.host + '/'.

    Returns the element with id 'storytext' rendered via
    ``__str__('utf8')``. When the page has no such element the error is
    logged and the process exits with status 20.
    """
    if 'http://' not in url:
        url = 'http://' + self.host + '/' + url

    data = u2.urlopen(url).read()

    soup = bs.BeautifulStoneSoup(data)
    div = soup.find('div', {'id' : 'storytext'})
    if div is None:
        logging.error("Error downloading Chapter: %s", url)
        # Only the current status code is kept: an earlier exit(1) placed
        # before this call made exit(20) unreachable.
        exit(20)
        # Not reached while exit() raises SystemExit; kept as a fallback value.
        return '<html/>'
    return div.__str__('utf8')
|
||||
|
||||
def getStoryURL(self):
    """Return the story URL stored in ``self.url``, logging it at debug level."""
    logging.debug('self.url=%s', self.url)
    return self.url
|
||||
|
||||
def getAuthorURL(self):
    """Return the author page URL stored in ``self.authorURL``, logging it at debug level."""
    logging.debug('self.authorURL=%s', self.authorURL)
    return self.authorURL
|
||||
|
||||
def getUUID(self):
    """Return the identifier string stored in ``self.uuid``, logging it at debug level."""
    logging.debug('self.uuid=%s', self.uuid)
    return self.uuid
|
||||
|
||||
def getAuthorId(self):
    """Return the author id stored in ``self.authorId``, logging it at debug level."""
    logging.debug('self.authorId=%s', self.authorId)
    return self.authorId
|
||||
|
||||
def getStoryId(self):
    """Return the story id stored in ``self.storyId``, logging it at debug level."""
    logging.debug('self.storyId=%s', self.storyId)
    return self.storyId
|
||||
|
||||
def getStoryDescription(self):
    """Return the summary stored in ``self.storyDescription``, logging it at debug level."""
    logging.debug('self.storyDescription=%s', self.storyDescription)
    return self.storyDescription
|
||||
|
||||
def getStoryPublished(self):
    """Return the publication date stored in ``self.storyPublished``, logging it at debug level."""
    logging.debug('self.storyPublished=%s', self.storyPublished)
    return self.storyPublished
|
||||
|
||||
def getStoryCreated(self):
    """Stamp ``self.storyCreated`` with the current local time and return it.

    Every call overwrites the stored value with ``datetime.datetime.now()``,
    so two calls may return different timestamps.
    """
    self.storyCreated = datetime.datetime.now()
    logging.debug('self.storyCreated=%s', self.storyCreated)
    return self.storyCreated
|
||||
|
||||
def getStoryUpdated(self):
    """Return the last-update date stored in ``self.storyUpdated``, logging it at debug level."""
    logging.debug('self.storyUpdated=%s', self.storyUpdated)
    return self.storyUpdated
|
||||
|
||||
def getLanguage(self):
    """Return the language name stored in ``self.language``, logging it at debug level."""
    logging.debug('self.language=%s', self.language)
    return self.language
|
||||
|
||||
def getLanguageId(self):
    """Return the language code stored in ``self.languageId``, logging it at debug level."""
    logging.debug('self.languageId=%s', self.languageId)
    return self.languageId
|
||||
|
||||
def getSubjects(self):
    """Return the subject list stored in ``self.subjects``, logging it at debug level."""
    # Log the subjects themselves; the message label says 'self.subjects',
    # but the author name used to be printed here instead.
    logging.debug('self.subjects=%s', self.subjects)
    return self.subjects
|
||||
|
||||
def getPublisher(self):
    """Return the publisher stored in ``self.publisher``, logging it at debug level."""
    logging.debug('self.publisher=%s', self.publisher)
    return self.publisher
|
||||
|
||||
def getNumChapters(self):
    """Return the chapter count stored in ``self.numChapters``, logging it at debug level."""
    logging.debug('self.numChapters=%s', self.numChapters)
    return self.numChapters
|
||||
|
||||
def getNumWords(self):
    """Return the word count stored in ``self.numWords``, logging it at debug level."""
    logging.debug('self.numWords=%s', self.numWords)
    return self.numWords
|
||||
|
||||
def getCategory(self):
    """Return the category stored in ``self.category``, logging it at debug level."""
    logging.debug('self.category=%s', self.category)
    return self.category
|
||||
|
||||
def getGenre(self):
    """Return the genre stored in ``self.genre``, logging it at debug level."""
    logging.debug('self.genre=%s', self.genre)
    return self.genre
|
||||
|
||||
def getStoryStatus(self):
    """Return the status stored in ``self.storyStatus``, logging it at debug level."""
    logging.debug('self.storyStatus=%s', self.storyStatus)
    return self.storyStatus
|
||||
|
||||
def getStoryRating(self):
    """Return the rating stored in ``self.storyRating``, logging it at debug level."""
    logging.debug('self.storyRating=%s', self.storyRating)
    return self.storyRating
|
||||
|
||||
def getStoryUserRating(self):
    """Return the user rating stored in ``self.storyUserRating``, logging it at debug level."""
    logging.debug('self.storyUserRating=%s', self.storyUserRating)
    return self.storyUserRating
|
||||
|
||||
def getPrintableUrl(self, url):
    """Return *url* unchanged."""
    return url
|
||||
|
||||
|
|
@ -98,6 +341,15 @@ class FicWad(FanfictionSiteAdapter):
|
|||
other = dict(submit = 'Log In', remember='yes')
|
||||
return (login, other)
|
||||
|
||||
def getStoryCharacters(self):
    """Return the character list stored in ``self.storyCharacters``, logging it at debug level."""
    logging.debug('self.storyCharacters=%s', self.storyCharacters)
    return self.storyCharacters
|
||||
|
||||
def getStorySeries(self):
    """Return the series name stored in ``self.storySeries``, logging it at debug level."""
    logging.debug('self.storySeries=%s', self.storySeries)
    return self.storySeries
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
url = 'http://www.ficwad.com/story/14536'
|
||||
|
|
|
|||
226
hpfiction.py
226
hpfiction.py
|
|
@ -15,6 +15,8 @@ import urllib2 as u2
|
|||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from constants import *
|
||||
from adapter import *
|
||||
|
|
@ -32,8 +34,37 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
|
||||
logging.debug('self.url=%s' % self.url)
|
||||
logging.debug('self.host=%s' % self.host)
|
||||
logging.debug('self.path=%s' % self.path)
|
||||
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
(u1, self.storyId) = self.url.split('=')
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Harry Potter')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'K'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
logging.debug("Created HPFiction: url=%s" % (self.url))
|
||||
|
||||
def _getLoginScript(self):
|
||||
|
|
@ -45,23 +76,116 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
def performLogin(self, url = None):
    """Report a successful login without contacting the site.

    url -- accepted but not used.
    """
    return True
|
||||
|
||||
def _addSubject(self, subject):
    """Append *subject* to ``self.subjects`` unless it is already listed.

    The comparison ignores case. Returns True when the subject was appended
    and False when a case-insensitively equal entry was already present.
    """
    wanted = subject.upper()
    if any(existing.upper() == wanted for existing in self.subjects):
        return False
    self.subjects.append(subject)
    return True
|
||||
|
||||
def _addCharacter(self, character):
    """Append *character* to ``self.storyCharacters`` unless it is already listed.

    The comparison ignores case. Returns True when the character was appended
    and False when a case-insensitively equal entry was already present.
    """
    wanted = character.upper()
    if any(existing.upper() == wanted for existing in self.storyCharacters):
        return False
    self.storyCharacters.append(character)
    return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = self.opener.open(self.url).read()
|
||||
soup = bs.BeautifulSoup(data)
|
||||
|
||||
links = soup.findAll('a')
|
||||
def_chapurl = ''
|
||||
def_chaptitle = ''
|
||||
|
||||
for a in links:
|
||||
if a['href'].find('psid') != -1:
|
||||
self.storyName = a.string
|
||||
logging.debug('self.storyName=%s' % self.storyName)
|
||||
elif a['href'].find('viewuser.php') != -1:
|
||||
self.authorName = a.string
|
||||
self.authorURL = 'http://' + self.host + '/' + a['href']
|
||||
(u1, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
||||
elif a['href'].find('chapterid=') != -1 and len(def_chapurl) == 0:
|
||||
def_chapurl = 'http://' + self.host + '/viewstory.php' + str(a['href'])
|
||||
def_chaptitle = a.string
|
||||
logging.debug('def_chapurl=%s, def_chaptitle=%s' % (def_chapurl, def_chaptitle))
|
||||
|
||||
centers = soup.findAll('center')
|
||||
for center in centers:
|
||||
tds = center.findAll ('td')
|
||||
if tds is not None and len(tds) > 0:
|
||||
for td in tds:
|
||||
s = re.split ("<[^>]+>", str(td).replace('\n','').replace(' ',' '))
|
||||
logging.debug('s=%s' % s)
|
||||
ii = 0
|
||||
ll = len(s)
|
||||
sss = ''
|
||||
while ii < ll - 1:
|
||||
if s[ii] is not None and len(s[ii]) > 0:
|
||||
if s[ii] == 'Rating:':
|
||||
self.storyRating = s[ii+1]
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Chapters:':
|
||||
self.numChapters = s[ii+1]
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Characters:':
|
||||
s2 = s[ii+1].split(', ')
|
||||
for ss2 in s2:
|
||||
self._addCharacter(ss2)
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Genre(s):':
|
||||
self.genre = s[ii+1]
|
||||
logging.debug('self.genre=%s' % self.genre)
|
||||
s2 = s[ii+1].split(', ')
|
||||
for ss2 in s2:
|
||||
self._addSubject(ss2)
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Status:':
|
||||
if s[ii+1].strip(' ') == "Work In Progress":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'First Published:':
|
||||
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Last Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s[ii+1].strip(' '), "%Y.%m.%d")))
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Last Published Chapter:':
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Pairings:':
|
||||
ii = ii + 2
|
||||
elif s[ii] == 'Warnings:':
|
||||
ii = ii + 2
|
||||
else:
|
||||
sss = sss + ' ' + s[ii]
|
||||
ii = ii + 1
|
||||
else:
|
||||
ii = ii + 1
|
||||
self.storyDescription = sss
|
||||
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
||||
|
||||
urls = []
|
||||
self.outputName = self.storyName.replace(" ", "_") + '-hp_' + self.storyId
|
||||
|
||||
select = soup.find('select', {'name' : 'chapterid'})
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
urls.append((self.url,self.storyName))
|
||||
if len(def_chapurl) > 0:
|
||||
urls.append((def_chapurl, def_chaptitle))
|
||||
else:
|
||||
urls.append((self.url,self.storyName))
|
||||
else:
|
||||
for o in select.findAll('option'):
|
||||
if 'value' in o._getAttrMap():
|
||||
|
|
@ -69,11 +193,18 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
title = o.string
|
||||
if title != "Story Index":
|
||||
urls.append((url,title))
|
||||
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
return urls
|
||||
|
||||
def getStoryName(self):
    """Return the story title stored in ``self.storyName``."""
    return self.storyName
|
||||
|
||||
def getOutputName(self):
    """Return the output name stored in ``self.outputName``."""
    return self.outputName
|
||||
|
||||
def getAuthorName(self):
    """Return the author's name stored in ``self.authorName``."""
    return self.authorName
|
||||
|
||||
|
|
@ -84,9 +215,100 @@ class HPFiction(FanfictionSiteAdapter):
|
|||
divtext = soup.find('div', {'id' : 'fluidtext'})
|
||||
if None == divtext:
|
||||
logging.error("Error downloading Chapter: %s" % url)
|
||||
exit(1)
|
||||
exit(20)
|
||||
return divtext.__str__('utf8')
|
||||
|
||||
def getAuthorId(self):
    """Return the author id stored in ``self.authorId``, logging it at debug level."""
    logging.debug('self.authorId=%s', self.authorId)
    return self.authorId
|
||||
|
||||
def getStoryId(self):
    """Return the story id stored in ``self.storyId``, logging it at debug level."""
    logging.debug('self.storyId=%s', self.storyId)
    return self.storyId
|
||||
|
||||
def getStoryDescription(self):
    """Return the summary stored in ``self.storyDescription``, logging it at debug level."""
    logging.debug('self.storyDescription=%s', self.storyDescription)
    return self.storyDescription
|
||||
|
||||
def getStoryPublished(self):
    """Return the publication date stored in ``self.storyPublished``, logging it at debug level."""
    logging.debug('self.storyPublished=%s', self.storyPublished)
    return self.storyPublished
|
||||
|
||||
def getStoryCreated(self):
    """Stamp ``self.storyCreated`` with the current local time and return it.

    Every call overwrites the stored value with ``datetime.datetime.now()``,
    so two calls may return different timestamps.
    """
    self.storyCreated = datetime.datetime.now()
    logging.debug('self.storyCreated=%s', self.storyCreated)
    return self.storyCreated
|
||||
|
||||
def getStoryUpdated(self):
    """Return the last-update date stored in ``self.storyUpdated``, logging it at debug level."""
    logging.debug('self.storyUpdated=%s', self.storyUpdated)
    return self.storyUpdated
|
||||
|
||||
def getLanguage(self):
    """Return the language name stored in ``self.language``, logging it at debug level."""
    logging.debug('self.language=%s', self.language)
    return self.language
|
||||
|
||||
def getLanguageId(self):
    """Return the language code stored in ``self.languageId``, logging it at debug level."""
    logging.debug('self.languageId=%s', self.languageId)
    return self.languageId
|
||||
|
||||
def getSubjects(self):
    """Return the subject list stored in ``self.subjects``, logging it at debug level."""
    # Log the subjects themselves; the message label says 'self.subjects',
    # but the author name used to be printed here instead.
    logging.debug('self.subjects=%s', self.subjects)
    return self.subjects
|
||||
|
||||
def getPublisher(self):
    """Return the publisher stored in ``self.publisher``, logging it at debug level."""
    logging.debug('self.publisher=%s', self.publisher)
    return self.publisher
|
||||
|
||||
def getNumChapters(self):
    """Return the chapter count stored in ``self.numChapters``, logging it at debug level."""
    logging.debug('self.numChapters=%s', self.numChapters)
    return self.numChapters
|
||||
|
||||
def getNumWords(self):
    """Return the word count stored in ``self.numWords``, logging it at debug level."""
    logging.debug('self.numWords=%s', self.numWords)
    return self.numWords
|
||||
|
||||
def getStoryURL(self):
    """Return the story URL stored in ``self.url``, logging it at debug level."""
    logging.debug('self.url=%s', self.url)
    return self.url
|
||||
|
||||
def getAuthorURL(self):
    """Return the author page URL stored in ``self.authorURL``, logging it at debug level."""
    logging.debug('self.authorURL=%s', self.authorURL)
    return self.authorURL
|
||||
|
||||
def getUUID(self):
    """Return the identifier string stored in ``self.uuid``, logging it at debug level."""
    logging.debug('self.uuid=%s', self.uuid)
    return self.uuid
|
||||
|
||||
def getCategory(self):
    """Return the category stored in ``self.category``, logging it at debug level."""
    logging.debug('self.category=%s', self.category)
    return self.category
|
||||
|
||||
def getGenre(self):
    """Return the genre stored in ``self.genre``, logging it at debug level."""
    logging.debug('self.genre=%s', self.genre)
    return self.genre
|
||||
|
||||
def getStoryStatus(self):
    """Return the status stored in ``self.storyStatus``, logging it at debug level."""
    logging.debug('self.storyStatus=%s', self.storyStatus)
    return self.storyStatus
|
||||
|
||||
def getStoryRating(self):
    """Return the rating stored in ``self.storyRating``, logging it at debug level."""
    logging.debug('self.storyRating=%s', self.storyRating)
    return self.storyRating
|
||||
|
||||
def getStoryUserRating(self):
    """Return the user rating stored in ``self.storyUserRating``, logging it at debug level."""
    logging.debug('self.storyUserRating=%s', self.storyUserRating)
    return self.storyUserRating
|
||||
|
||||
def getStoryCharacters(self):
    """Return the character list stored in ``self.storyCharacters``, logging it at debug level."""
    logging.debug('self.storyCharacters=%s', self.storyCharacters)
    return self.storyCharacters
|
||||
|
||||
def getStorySeries(self):
    """Return the series name stored in ``self.storySeries``, logging it at debug level."""
    logging.debug('self.storySeries=%s', self.storySeries)
    return self.storySeries
|
||||
|
||||
|
||||
|
||||
class FF_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
|
|
|||
67
output.py
67
output.py
|
|
@ -26,6 +26,7 @@ from constants import *
|
|||
|
||||
|
||||
import html2text
|
||||
import datetime
|
||||
|
||||
|
||||
class FanficWriter:
|
||||
|
|
@ -41,8 +42,8 @@ class FanficWriter:
|
|||
class TextWriter(FanficWriter):
|
||||
htmlWriter = None
|
||||
|
||||
def __init__(self, base, name, author, inmemory=False, compress=False):
|
||||
self.htmlWriter = HTMLWriter(base, name, author, True, False)
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False):
|
||||
self.htmlWriter = HTMLWriter(base, adapter, True, False)
|
||||
|
||||
def writeChapter(self, index, title, text):
|
||||
self.htmlWriter.writeChapter(index, title, text)
|
||||
|
|
@ -57,12 +58,13 @@ class TextWriter(FanficWriter):
|
|||
class HTMLWriter(FanficWriter):
|
||||
body = ''
|
||||
|
||||
def __init__(self, base, name, author, inmemory=False, compress=False):
|
||||
def __init__(self, base, adapter, inmemory=False, compress=False):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(name)
|
||||
self.name = makeAcceptableFilename(name)
|
||||
self.fileName = self.basePath + '/' + self.name + '.html'
|
||||
self.authorName = removeEntities(author)
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.fileName = self.basePath + '/' + self.name + '.html'
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.adapter = adapter
|
||||
|
||||
self.inmemory = inmemory
|
||||
|
||||
|
|
@ -131,14 +133,14 @@ class EPubFanficWriter(FanficWriter):
|
|||
for f in self.files:
|
||||
self.files[f].close()
|
||||
|
||||
def __init__(self, base, name, author, inmemory=False, compress=True):
|
||||
def __init__(self, base, adapter, inmemory=False, compress=True):
|
||||
self.basePath = base
|
||||
self.storyTitle = removeEntities(name)
|
||||
self.name = makeAcceptableFilename(name)
|
||||
self.storyTitle = removeEntities(adapter.getStoryName())
|
||||
self.name = makeAcceptableFilename(adapter.getOutputName())
|
||||
self.directory = self.basePath + '/' + self.name
|
||||
self.authorName = removeEntities(author)
|
||||
|
||||
self.authorName = removeEntities(adapter.getAuthorName())
|
||||
self.inmemory = inmemory
|
||||
self.adapter = adapter
|
||||
|
||||
self.files = {}
|
||||
self.chapters = []
|
||||
|
|
@ -226,17 +228,50 @@ class EPubFanficWriter(FanficWriter):
|
|||
tocFilePath = "OEBPS/toc.ncx"
|
||||
# toc = open(tocFilePath, 'w')
|
||||
# print >> toc, TOC_START % self.storyTitle
|
||||
self._writeFile(tocFilePath, TOC_START % self.storyTitle)
|
||||
self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))
|
||||
|
||||
published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
|
||||
createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
|
||||
created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
|
||||
updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
|
||||
calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
### writing content -- title page
|
||||
titleFilePath = "OEBPS/title_page.xhtml"
|
||||
self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryDescription()))
|
||||
|
||||
### writing content -- opf file
|
||||
opfFilePath = "OEBPS/content.opf"
|
||||
|
||||
|
||||
# opf = open(opfFilePath, 'w')
|
||||
self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))
|
||||
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, self.adapter.getStoryDescription()))
|
||||
|
||||
i = 0
|
||||
subjs = []
|
||||
subjs = self.adapter.getSubjects()
|
||||
for subj in subjs:
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
|
||||
i = i + 1
|
||||
if (i <= 0):
|
||||
self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")
|
||||
|
||||
self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
|
||||
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
|
||||
|
||||
ids = []
|
||||
|
||||
i = 1
|
||||
i = 0
|
||||
|
||||
t = "Title Page"
|
||||
f = "title_page.xhtml"
|
||||
chapterId = "Title Page"
|
||||
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
|
||||
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
|
||||
|
||||
ids.append(chapterId)
|
||||
|
||||
i = i + 1
|
||||
|
||||
for t,f in self.chapters:
|
||||
chapterId = "chapter%04d" % i
|
||||
|
||||
|
|
|
|||
439
twilighted.py
439
twilighted.py
|
|
@ -11,119 +11,360 @@ import urllib2 as u2
|
|||
import urlparse as up
|
||||
import BeautifulSoup as bs
|
||||
import htmlentitydefs as hdefs
|
||||
import time
|
||||
import datetime
|
||||
|
||||
from adapter import *
|
||||
import twipassword
|
||||
|
||||
class Twilighted(FanfictionSiteAdapter):
|
||||
def __init__(self, url):
    """Create an adapter for the story at *url*.

    Stores the URL, its host and path, a cookie-aware URL opener, and the
    default login credentials.
    """
    self.url = url
    parsedUrl = up.urlparse(url)
    self.host = parsedUrl.netloc
    self.path = parsedUrl.path
    self.opener = u2.build_opener(u2.HTTPCookieProcessor())
    # NOTE(review): default credentials are baked in here (password from the
    # twipassword module, fixed login name); setLogin/setPassword override them.
    self.password = twipassword.password
    self.login = 'sigizmund'
    logging.debug("Created Twilighted: url=%s", self.url)
|
||||
|
||||
|
||||
def requiresLogin(self, url = None):
    """Return True unless ``self.host`` is 'potionsandsnitches.net'.

    url -- accepted but not used.
    """
    # potionsandsnitches.net doesn't require login.
    return self.host != 'potionsandsnitches.net'
|
||||
|
||||
def performLogin(self, url = None):
    """Post the stored credentials to the site's login script.

    url -- accepted but not used.

    Returns True when ``self.reqLoginData`` reports that the response no
    longer asks for a login, and False otherwise.
    """
    data = {
        'penname': self.login,
        'password': self.password,
        'cookiecheck': '1',
        'submit': 'Submit',
    }

    urlvals = u.urlencode(data)
    loginUrl = 'http://' + self.host + self._getLoginScript()
    logging.debug("Will now login to URL %s", loginUrl)

    req = self.opener.open(loginUrl, urlvals)

    d = req.read().decode('utf-8')

    # A response that still requests login data means the login failed.
    return not self.reqLoginData(d)
|
||||
|
||||
|
||||
def setLogin(self, login):
    """Store *login* in ``self.login``."""
    self.login = login
|
||||
|
||||
def setPassword(self, password):
    """Store *password* in ``self.password``."""
    self.password = password
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
data = self.opener.open(self.url).read()
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
parsedUrl = up.urlparse(url)
|
||||
self.host = parsedUrl.netloc
|
||||
self.path = parsedUrl.path
|
||||
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
||||
self.password=twipassword.password
|
||||
self.login='sigizmund'
|
||||
self.storyDescription = 'Fanfiction Story'
|
||||
self.authorId = '0'
|
||||
self.authorURL = ''
|
||||
self.storyId = '0'
|
||||
self.storyPublished = datetime.date(1970, 01, 31)
|
||||
self.storyCreated = datetime.datetime.now()
|
||||
self.storyUpdated = datetime.date(1970, 01, 31)
|
||||
self.languageId = 'en-UK'
|
||||
self.language = 'English'
|
||||
self.subjects = []
|
||||
self.subjects.append ('fanfiction')
|
||||
self.subjects.append ('Twilight')
|
||||
self.publisher = self.host
|
||||
self.numChapters = 0
|
||||
self.numWords = 0
|
||||
self.genre = 'FanFiction'
|
||||
self.category = 'Category'
|
||||
self.storyStatus = 'In-Progress'
|
||||
self.storyRating = 'PG'
|
||||
self.storyUserRating = '0'
|
||||
self.storyCharacters = []
|
||||
self.storySeries = ''
|
||||
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
data = self.opener.open(self.url).read()
|
||||
if self.reqLoginData(data):
|
||||
return None
|
||||
logging.debug("Created Twilighted: url=%s" % (self.url))
|
||||
|
||||
|
||||
def requiresLogin(self, url = None):
    """Return False for the one host that is readable anonymously, True otherwise.

    The ``url`` argument is accepted but not consulted.
    """
    needs_login = True
    # potionsandsnitches.net doesn't require login.
    if self.host == 'potionsandsnitches.net':
        needs_login = False
    return needs_login
|
||||
|
||||
def performLogin(self, url = None):
|
||||
data = {}
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
title = soup.find('title').string
|
||||
self.storyName = title.split(' by ')[0].strip()
|
||||
self.authorName = title.split(' by ')[1].strip()
|
||||
|
||||
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
return result
|
||||
|
||||
def getStoryName(self):
    """Return the story name stored on the adapter."""
    name = self.storyName
    return name
|
||||
|
||||
def getAuthorName(self):
    """Return the author name stored on the adapter."""
    name = self.authorName
    return name
|
||||
|
||||
def getText(self, url):
|
||||
if url.find('http://') == -1:
|
||||
url = 'http://' + self.host + '/' + url
|
||||
|
||||
logging.debug('Getting data from: %s' % url)
|
||||
|
||||
data = self.opener.open(url).read()
|
||||
data['penname'] = self.login
|
||||
data['password'] = self.password
|
||||
data['cookiecheck'] = '1'
|
||||
data['submit'] = 'Submit'
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
|
||||
urlvals = u.urlencode(data)
|
||||
loginUrl = 'http://' + self.host + self._getLoginScript()
|
||||
logging.debug("Will now login to URL %s" % loginUrl)
|
||||
|
||||
req = self.opener.open(loginUrl, urlvals)
|
||||
|
||||
d = req.read().decode('utf-8')
|
||||
|
||||
if self.reqLoginData(d) :
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
div = soup.find('div', {'id' : 'story'})
|
||||
|
||||
if None == div:
|
||||
return '<html/>'
|
||||
def setLogin(self, login):
    """Store ``login`` as the pen name sent when logging in."""
    self.__dict__['login'] = login
|
||||
|
||||
return div.__str__('utf8')
|
||||
def setPassword(self, password):
    """Store ``password`` as the password sent when logging in."""
    self.__dict__['password'] = password
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
def _addSubject(self, subject):
|
||||
subj = subject.upper()
|
||||
for s in self.subjects:
|
||||
if s.upper() == subj:
|
||||
return False
|
||||
self.subjects.append(subject)
|
||||
return True
|
||||
|
||||
def reqLoginData(self, data):
    """Return True when ``data`` contains one of the site's login-prompt messages."""
    prompts = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    for prompt in prompts:
        if data.find(prompt) != -1:
            return True
    return False
|
||||
def _addCharacter(self, character):
|
||||
chara = character.upper()
|
||||
for c in self.storyCharacters:
|
||||
if c.upper() == chara:
|
||||
return False
|
||||
self.storyCharacters.append(character)
|
||||
return True
|
||||
|
||||
def extractIndividualUrls(self):
|
||||
# Fetch the story page, log in if the site asks for it, and build the chapter list.
# Returns a list of (url, title) tuples, or None when the login prompt is still
# shown after performLogin(). Also sets storyName, authorName and outputName, and
# scrapes the "&index=1" page for description, ids, category/subjects, characters,
# status, rating, series, chapter/word counts and dates before composing self.uuid.
# NOTE(review): indentation is missing in this rendering, so the nesting of the
# statements below cannot be read off the text; the comments describe single lines.
data = self.opener.open(self.url).read()
|
||||
|
||||
if self.reqLoginData(data):
|
||||
self.performLogin()
|
||||
data = self.opener.open(self.url).read()
|
||||
if self.reqLoginData(data):
|
||||
return None
|
||||
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
|
||||
# The <title> text is split on ' by ': the first part becomes the story name and the
# second the author name. A title without ' by ' makes the [1] index raise IndexError.
title = soup.find('title').string
|
||||
self.storyName = title.split(' by ')[0].strip()
|
||||
self.authorName = title.split(' by ')[1].strip()
|
||||
self.outputName = self.storyName.replace(" ", "_")
|
||||
|
||||
# Chapters come from the <option> entries of the <select name="chapter"> element.
select = soup.find('select', { 'name' : 'chapter' } )
|
||||
|
||||
result = []
|
||||
if select is None:
|
||||
# no chapters found, try url by itself.
|
||||
result.append((self.url,self.storyName))
|
||||
else:
|
||||
allOptions = select.findAll('option')
|
||||
for o in allOptions:
|
||||
url = self.url + "&chapter=%s" % o['value']
|
||||
title = o.string
|
||||
result.append((url,title))
|
||||
|
||||
# Second fetch: the story URL with "&index=1" appended is the source of the metadata.
url = self.url + "&index=1"
|
||||
data = self.opener.open(url).read()
|
||||
# NOTE(review): 'lines' is not referenced again in this function.
lines = data.split('\n')
|
||||
soup = bs.BeautifulStoneSoup(data)
|
||||
metas = soup.findAll('meta')
|
||||
for meta in metas:
|
||||
if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
|
||||
#logging.debug('Meta: %s' % meta)
|
||||
if 'content' in meta._getAttrMap():
|
||||
# storyDescription is set to the first <p> parsed out of the 'content' attribute.
s1 = bs.BeautifulStoneSoup(meta['content'])
|
||||
ps = s1.findAll('p')
|
||||
if len(ps) > 0:
|
||||
self.storyDescription = ps[0]
|
||||
logging.debug('self.storyDescription=%s' % (self.storyDescription))
|
||||
# NOTE(review): which 'if' this 'else' pairs with cannot be determined from the
# de-indented text — confirm against the real file before changing anything here.
else:
|
||||
divs = meta.findAll('div')
|
||||
#logging.debug('Divs: %s' % divs)
|
||||
|
||||
for div in divs:
|
||||
#logging.debug('Div: %s' % div)
|
||||
# Links inside a div whose id contains 'pagetitle' carry the story id
# (viewstory.php?sid=) and the author id (viewuser.php?uid=).
if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
|
||||
#logging.debug('Div PAGETITLE: %s' % div)
|
||||
allA = div.findAll('a')
|
||||
for a in allA:
|
||||
if 'href' in a._getAttrMap():
|
||||
if a['href'].find('viewstory.php?sid=') != -1:
|
||||
str1 = a.string
|
||||
# Two-name unpacking: the href must contain exactly one '=' or this raises ValueError.
(vs, self.storyId) = a['href'].split('=')
|
||||
logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
|
||||
self.outputName = self.outputName + "-tw_" + self.storyId
|
||||
logging.debug('self.outputName=%s' % self.outputName)
|
||||
if a['href'].find('viewuser.php?uid=') != -1:
|
||||
str1 = a.string
|
||||
(vs, self.authorId) = a['href'].split('=')
|
||||
logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
|
||||
self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
|
||||
logging.debug('self.authorURL=%s' % self.authorURL)
|
||||
# A div whose class contains 'content' is scanned for label/value text fragments.
if 'class' in div._getAttrMap() and div['class'].find('content') != -1:
|
||||
#logging.debug('Div CONTENT: %s' % div)
|
||||
brs = div.findAll('br')
|
||||
for br in brs:
|
||||
buf = unicode(br).encode('utf-8')
|
||||
# Splitting on tags leaves the text fragments that sat between them.
strs = re.split ('<[^>]+>', buf)
|
||||
#logging.debug('BUF: %s' % strs)
|
||||
# Scanning starts at index 2 — presumably to skip leading fragments; TODO confirm.
ii = 2
|
||||
stlen = len(strs)
|
||||
while stlen > ii+1:
|
||||
if len(strs[ii]) == 0:
|
||||
ii = ii+1
|
||||
continue
|
||||
# Labels are compared exactly, including their spaces ('Categories:' vs
# 'Characters: ' vs ' Published: ').
if strs[ii] == 'Categories:':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
if self.category == 'Category':
|
||||
self.category = strs[ii].strip(' ')
|
||||
self._addSubject(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.subjects=%s' % self.subjects)
|
||||
# NOTE(review): plain 'if' (not 'elif'), so this test starts a separate chain from
# the 'Categories:' test above; ii may already have been advanced by that scan.
if strs[ii] == 'Characters: ':
|
||||
ii = ii+1
|
||||
while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
|
||||
if strs[ii] != ' ' and strs[ii] != ', ':
|
||||
self._addCharacter(strs[ii].strip(' '))
|
||||
ii = ii+1
|
||||
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
||||
elif strs[ii] == 'Completed:':
|
||||
if strs[ii+1].strip(' ') == "No":
|
||||
self.storyStatus = 'In-Progress'
|
||||
else:
|
||||
self.storyStatus = 'Completed'
|
||||
ii = ii+2
|
||||
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
||||
elif strs[ii] == 'Rated:':
|
||||
self.storyRating = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.storyRating=%s' % self.storyRating)
|
||||
elif strs[ii] == 'Series:':
|
||||
self.storySeries = strs[ii+1].strip(' ')
|
||||
# A series value of 'None' is stored as the empty string.
if self.storySeries == 'None':
|
||||
self.storySeries = ''
|
||||
ii = ii+2
|
||||
logging.debug('self.storySeries=%s' % self.storySeries)
|
||||
elif strs[ii] == 'Chapters: ':
|
||||
# The count is stored as the stripped text; it is not converted to int here.
self.numChapters = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numChapters=%s' % self.numChapters)
|
||||
elif strs[ii] == 'Word count:':
|
||||
self.numWords = strs[ii+1].strip(' ')
|
||||
ii = ii+2
|
||||
logging.debug('self.numWords=%s' % self.numWords)
|
||||
elif strs[ii] == ' Published: ':
|
||||
# Dates are parsed with the "%B %d, %Y" format and stored as datetime objects.
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
||||
elif strs[ii] == 'Updated:':
|
||||
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
|
||||
ii = ii+2
|
||||
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
||||
else:
|
||||
# Unrecognised label: log it with the following fragment and skip both.
logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
|
||||
ii = ii+2
|
||||
|
||||
# Compose the uuid from the host, the author id and the story id.
self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
|
||||
logging.debug('self.uuid=%s' % self.uuid)
|
||||
|
||||
return result
|
||||
|
||||
def getStoryName(self):
    """Return ``self.storyName``."""
    story_name = self.storyName
    return story_name
|
||||
|
||||
def getOutputName(self):
    """Return ``self.outputName``."""
    output_name = self.outputName
    return output_name
|
||||
|
||||
def getAuthorName(self):
    """Return ``self.authorName``."""
    author_name = self.authorName
    return author_name
|
||||
|
||||
def getText(self, url):
    """Fetch a chapter page and return the markup of its story ``div``.

    ``url`` may be absolute or relative; a relative value is resolved
    against ``http://<self.host>/``. The page is read through
    ``self.opener`` and parsed with HTML entities converted.

    Returns the ``<div id="story">`` element serialised as UTF-8, or the
    placeholder ``'<html/>'`` when the page has no such element.
    """
    # Treat both schemes as absolute. The previous substring test for
    # 'http://' prefixed https:// URLs with the host and mangled them.
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + self.host + '/' + url

    logging.debug('Getting data from: %s' % url)

    data = self.opener.open(url).read()

    soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)

    div = soup.find('div', {'id' : 'story'})

    # Identity test: a missing element is reported by find() as None.
    if div is None:
        return '<html/>'

    return div.__str__('utf8')
|
||||
|
||||
def _getLoginScript(self):
|
||||
return '/user.php?action=login'
|
||||
|
||||
def reqLoginData(self, data):
    """Return True when ``data`` shows that a (successful) login is still needed.

    Two site messages count: the "registered users only" notice and the
    "no such account" error. Anything else yields False.
    """
    markers = (
        'Registered Users Only. Please click OK to login or register.',
        'There is no such account on our website',
    )
    return any(data.find(marker) != -1 for marker in markers)
|
||||
|
||||
def getStoryURL(self):
    """Return ``self.url`` after logging it at debug level."""
    story_url = self.url
    logging.debug('self.url=%s' % story_url)
    return story_url
|
||||
|
||||
def getAuthorURL(self):
    """Return ``self.authorURL`` after logging it at debug level."""
    author_url = self.authorURL
    logging.debug('self.authorURL=%s' % author_url)
    return author_url
|
||||
|
||||
def getUUID(self):
    """Return ``self.uuid`` after logging it at debug level."""
    identifier = self.uuid
    logging.debug('self.uuid=%s' % identifier)
    return identifier
|
||||
|
||||
def getStoryDescription(self):
    """Return ``self.storyDescription`` after logging it at debug level."""
    description = self.storyDescription
    logging.debug('self.storyDescription=%s' % description)
    return description
|
||||
|
||||
def getStoryPublished(self):
    """Return ``self.storyPublished`` after logging it at debug level."""
    published = self.storyPublished
    logging.debug('self.storyPublished=%s' % published)
    return published
|
||||
|
||||
def getStoryCreated(self):
    """Stamp ``self.storyCreated`` with the current local time and return it.

    The attribute is overwritten on every call; the new value is logged
    at debug level.
    """
    stamp = datetime.datetime.now()
    self.storyCreated = stamp
    logging.debug('self.storyCreated=%s' % stamp)
    return stamp
|
||||
|
||||
def getStoryUpdated(self):
    """Return ``self.storyUpdated`` after logging it at debug level."""
    updated = self.storyUpdated
    logging.debug('self.storyUpdated=%s' % updated)
    return updated
|
||||
|
||||
def getLanguage(self):
    """Return ``self.language`` after logging it at debug level."""
    language = self.language
    logging.debug('self.language=%s' % language)
    return language
|
||||
|
||||
def getLanguageId(self):
    """Return ``self.languageId`` after logging it at debug level."""
    language_id = self.languageId
    logging.debug('self.languageId=%s' % language_id)
    return language_id
|
||||
|
||||
def getSubjects(self):
    """Return the list of subjects collected for the story.

    The list object itself (``self.subjects``) is returned, not a copy.
    Its contents are logged at debug level.
    """
    # The message used to interpolate self.authorName under the
    # 'self.subjects' label, which logged the wrong value and raised
    # AttributeError whenever authorName had not been set yet.
    logging.debug('self.subjects=%s' % self.subjects)
    return self.subjects
|
||||
|
||||
def getPublisher(self):
    """Return ``self.publisher`` after logging it at debug level."""
    publisher = self.publisher
    logging.debug('self.publisher=%s' % publisher)
    return publisher
|
||||
|
||||
def getNumChapters(self):
    """Return ``self.numChapters`` after logging it at debug level."""
    chapter_count = self.numChapters
    logging.debug('self.numChapters=%s' % chapter_count)
    return chapter_count
|
||||
|
||||
def getNumWords(self):
    """Return ``self.numWords`` after logging it at debug level."""
    word_count = self.numWords
    logging.debug('self.numWords=%s' % word_count)
    return word_count
|
||||
|
||||
def getAuthorId(self):
    """Return ``self.authorId`` after logging it at debug level."""
    author_id = self.authorId
    logging.debug('self.authorId=%s' % author_id)
    return author_id
|
||||
|
||||
def getStoryId(self):
    """Return ``self.storyId`` after logging it at debug level."""
    story_id = self.storyId
    logging.debug('self.storyId=%s' % story_id)
    return story_id
|
||||
|
||||
def getCategory(self):
    """Return ``self.category`` after logging it at debug level."""
    category = self.category
    logging.debug('self.category=%s' % category)
    return category
|
||||
|
||||
def getGenre(self):
    """Return ``self.genre`` after logging it at debug level."""
    genre = self.genre
    logging.debug('self.genre=%s' % genre)
    return genre
|
||||
|
||||
def getStoryStatus(self):
    """Return ``self.storyStatus`` after logging it at debug level."""
    status = self.storyStatus
    logging.debug('self.storyStatus=%s' % status)
    return status
|
||||
|
||||
def getStoryRating(self):
    """Return ``self.storyRating`` after logging it at debug level."""
    rating = self.storyRating
    logging.debug('self.storyRating=%s' % rating)
    return rating
|
||||
|
||||
def getStoryUserRating(self):
    """Return ``self.storyUserRating`` after logging it at debug level."""
    user_rating = self.storyUserRating
    logging.debug('self.storyUserRating=%s' % user_rating)
    return user_rating
|
||||
|
||||
def getStoryCharacters(self):
    """Return the ``self.storyCharacters`` list itself after logging it at debug level."""
    characters = self.storyCharacters
    logging.debug('self.storyCharacters=%s' % characters)
    return characters
|
||||
|
||||
def getStorySeries(self):
    """Return ``self.storySeries`` after logging it at debug level."""
    series = self.storySeries
    logging.debug('self.storySeries=%s' % series)
    return series
|
||||
|
||||
class Twilighted_UnitTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
|
|
|
|||
Loading…
Reference in a new issue