Fix up exception handling; first working App Engine (SDK) version.

This commit is contained in:
Jim Miller 2011-05-04 17:54:36 -05:00
parent 547411666d
commit 94669a2179
16 changed files with 159 additions and 149 deletions

View file

@ -1,6 +1,6 @@
# fanfictionloader
application: fanfictionloader
version: 3-0-2
application: ffd-retief
version: 4-0-0
runtime: python
api_version: 1

View file

@ -55,7 +55,7 @@ safe_filename: true
extratags: FanFiction
## number of seconds to sleep between calls to the story site.
slow_down_sleep_time:0.5
#slow_down_sleep_time:0.5
## Each output format has a section that overrides [defaults]
@ -87,7 +87,7 @@ wide_titlepage_entries: description, storyUrl, author URL
## Each site has a section that overrides [defaults] *and* the format section
[test1.com]
titlepage_entries: title,description,category,genre, status,dateCreated,rating,numChapters,numWords,extratags,description,storyUrl,extratags
#titlepage_entries: title,description,category,genre, status,dateCreated,rating,numChapters,numWords,extratags,description,storyUrl,extratags
extratags: FanFiction,Testing
## If necessary, you can define [<site>:<format>] sections to customize

View file

@ -5,6 +5,8 @@ from os.path import dirname, basename, normpath
import logging
import urlparse as up
import fanficdownloader.exceptions as exceptions
## This bit of complexity allows adapters to be added by just adding
## the source file. It eliminates the long if/else clauses we used to
## need to pick out the adapter.
@ -21,11 +23,10 @@ def getAdapter(config,url):
adapter = cls(config,url) # raises InvalidStoryURL
return adapter
# No adapter found.
raise UnknownSite( url, (cls.getSiteDomain() for cls in __class_list) )
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )
## Automatically import each adapter_*.py file.
## Each must call _register_handler() with their class to be
## registered.
## Each must implement getClass() to return their class
filelist = glob.glob(dirname(__file__)+'/adapter_*.py')
sys.path.insert(0,normpath(dirname(__file__)))

View file

@ -5,8 +5,10 @@ import datetime
import logging
import re
import urllib2
import time
import fanficdownloader.BeautifulSoup as bs
import fanficdownloader.exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup
@ -50,7 +52,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulSoup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
@ -166,14 +168,16 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
def getChapterText(self, url):
logging.debug('Getting chapter text from: %s' % url)
time.sleep(0.5) ## ffnet tends to fail more if hit too fast.
## This is in addition to whatever the
## slow_down_sleep_time setting is.
soup = bs.BeautifulStoneSoup(self._fetchUrl(url),
selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
span = soup.find('div', {'id' : 'storytext'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)

File diff suppressed because one or more lines are too long

View file

@ -9,6 +9,7 @@ import urllib2
import fanficdownloader.BeautifulSoup as bs
from fanficdownloader.htmlcleanup import stripHTML
import fanficdownloader.exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup
@ -74,7 +75,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
if self.needToLoginCheck(d) :
logging.info("Failed to login to URL %s as %s" % (loginUrl,
data['penname']))
raise adapters.FailedToLogin(url,data['penname'])
raise exceptions.FailedToLogin(url,data['penname'])
return False
else:
return True
@ -88,7 +89,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
data = self._fetchUrl(url)
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
@ -190,7 +191,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
span = soup.find('div', {'id' : 'story'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)

View file

@ -7,6 +7,7 @@ import re
import urllib2
import fanficdownloader.BeautifulSoup as bs
import fanficdownloader.exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup
@ -44,7 +45,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulSoup(self._fetchUrl(url))
except urllib2.HTTPError, e:
if e.code == 404:
raise adapters.StoryDoesNotExist(self.url)
raise exceptions.StoryDoesNotExist(self.url)
else:
raise e
@ -173,7 +174,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
span = soup.find('span', {'style' : 'font-size: 100%;'})
if None == span:
raise adapters.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)

View file

@ -9,7 +9,7 @@ import urlparse as up
from fanficdownloader.story import Story
from fanficdownloader.configurable import Configurable
from fanficdownloader.htmlcleanup import removeEntities, removeAllEntities, stripHTML
from fanficdownloader.adapters.exceptions import InvalidStoryURL
from fanficdownloader.exceptions import InvalidStoryURL
class BaseSiteAdapter(Configurable):
@ -29,6 +29,7 @@ class BaseSiteAdapter(Configurable):
self.addConfigSection(self.getSiteDomain())
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.storyDone = False
self.metadataDone = False
self.story = Story()
self.story.setMetadata('site',self.getSiteDomain())
self.story.setMetadata('dateCreated',datetime.datetime.now())
@ -58,13 +59,19 @@ class BaseSiteAdapter(Configurable):
# Does the download the first time it's called.
def getStory(self):
if not self.storyDone:
self.extractChapterUrlsAndMetadata()
self.getStoryMetadataOnly()
for (title,url) in self.chapterUrls:
self.story.addChapter(removeEntities(title),
removeEntities(self.getChapterText(url)))
self.storyDone = True
return self.story
def getStoryMetadataOnly(self):
if not self.metadataDone:
self.extractChapterUrlsAndMetadata()
self.metadataDone = True
return self.story
###############################
@staticmethod

View file

@ -14,7 +14,7 @@ class InvalidStoryURL(Exception):
self.example=example
def __str__(self):
return "Bad Story URL: %s\nFor site: %s\nExample: %s" % (self.url, self.domain, self.example)
return "Bad Story URL: (%s) for site: (%s) Example: (%s)" % (self.url, self.domain, self.example)
class FailedToLogin(Exception):
def __init__(self,url,username):
@ -22,14 +22,14 @@ class FailedToLogin(Exception):
self.username=username
def __str__(self):
return "Failed to Login for URL: %s with username: %s" % (self.url, self.username)
return "Failed to Login for URL: (%s) with username: (%s)" % (self.url, self.username)
class StoryDoesNotExist(Exception):
def __init__(self,url):
self.url=url
def __str__(self):
return "Story Does Not Exit: " + self.url
return "Story does not exist: (%s)" % self.url
class UnknownSite(Exception):
def __init__(self,url,supported_sites_list):
@ -37,5 +37,5 @@ class UnknownSite(Exception):
self.supported_sites_list=supported_sites_list
def __str__(self):
return "Unknown Site("+self.url+"). Supported sites: "+", ".join(self.supported_sites_list)
return "Unknown Site(%s). Supported sites: (%s)" % (self.url, ", ".join(self.supported_sites_list))

View file

@ -65,8 +65,8 @@ src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
<div id='typeoptions'>
<input type='radio' name='format' value='epub' checked>EPub</input>
<input type='radio' name='format' value='html'>HTML</input>
<input type='radio' name='format' value='text'>Plain Text</input>
<input type='radio' name='format' value='mobi'>Mobi (Kindle)</input>
<input type='radio' name='format' value='txt'>Plain Text</input>
<p><i>For Mobi (Kindle) select EPub and Convert it.</i></p>
</div>
</div>

224
main.py
View file

@ -15,10 +15,15 @@
# limitations under the License.
#
import logging
logging.getLogger().setLevel(logging.DEBUG)
import os
from os.path import dirname, basename, normpath
import sys
import zlib
import logging
import urllib
import traceback
import StringIO
@ -42,6 +47,9 @@ from fanficdownloader.zipdir import *
from ffstorage import *
from fanficdownloader import adapters, writers
import ConfigParser
class LoginRequired(webapp.RequestHandler):
def get(self):
user = users.get_current_user()
@ -104,29 +112,29 @@ class FileServer(webapp.RequestHandler):
name = fanfic.name.encode('utf-8')
name = makeAcceptableFilename(name)
#name = urllib.quote(name)
logging.info("Serving file: %s" % name)
if fanfic.format == 'epub':
if name.endswith('.epub'):
self.response.headers['Content-Type'] = 'application/epub+zip'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.epub'
elif fanfic.format == 'html':
elif name.endswith('.html'):
self.response.headers['Content-Type'] = 'text/html'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.html.zip'
elif fanfic.format == 'text':
elif name.endswith('.txt'):
self.response.headers['Content-Type'] = 'text/plain'
self.response.headers['Content-disposition'] = 'attachment; filename=' +name + '.txt.zip'
elif fanfic.format == 'mobi':
self.response.headers['Content-Type'] = 'application/x-mobipocket-ebook'
self.response.headers['Content-disposition'] = 'attachment; filename=' + name + '.mobi'
elif name.endswith('.zip'):
self.response.headers['Content-Type'] = 'application/zip'
else:
self.response.headers['Content-Type'] = 'application/octet-stream'
self.response.headers['Content-disposition'] = 'attachment; filename="%s"' % name
data = DownloadData.all().filter("download =", fanfic).order("index")
# epub, txt and html are all already compressed.
# epubs are all already compressed.
# Each chunk is compress individually to avoid having
# to hold the whole in memory just for the
# compress/uncompress
if fanfic.format == 'mobi':
if fanfic.format != 'epub':
def dc(data):
try:
return zlib.decompress(data)
@ -230,18 +238,47 @@ class FanfictionDownloader(webapp.RequestHandler):
download.user = user
download.url = url
download.format = format
download.put()
adapter = None
taskqueue.add(url='/fdowntask',
queue_name="download",
params={'format':format,
'url':url,
'login':login,
'password':password,
'user':user.email()})
try:
config = ConfigParser.ConfigParser()
logging.debug('reading defaults.ini config file, if present')
config.read('defaults.ini')
logging.debug('reading appengine.ini config file, if present')
config.read('appengine.ini')
adapter = adapters.getAdapter(config,url)
logging.info('Created an adapter: %s' % adapter)
if len(login) > 1:
adapter.username=login
adapter.password=password
## This scrapes the metadata, which will be
## duplicated in the queue task, but it
## detects bad URLs, bad login, bad story, etc
## without waiting for the queue. So I think
## it's worth the double up. Could maybe save
## it all in the download object someday.
story = adapter.getStoryMetadataOnly()
download.title = story.getMetadata('title')
download.author = story.getMetadata('author')
download.put()
taskqueue.add(url='/fdowntask',
queue_name="download",
params={'format':format,
'url':url,
'login':login,
'password':password,
'user':user.email()})
logging.info("enqueued download key: " + str(download.key()))
except Exception, e:
logging.exception(e)
download.failure = str(e)
download.put()
logging.info("enqueued download key: " + str(download.key()))
self.redirect('/status?id='+str(download.key()))
return
@ -289,120 +326,67 @@ class FanfictionDownloaderTask(webapp.RequestHandler):
logging.info('Creating adapter...')
try:
if url.find('fictionalley') != -1:
adapter = fictionalley.FictionAlley(url)
elif url.find('ficwad') != -1:
adapter = ficwad.FicWad(url)
elif url.find('fanfiction.net') != -1:
adapter = ffnet.FFNet(url)
elif url.find('fictionpress.com') != -1:
adapter = fpcom.FPCom(url)
elif url.find('harrypotterfanfiction.com') != -1:
adapter = hpfiction.HPFiction(url)
elif url.find('twilighted.net') != -1:
adapter = twilighted.Twilighted(url)
elif url.find('twiwrite.net') != -1:
adapter = twiwrite.Twiwrite(url)
elif url.find('adastrafanfic.com') != -1:
adapter = adastrafanfic.Adastrafanfic(url)
elif url.find('whofic.com') != -1:
adapter = whofic.Whofic(url)
elif url.find('potionsandsnitches.net') != -1:
adapter = potionsNsnitches.PotionsNSnitches(url)
elif url.find('mediaminer.org') != -1:
adapter = mediaminer.MediaMiner(url)
else:
logging.debug("Bad URL detected")
download.failure = url +" is not a valid story URL."
download.put()
return
config = ConfigParser.ConfigParser()
logging.debug('reading defaults.ini config file, if present')
config.read('defaults.ini')
logging.debug('reading appengine.ini config file, if present')
config.read('appengine.ini')
adapter = adapters.getAdapter(config,url)
except Exception, e:
logging.exception(e)
download.failure = "Adapter was not created: " + str(e)
download.failure = str(e)
download.put()
return
logging.info('Created an adapter: %s' % adapter)
if len(login) > 1:
adapter.setLogin(login)
adapter.setPassword(password)
adapter.username=login
adapter.password=password
if format == 'epub':
writerClass = output.EPubFanficWriter
elif format == 'html':
writerClass = output.HTMLWriter
elif format == 'mobi':
writerClass = output.MobiWriter
else:
writerClass = output.TextWriter
loader = FanficLoader(adapter,
writerClass,
quiet = True,
inmemory=True,
compress=False)
try:
data = loader.download()
if format == 'html' or format == 'text':
# data is uncompressed hence huge
ext = '.html'
if format == 'text':
ext = '.txt'
logging.debug(data)
files = {makeAcceptableFilename(str(adapter.getOutputName())) + ext : StringIO.StringIO(data.decode('utf-8')) }
d = inMemoryZip(files)
data = d.getvalue()
except LoginRequiredException, e:
logging.exception(e)
download.failure = 'Login problem detected'
download.put()
return
# adapter.getStory() is what does all the heavy lifting.
writer = writers.getWriter(format,config,adapter.getStory())
except Exception, e:
logging.exception(e)
download.failure = 'Some exception happened in downloader: ' + str(e)
download.failure = str(e)
download.put()
return
if data == None:
if loader.badLogin:
logging.debug("Bad login detected")
download.failure = 'Login failed'
download.put()
return
download.failure = 'No data returned by adaptor'
download.put()
else:
download.name = self._printableVersion(adapter.getOutputName())
download.title = self._printableVersion(adapter.getStoryName())
download.author = self._printableVersion(adapter.getAuthorName())
download.put()
index=0
download.name = writer.getOutputFileName()
download.title = adapter.getStory().getMetadata('title')
download.author = adapter.getStory().getMetadata('author')
download.put()
index=0
# epub, txt and html are all already compressed.
# Each chunk is compressed individually to avoid having
# to hold the whole in memory just for the
# compress/uncompress.
if format == 'mobi':
def c(data):
return zlib.compress(data)
else:
def c(data):
return data
while( len(data) > 0 ):
DownloadData(download=download,
index=index,
blob=c(data[:1000000])).put()
index += 1
data = data[1000000:]
download.completed=True
download.put()
outbuffer = StringIO.StringIO()
writer.writeStory(outbuffer)
data = outbuffer.getvalue()
outbuffer.close()
del writer
del adapter
# epubs are all already compressed.
# Each chunk is compressed individually to avoid having
# to hold the whole in memory just for the
# compress/uncompress.
if format != 'epub':
def c(data):
return zlib.compress(data)
else:
def c(data):
return data
logging.info("Download finished OK")
while( len(data) > 0 ):
DownloadData(download=download,
index=index,
blob=c(data[:1000000])).put()
index += 1
data = data[1000000:]
download.completed=True
download.put()
logging.info("Download finished OK")
return
def toPercentDecimal(match):

View file

@ -6,7 +6,7 @@ logging.basicConfig(level=logging.DEBUG,format="%(levelname)s:%(filename)s(%(lin
import sys, os
import getpass
from fanficdownloader import adapters,writers
from fanficdownloader import adapters,writers,exceptions
import ConfigParser
@ -27,7 +27,7 @@ try:
try:
print adapter.getStory()
except adapters.FailedToLogin, ftl:
except exceptions.FailedToLogin, ftl:
print "Login Failed, Need Username/Password."
sys.stdout.write("Username: ")
adapter.username = sys.stdin.readline().strip()
@ -40,9 +40,9 @@ try:
writeStory(adapter,"txt")
del adapter
except adapters.InvalidStoryURL, isu:
except exceptions.InvalidStoryURL, isu:
print isu
except adapters.StoryDoesNotExist, dne:
except exceptions.StoryDoesNotExist, dne:
print dne
except adapters.UnknownSite, us:
except exceptions.UnknownSite, us:
print us

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.