diff --git a/adapter.py b/adapter.py index 054267b9..9c1d6d5a 100644 --- a/adapter.py +++ b/adapter.py @@ -3,6 +3,11 @@ import logging import datetime +try: + from google.appengine.api.urlfetch import fetch as googlefetch + appEngineGlob = True +except: + appEngineGlob = False class LoginRequiredException(Exception): def __init__(self, url): @@ -12,11 +17,7 @@ class LoginRequiredException(Exception): return repr(self.url + ' requires user to be logged in') class FanfictionSiteAdapter: - try: - from google.appengine.api.urlfetch import fetch as googlefetch - appEngine = True - except: - appEngine = False + appEngine = appEngineGlob login = '' password = '' url = '' @@ -30,6 +31,7 @@ class FanfictionSiteAdapter: authorURL = '' outputStorySep = '-Ukn_' outputName = '' + outputFileName = '' storyDescription = '' storyCharacters = [] storySeries = '' @@ -57,6 +59,12 @@ class FanfictionSiteAdapter: def hasAppEngine(self): return self.appEngine + def fetchUrl(self, url): + if not self.appEngine: + return self.opener.open(url).read().decode('utf-8') + else: + return googlefetch(url).content + def requiresLogin(self, url = None): return False @@ -86,9 +94,14 @@ class FanfictionSiteAdapter: def getOutputName(self): self.outputName = self.storyName.replace(" ", "_") + self.outputStorySep + self.storyId - logging.debug('self.storyId=%s, self.storyName=%s self.outputName=%s' % (self.storyId, self.storyName, self.outputName)) + logging.debug('self.outputName=%s' % self.outputName) return self.outputName + def getOutputFileName(self, booksDirectory, format): + self.outputFileName = booksDirectory + "/" + self.getOutputName() + "." + format + logging.debug('self.outputFileName=%s' % self.outputFileName) + return self.outputNameFileName + def getAuthorURL(self): logging.debug('self.authorURL=%s' % self.authorURL) return self.authorURL diff --git a/downloader.py b/downloader.py index 310eac41..ee0120f3 100644 --- a/downloader.py +++ b/downloader.py @@ -55,10 +55,13 @@ class FanficLoader: urls = self.adapter.extractIndividualUrls() - s = self.booksDirectory + "/" + self.adapter.getOutputName() + "." + format - if not self.overWrite and os.path.isfile(s): - print >> sys.stderr, "File " + s + " already exists! Skipping!" - exit(10) + if (self.adapter.hasAppEngine): + self.overWrite = True + else: + s = self.adapter.getOutputFileName(self.booksDirectory, format) + if not self.overWrite and os.path.isfile(s): + print >> sys.stderr, "File " + s + " already exists! Skipping!" + exit(10) self.writer = self.writerClass(self.booksDirectory, self.adapter, inmemory=self.inmemory, compress=self.compress) diff --git a/ffnet.py b/ffnet.py index a036988e..d0eff9e7 100644 --- a/ffnet.py +++ b/ffnet.py @@ -143,14 +143,8 @@ class FFNet(FanfictionSiteAdapter): self.addSubject(subj) return True - def _fetchUrl(self, url): - if not self.appEngine: - return self.opener.open(url).read().decode('utf-8') - else: - return googlefetch(url).content - def extractIndividualUrls(self): - data = self._fetchUrl(self.url) + data = self.fetchUrl(self.url) d2 = re.sub('&\#[0-9]+;', ' ', data) soup = bs.BeautifulStoneSoup(d2) allA = soup.findAll('a') @@ -264,7 +258,7 @@ class FFNet(FanfictionSiteAdapter): def getText(self, url): time.sleep( 2.0 ) - data = self._fetchUrl(url) + data = self.fetchUrl(url) lines = data.split('\n') textbuf = '' diff --git a/fpcom.py b/fpcom.py index 04266888..b9431322 100644 --- a/fpcom.py +++ b/fpcom.py @@ -136,15 +136,9 @@ class FPCom(FanfictionSiteAdapter): if len(subj) > 0: self.addSubject(subj) return True - - def _fetchUrl(self, url): - if not self.appEngine: - return self.opener.open(url).read().decode('utf-8') - else: - return googlefetch(url).content def extractIndividualUrls(self): - data = self._fetchUrl(self.url) + data = self.fetchUrl(self.url) d2 = re.sub('&\#[0-9]+;', ' ', data) soup = bs.BeautifulStoneSoup(d2) allA = soup.findAll('a') @@ -283,7 +277,7 @@ class FPCom(FanfictionSiteAdapter): def getText(self, url): time.sleep( 2.0 ) - data = self._fetchUrl(url) + data = self.fetchUrl(url) lines = data.split('\n') textbuf = '' diff --git a/mediaminer.py b/mediaminer.py index dd2ef3ea..77ff29bd 100644 --- a/mediaminer.py +++ b/mediaminer.py @@ -143,14 +143,8 @@ class MediaMiner(FanfictionSiteAdapter): self.addSubject(subj) return True - def _fetchUrl(self, url): - if not self.appEngine: - return self.opener.open(url).read().decode('utf-8') - else: - return googlefetch(url).content - def extractIndividualUrls(self): - data = self._fetchUrl(self.url) + data = self.fetchUrl(self.url) #data.replace('
',' ').replace('
',' ').replace('
',' ') soup = bs.BeautifulSoup(data) #logging.debug('soap=%s' % soup) @@ -318,7 +312,7 @@ class MediaMiner(FanfictionSiteAdapter): def getText(self, url): time.sleep( 2.0 ) logging.debug('url=%s' % url) - data = self._fetchUrl(url) + data = self.fetchUrl(url) try: soup = bs.BeautifulSoup(data)