are the parts of the summary.
- divcontent = soup.find('div',{'class':'content'})
-
- # metadesc = soup.find('meta',{'name':'description'})
- # contentsoup = bs.BeautifulStoneSoup(metadesc['content'])
- ps = divcontent.findAll('p')
- pstext=[]
- for p in ps:
- if p.string:
- s = p.string.replace('&nbsp;',' ').strip()
- if s:
- pstext.append(p.string)
-
- self.storyDescription = ' '.join(pstext)
- print "self.storyDescription: %s"%self.storyDescription
-
- return result
-
- def getText(self, url):
- if url.find('http://') == -1:
- url = 'http://' + self.host + '/' + url
-
- logging.debug('Getting data from: %s' % url)
-
- data = ''
- try:
- data = self.opener.open(url).read()
- except Exception, e:
- data = ''
- logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
- if data is None:
- raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
-
- soup = None
- try:
- soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
- except:
- logging.info("Failed to decode: <%s>" % data)
- raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
-
- div = soup.find('div', {'id' : 'story'})
-
- if None == div:
- raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
-
- return div.__str__('utf8')
-
-
-class Twiwrite_UnitTests(unittest.TestCase):
- def setUp(self):
- logging.basicConfig(level=logging.DEBUG)
- pass
-
- def testLoginWorks(self):
- url = 'http://www.twiwrite.net/viewstory.php?sid=117'
- self.assertTrue(Twiwrite(url).performLogin())
-
- def testGetUrlsWorks(self):
- url = 'http://www.twiwrite.net/viewstory.php?sid=117'
- self.assertEquals(36, len(Twiwrite(url).extractIndividualUrls()))
-
-if __name__ == '__main__':
- unittest.main()
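
For reference, not part of this patch: the deleted Twiwrite adapter relies on a cookie-carrying urllib2 opener so that performLogin() and the later chapter fetches share one session. A minimal Python 2 sketch of that pattern follows; the login URL handling and the form field names are illustrative assumptions, not the site's real ones.

# Sketch only: cookie-sharing opener pattern used by the deleted adapters
# (Python 2 / urllib2, matching this codebase). Field names are assumptions.
import urllib
import urllib2

def make_session_opener():
    # One opener, one cookie jar: cookies set by the login POST are sent
    # automatically on every later open() through the same opener.
    return urllib2.build_opener(urllib2.HTTPCookieProcessor())

def login_and_fetch(opener, login_url, story_url, username, password):
    params = urllib.urlencode({'penname': username, 'password': password})
    opener.open(login_url, params).read()   # POST establishes the session cookie
    return opener.open(story_url).read()    # later fetches reuse that cookie
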
diff --git a/fanficdownloader/whofic.py b/fanficdownloader/whofic.py
deleted file mode 100644
index 79fec927..00000000
--- a/fanficdownloader/whofic.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import os
-import re
-import sys
-import shutil
-import os.path
-import urllib as u
-import logging
-import pprint as pp
-import unittest
-import urllib2 as u2
-import urlparse as up
-import BeautifulSoup as bs
-import htmlentitydefs as hdefs
-import time
-import datetime
-
-from adapter import *
-
-class Whofic(FanfictionSiteAdapter):
- def __init__(self, url):
- self.url = url
- parsedUrl = up.urlparse(url)
- self.host = parsedUrl.netloc
- self.path = parsedUrl.path
- self.opener = u2.build_opener(u2.HTTPCookieProcessor())
- self.storyDescription = 'Fanfiction Story'
- self.authorId = '0'
- self.authorURL = ''
- self.storyId = '0'
- self.storyPublished = datetime.date(1970, 01, 31)
- self.storyCreated = datetime.datetime.now()
- self.storyUpdated = datetime.date(1970, 01, 31)
- self.languageId = 'en-UK'
- self.language = 'English'
- self.subjects = []
- self.subjects.append ('Fanfiction')
- self.subjects.append ('Doctor Who')
- self.publisher = self.host
- self.numChapters = 0
- self.numWords = 0
- self.genre = ''
- self.category = ''
- self.storyStatus = 'In-Progress'
- self.storyRating = 'PG'
- self.storyUserRating = '0'
- self.storyCharacters = []
- self.storySeries = ''
- self.outputName = ''
- self.outputStorySep = '-whof_'
-
- self.chapurl = False
- ss=self.url.split('?')
- logging.debug('ss=%s' % ss)
- if ss is not None and len(ss) > 1:
- sss = ss[1].replace('&amp;','&').split('&')
- logging.debug('sss=%s' % sss)
- if sss is not None and len(sss) > 0:
- ssss = sss[0].split('=')
- logging.debug('ssss=%s' % ssss)
- if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
- self.storyId = ssss[1]
- if len(sss) > 1:
- ssss = sss[1].split('=')
- logging.debug('ssss=%s' % ssss)
- if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
- self.chapurl = True
-
- self.url = 'http://' + self.host + self.path + '?sid=' + self.storyId
- logging.debug('self.url=%s' % self.url)
-
- logging.debug("Created Whofic: url=%s" % (self.url))
-
- def requiresLogin(self, url = None):
- return False
-
- def extractIndividualUrls(self):
- url = self.url + '&chapter=1'
-
- data = ''
- try:
- data = self.opener.open(url).read()
- except Exception, e:
- data = ''
- logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
- if data is None:
- raise StoryDoesNotExist("Problem reading story URL " + url + "!")
-
- soup = None
- try:
- soup = bs.BeautifulStoneSoup(data)
- except:
- raise FailedToDownload("Error downloading Story: %s! Problem decoding page!" % url)
-
- title = soup.find('title').string
- title = title.split('::')[1].strip()
- logging.debug('Title: %s' % title)
- self.storyName = title.split(' by ')[0].strip()
- self.authorName = title.split(' by ')[1].strip()
-
- for a in soup.findAll('a'):
- if a['href'].startswith('viewuser.php'):
- self.authorId = a['href'].split('=')[1]
- self.authorURL = 'http://'+self.host+'/'+a['href']
-
- logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
- logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
-
- select = soup.find('select', { 'name' : 'chapter' } )
-
- result = []
- if select is None:
- # no chapters found, try url by itself.
- result.append((url,self.storyName))
- else:
- allOptions = select.findAll('option')
- for o in allOptions:
- url = self.url + "&chapter=%s" % o['value']
- # just in case there's tags, like <br> in chapter titles.
- title = "%s" % o
- title = re.sub('<[^>]+>','',title)
- result.append((url,title))
-
- ## Whofic.com puts none of the meta data in the chapters or
- ## even the story chapter index page. Need to scrape the
- ## author page to find it.
- data = self.opener.open(self.authorURL).read()
-
- soup = bs.BeautifulStoneSoup(data, selfClosingTags=('br','hr'))
- # find this story in the list, parse it's metadata based on
- # lots of assumptions, since there's little tagging.
- for a in soup.findAll('a'):
- if a['href'].find('viewstory.php?sid='+self.storyId) != -1:
- metadata = a.findParent('td')
- metadatachunks = metadata.__str__('utf8').split('<br />')
- # process metadata for this story.
- self.storyDescription = metadatachunks[1].strip()
-
- # the stuff with ' - ' separators
- moremeta = metadatachunks[2]
- moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-
- moremetaparts = moremeta.split(' - ')
-
- self.category = moremetaparts[0]
- for cat in self.category.split(', '):
- self.addSubject(cat.strip())
-
- self.storyRating = moremetaparts[1]
-
- for warn in moremetaparts[2].split(', '):
- self.addSubject(warn.strip())
-
- self.genre = moremetaparts[3]
-
- # the stuff with ' - ' separators *and* names
- moremeta = metadatachunks[5]
- moremeta = re.sub('<[^>]+>','',moremeta) # strip tags.
-
- moremetaparts = moremeta.split(' - ')
-
- for part in moremetaparts:
- (name,value) = part.split(': ')
- name=name.strip()
- value=value.strip()
- if name == 'Published':
- self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
- if name == 'Updated':
- self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(value, '%Y.%m.%d')))
- if name == 'Completed' and value == 'Yes':
- self.storyStatus = name
- if name == 'Word Count':
- self.numWords = value
-
- break
-
- self.numChapters = len(result)
-
- return result
-
- def getText(self, url):
- if url.find('http://') == -1:
- url = 'http://' + self.host + '/' + url
-
- logging.debug('Getting data from: %s' % url)
-
- data = ''
- try:
- data = self.opener.open(url).read()
- except Exception, e:
- data = ''
- logging.error("Caught an exception reading URL " + url + ". Exception " + unicode(e) + ".")
- if data is None:
- raise FailedToDownload("Error downloading Chapter: %s! Problem getting page!" % url)
-
- soup = None
- try:
- # I really wish I knew why adastra needs the selfClosingTags to make <br> work, but ficwad doesn't.
- soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES, selfClosingTags=('br','hr'))
- except:
- logging.info("Failed to decode: <%s>" % data)
- raise FailedToDownload("Error downloading Chapter: %s! Problem decoding page!" % url)
-
- # hardly a great identifier, I know, but whofic really doesn't
- # give us anything better to work with.
- span = soup.find('span', {'style' : 'font-size: 100%;'})
-
- if None == span:
- raise FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
-
- return span.__str__('utf8')
-
-
-class Whofic_UnitTests(unittest.TestCase):
- def setUp(self):
- logging.basicConfig(level=logging.DEBUG)
- pass
-
- def testGetUrlsWorks(self):
- url = 'http://www.whofic.com/viewstory.php?sid=37139'
- self.assertEquals(6, len(Whofic(url).extractIndividualUrls()))
-
-if __name__ == '__main__':
- unittest.main()
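
For reference, not part of this patch: the sid/chapter handling that the deleted adapters did with nested split() calls can be expressed with a standard query-string parser. A minimal Python 2 sketch under that assumption (urlparse.parse_qs needs Python 2.6+; on 2.5 the same function lives in the cgi module):

# Sketch only: query-string parsing equivalent to the nested split() logic above.
import urlparse

def parse_story_url(url):
    qs = urlparse.parse_qs(urlparse.urlparse(url).query)
    story_id = qs.get('sid', ['0'])[0]   # '0' mirrors the adapters' default storyId
    is_chapter_url = 'chapter' in qs     # mirrors self.chapurl
    return story_id, is_chapter_url

# e.g. parse_story_url('http://www.whofic.com/viewstory.php?sid=37139&chapter=2')
# returns ('37139', True)
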
diff --git a/fanficdownloader/writers/base_writer.py b/fanficdownloader/writers/base_writer.py
index d81d7d0b..6c3a68e1 100644
--- a/fanficdownloader/writers/base_writer.py
+++ b/fanficdownloader/writers/base_writer.py
@@ -68,7 +68,8 @@ class BaseStoryWriter(Configurable):
'authorUrl',
'formatname',
'formatext',
- 'siteabbrev']
+ 'siteabbrev',
+ 'version']
# fall back labels.
self.titleLabels = {
@@ -93,7 +94,8 @@ class BaseStoryWriter(Configurable):
'authorUrl':'Author URL',
'formatname':'File Format',
'formatext':'File Extension',
- 'siteabbrev':'Site Abbrev'
+ 'siteabbrev':'Site Abbrev',
+ 'version':'FFD Version'
}
self.story.setMetadata('formatname',self.getFormatName())
self.story.setMetadata('formatext',self.getFormatExt())
@@ -206,6 +208,11 @@ class BaseStoryWriter(Configurable):
print "File(%s) Updated(%s) more recently than Story(%s) - Skipping" % (outfilename,fileupdated,lastupdated)
return
+ self.story = self.adapter.getStory() # get full story now,
+ # just before
+ # writing. Fetch
+ # before opening
+ # file.
outstream = open(outfilename,"wb")
else:
close=False
@@ -213,6 +220,9 @@ class BaseStoryWriter(Configurable):
self.story = self.adapter.getStory() # get full story now,
# just before writing.
+ # Okay if double called
+ # with above, it will
+ # only fetch once.
if self.getConfig('zip_output'):
out = StringIO.StringIO()
self.writeStoryImpl(out)
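
For reference, not part of this patch: the comments added above assume adapter.getStory() is safe to call twice because the adapter caches its Story after the first full fetch. A minimal sketch of that memoization, with hypothetical names (the real adapter API may differ):

# Sketch only: cache-on-first-call behaviour the double getStory() call relies on.
class CachingAdapter(object):
    def __init__(self):
        self._story = None

    def getStory(self):
        if self._story is None:
            # hypothetical expensive step: download and parse all chapters
            self._story = self.doExtractStory()
        return self._story
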
diff --git a/fanficdownloader/writers/writer_html.py b/fanficdownloader/writers/writer_html.py
index b8a60f1c..3b7ac36b 100644
--- a/fanficdownloader/writers/writer_html.py
+++ b/fanficdownloader/writers/writer_html.py
@@ -38,6 +38,21 @@ class HTMLWriter(BaseStoryWriter):
${title} by ${author}
+
diff --git a/fanficdownloader/zipdir.py b/fanficdownloader/zipdir.py
deleted file mode 100644
index eb50c961..00000000
--- a/fanficdownloader/zipdir.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from __future__ import with_statement
-
-import sys
-import os
-import zlib
-import zipfile
-from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
-from contextlib import closing
-import logging
-
-import BeautifulSoup as bs
-import htmlentitydefs as hdefs
-import time
-import datetime
-from datetime import timedelta
-
-import StringIO
-
-class InvalidEPub(Exception):
- pass
-
-def checkNewer(filename, curdte):
- ret = True
-
- if not os.path.isfile(filename):
- logging.debug('File %s does not already exist.' % filename)
- return ret
-
- #logging.debug('filename=%s, curdte=%s' % (filename, curdte))
- lastdate = None
- with closing(ZipFile(open(filename, 'rb'))) as epub:
- titleFilePath = "OEBPS/title_page.xhtml"
- contentFilePath = "OEBPS/content.opf"
-
- namelist = set(epub.namelist())
- #logging.debug('namelist=%s' % namelist)
- if 'mimetype' not in namelist or \
- 'META-INF/container.xml' not in namelist:
- #raise InvalidEPub('%s: not a valid EPUB' % filename)
- logging.debug('File %s is not a valid EPub format file.' % filename)
- return ret
-
- if contentFilePath not in namelist:
- return ret # file is not newer
-
- data = epub.read(contentFilePath)
- soup = bs.BeautifulStoneSoup(data)
- lstdte = soup.find ('dc:date', {'opf:event' : 'modification'})
- #logging.debug('lstdte=%s' % lstdte.string)
- if lstdte is None and titleFilePath in namelist:
- data = epub.read(titleFilePath)
- soup = bs.BeautifulStoneSoup(data)
- fld = ''
- allTDs = soup.findAll ('td')
- for td in allTDs:
- b = td.find ('b')
- if b is not None:
- fld = b.string
- if td.string is not None and fld == "Updated:":
- lastdate = td.string
- #logging.debug('title lastdate=%s' % lastdate)
- else:
- lastdate = lstdte.string.strip(' ')
- #logging.debug('contents lastdate=%s' % lastdate)
-
- if lastdate is not None:
- currUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(curdte.strftime('%Y-%m-%d'), "%Y-%m-%d")))
- storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(lastdate, "%Y-%m-%d")))
- logging.debug('File %s last update date is %s, comparing to %s' % (filename, storyUpdated, currUpdated))
- if currUpdated <= storyUpdated :
- ret = False
-
- logging.debug("Does %s need to be updated? %s" % (filename, ret))
- return ret
-
-
-def toZip(filename, directory):
- zippedHelp = zipfile.ZipFile(filename, "w", compression=zipfile.ZIP_DEFLATED)
- lst = os.listdir(directory)
-
- for entity in lst:
- if entity.startswith('.'):
- continue
-
- each = os.path.join(directory,entity)
- print(each)
-
- if os.path.isfile(each):
- print(each)
- # epub standard requires mimetype to be uncompressed and first file.
- if entity == 'mimetype':
- zippedHelp.write(each, arcname=entity, compress_type=zipfile.ZIP_STORED)
- else:
- zippedHelp.write(each, arcname=entity)
- else:
- addFolderToZip(zippedHelp,entity, each)
-
- zippedHelp.close()
-
-def addFolderToZip(zippedHelp,folder,fpath):
- #print('addFolderToZip(%s)' % folder)
-
- if folder == '.' or folder == '..':
- return
-
- folderFiles = os.listdir(fpath)
- for f in folderFiles:
- if os.path.isfile(fpath + '/' + f):
- #print('basename=%s' % os.path.basename(fpath + '/' + f))
- zippedHelp.write(fpath + '/' + f, folder + '/' + f, zipfile.ZIP_DEFLATED)
- elif os.path.isdir(f):
- addFolderToZip(zippedHelp,f)
-
-def inMemoryZip(files):
- # files have a structure of {'path/to/file' => content} dictionary
- io = StringIO.StringIO()
-
- if 'mimetype' in files:
- # This fixes the uncompressed mimetype-first issue by opening
- # the in memory file as STORE, putting in the mimetype, then
- # closing and re-opening with DEFLATED. while it is often
- # true that mimetype is the first file, we can't assume it,
- # because the dict object is defined as unordered.
- path='mimetype'
- memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_STORED)
- memzip.debug = 3
- if type(files[path]) != type('str'):
- data = files[path].getvalue()
- else:
- data = files[path]
-
- logging.debug("Writing ZIP path %s" % path)
- try:
- memzip.writestr(path, data.encode('utf-8'))
- except UnicodeDecodeError, e:
- memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
-
- memzip.close()
-
- # remove it from the files dict.
- del(files['mimetype'])
-
- # open in 'a' append mode.
- memzip = zipfile.ZipFile(io, 'a', compression=zipfile.ZIP_DEFLATED)
- memzip.debug = 3
-
- for path in files:
- if type(files[path]) != type('str'):
- data = files[path].getvalue()
- else:
- data = files[path]
-
-# logging.debug(data)
- logging.debug("Writing ZIP path %s" % path)
- try:
- memzip.writestr(path, data.encode('utf-8'))
- except UnicodeDecodeError, e:
- memzip.writestr(path.encode('utf-8'), data.encode('utf-8'))
-
- # declares all the files created by Windows.
- for zf in memzip.filelist:
- zf.create_system = 0
-
- memzip.close()
-
- return io
-
-if __name__ == '__main__':
-# toZip('sample.epub', "books/A_Time_To_Reflect")
-# z = zipfile.ZipFile('sample.epub', 'r')
- files = {'test.txt' : 'test', 'data/abc.txt' : 'abc'}
- data = inMemoryZip(files)
- f = open('res.zip', 'w')
- f.write(data)
- f.close()
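
For reference, not part of this patch: the deleted inMemoryZip() works around the EPUB OCF rule that 'mimetype' must be the archive's first entry and stored uncompressed, while everything else may be deflated. A minimal Python 2 sketch of the same trick (write the mimetype through a ZIP_STORED handle, then reopen the buffer in append mode with ZIP_DEFLATED):

# Sketch only: mimetype-first EPUB packaging, as the removed code did it.
import zipfile
import StringIO

def make_epub_zip(files):
    # files: dict of {'path/in/zip': string_content}, possibly including 'mimetype'
    buf = StringIO.StringIO()
    stored = zipfile.ZipFile(buf, 'a', zipfile.ZIP_STORED)
    stored.writestr('mimetype', files.get('mimetype', 'application/epub+zip'))
    stored.close()
    # Reopen the same buffer in append mode; later entries are compressed.
    deflated = zipfile.ZipFile(buf, 'a', zipfile.ZIP_DEFLATED)
    for path, data in files.items():
        if path != 'mimetype':
            deflated.writestr(path, data)
    deflated.close()
    return buf.getvalue()
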
diff --git a/main.py b/main.py
index 98ddc3de..a54bcb89 100644
--- a/main.py
+++ b/main.py
@@ -21,6 +21,7 @@ logging.getLogger().setLevel(logging.DEBUG)
import os
from os.path import dirname, basename, normpath
+import re
import sys
import zlib
import urllib
@@ -36,16 +37,8 @@ from google.appengine.api import users
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
-from fanficdownloader.downloader import *
-from fanficdownloader.ffnet import *
-from fanficdownloader.output import *
-from fanficdownloader import twilighted
-from fanficdownloader import adastrafanfic
-
from google.appengine.ext import db
-from fanficdownloader.zipdir import *
-
from ffstorage import *
from fanficdownloader import adapters, writers, exceptions
diff --git a/fanficdownloader/readme.txt b/readme.txt
similarity index 52%
rename from fanficdownloader/readme.txt
rename to readme.txt
index c8b2c8e9..a6e59751 100644
--- a/fanficdownloader/readme.txt
+++ b/readme.txt
@@ -1,10 +1,14 @@
To use, do:
-python downloader.py <story url> (epub|html|text|mobi)
+python downloader.py [-f (epub|html|txt)] <story url>
+
+Default format is epub.
Eg:
-python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo epub
+python downloader.py http://www.fanfiction.net/s/5192986/1/A_Fox_in_Tokyo
+
+Do 'python downloader.py -h' for more options.
This tool uses Python 2.5.2, but should work with newer versions of Python.