Merge changes from trunk.

This commit is contained in:
Jim Miller 2012-02-27 10:52:49 -06:00
commit 4306bfc301
39 changed files with 786 additions and 599 deletions

View file

@ -1,6 +1,6 @@
# ffd-retief-hrd fanfictiondownloader
application: fanfictiondownloader
version: 4-3-2
application: ffd-retief-hrd
version: 4-3-3
runtime: python27
api_version: 1
threadsafe: true

View file

@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase):
description = 'UI plugin to download FanFiction stories from various sites.'
supported_platforms = ['windows', 'osx', 'linux']
author = 'Jim Miller'
version = (1, 4, 6)
version = (1, 5, 0)
minimum_calibre_version = (0, 8, 30)
#: This field defines the GUI plugin class that contains all the code

View file

@ -36,6 +36,7 @@ all_prefs = JSONConfig('plugins/fanfictiondownloader_plugin')
# take from here.
all_prefs.defaults['personal.ini'] = get_resources('plugin-example.ini')
all_prefs.defaults['updatemeta'] = True
all_prefs.defaults['updatecover'] = False
all_prefs.defaults['keeptags'] = False
all_prefs.defaults['urlsfromclip'] = True
all_prefs.defaults['updatedefault'] = True
@ -53,6 +54,7 @@ all_prefs.defaults['custom_cols'] = {}
# when config is called for the first time on a library.
copylist = ['personal.ini',
'updatemeta',
'updatecover',
'keeptags',
'urlsfromclip',
'updatedefault',
@ -144,6 +146,7 @@ class ConfigWidget(QWidget):
prefs['fileform'] = unicode(self.basic_tab.fileform.currentText())
prefs['collision'] = unicode(self.basic_tab.collision.currentText())
prefs['updatemeta'] = self.basic_tab.updatemeta.isChecked()
prefs['updatecover'] = self.basic_tab.updatecover.isChecked()
prefs['keeptags'] = self.basic_tab.keeptags.isChecked()
prefs['urlsfromclip'] = self.basic_tab.urlsfromclip.isChecked()
prefs['updatedefault'] = self.basic_tab.updatedefault.isChecked()
@ -234,6 +237,11 @@ class BasicTab(QWidget):
self.updatemeta.setChecked(prefs['updatemeta'])
self.l.addWidget(self.updatemeta)
self.updatecover = QCheckBox('Update Cover when Updating Metadata?',self)
self.updatecover.setToolTip('Update cover image when metadata is updated. EPUB only.')
self.updatecover.setChecked(prefs['updatecover'])
self.l.addWidget(self.updatecover)
self.keeptags = QCheckBox('Keep Existing Tags when Updating Metadata?',self)
self.keeptags.setToolTip('Existing tags will be kept and any new tags added.\nCompleted and In-Progress tags will be still be updated, if known.\nLast Updated tags will be updated if lastupdate in include_subject_tags.')
self.keeptags.setChecked(prefs['keeptags'])

View file

@ -1,30 +0,0 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Jim Miller'
__docformat__ = 'restructuredtext en'
from zipfile import ZipFile
from xml.dom.minidom import parseString
def get_dcsource(inputio):
    """
    Return the dc:source metadata value from an epub, UTF-8 encoded.

    inputio -- a filename or file-like object containing the epub (zip).
    Returns the encoded dc:source string, or None when the OPF metadata
    has no (non-empty) dc:source element.
    """
    epub = ZipFile(inputio, 'r')
    try:
        ## META-INF/container.xml's rootfile element points at the real
        ## .opf package file (its location inside the zip varies).
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        metadom = parseString(epub.read(rootfilename))
        firstmetadom = metadom.getElementsByTagName("metadata")[0]
        try:
            source = firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
        except (IndexError, AttributeError):
            # IndexError: no dc:source element at all.
            # AttributeError: dc:source present but empty (firstChild is None).
            # Only these two are expected "no source" cases; anything else
            # (bad zip, malformed XML) should propagate, not be hidden.
            source = None
        return source
    finally:
        # Always release the zip handle, even on parse errors.
        epub.close()

View file

@ -15,6 +15,10 @@ from datetime import datetime
from PyQt4.Qt import (QApplication, QMenu, QToolButton)
from PyQt4.Qt import QPixmap, Qt
from PyQt4.QtCore import QBuffer
from calibre.ptempfile import PersistentTemporaryFile, PersistentTemporaryDirectory, remove_dir
from calibre.ebooks.metadata import MetaInformation, authors_to_string
from calibre.ebooks.metadata.meta import get_metadata
@ -30,8 +34,9 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
create_menu_action_unique, get_library_uuid)
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
from calibre_plugins.fanfictiondownloader_plugin.dcsource import get_dcsource
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
#from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount
from calibre_plugins.fanfictiondownloader_plugin.config import (prefs, permitted_values)
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (
@ -93,6 +98,8 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
# are not found in the zip file will result in null QIcons.
icon = get_icon('images/icon.png')
#self.qaction.setText('FFDL')
# The qaction is automatically created from the action_spec defined
# above
self.qaction.setIcon(icon)
@ -408,7 +415,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
ffdlconfig = SafeConfigParser()
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
ffdlconfig.readfp(StringIO(prefs['personal.ini']))
adapter = adapters.getAdapter(ffdlconfig,url)
adapter = adapters.getAdapter(ffdlconfig,url,fileform)
options['personal.ini'] = prefs['personal.ini']
@ -440,7 +447,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
book['author_sort'] = book['author'] = story.getMetadata("author", removeallentities=True)
book['publisher'] = story.getMetadata("site")
book['tags'] = writer.getTags()
book['comments'] = story.getMetadata("description") #, removeallentities=True) comments handles entities better.
book['comments'] = stripHTML(story.getMetadata("description")) #, removeallentities=True) comments handles entities better.
book['series'] = story.getMetadata("series")
# adapter.opener is the element with a threadlock. But del
@ -517,13 +524,15 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
# 'book' can exist without epub. If there's no existing epub,
# let it go and it will download it.
if db.has_format(book_id,fileform,index_is_id=True):
toupdateio = StringIO()
(epuburl,chaptercount) = doMerge(toupdateio,
[StringIO(db.format(book_id,'EPUB',
index_is_id=True))],
titlenavpoints=False,
striptitletoc=True,
forceunique=False)
#toupdateio = StringIO()
(epuburl,chaptercount) = get_dcsource_chaptercount(StringIO(db.format(book_id,'EPUB',
index_is_id=True)))
# (epuburl,chaptercount) = doMerge(toupdateio,
# [StringIO(db.format(book_id,'EPUB',
# index_is_id=True))],
# titlenavpoints=False,
# striptitletoc=True,
# forceunique=False)
urlchaptercount = int(story.getMetadata('numChapters'))
if chaptercount == urlchaptercount:
if collision == UPDATE:
@ -630,7 +639,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
if options['collision'] == CALIBREONLY or \
(options['updatemeta'] and book['good']):
self._update_metadata(db, book['calibre_id'], book, mi)
self._update_metadata(db, book['calibre_id'], book, mi, options)
def _update_books_completed(self, book_list, options={}):
@ -649,6 +658,9 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
self.gui.library_view.model().current_changed(current, self.previous)
self.gui.tags_view.recount()
if self.gui.cover_flow:
self.gui.cover_flow.dataChanged()
self.gui.status_bar.show_message(_('Finished Adding/Updating %d books.'%(len(update_list) + len(add_list))), 3000)
if len(update_list) + len(add_list) != len(book_list):
@ -729,7 +741,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
return book_id
def _update_metadata(self, db, book_id, book, mi):
def _update_metadata(self, db, book_id, book, mi, options):
if prefs['keeptags']:
old_tags = db.get_tags(book_id)
# remove old Completed/In-Progress only if there's a new one.
@ -748,6 +760,13 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
oldmi = db.get_metadata(book_id,index_is_id=True)
if not oldmi.languages:
mi.languages=['eng']
if options['fileform'] == 'epub' and prefs['updatecover']:
existingepub = db.format(book_id,'EPUB',index_is_id=True, as_file=True)
epubmi = get_metadata(existingepub,'EPUB')
if epubmi.cover_data[1] is not None:
db.set_cover(book_id, epubmi.cover_data[1])
#mi.cover = epubmi.cover_data[1]
db.set_metadata(book_id,mi)
@ -780,7 +799,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
if meta == 'status-I':
val = book['all_metadata']['status'] == 'In-Progress'
db.set_custom(book_id, val, label=label, commit=False)
db.commit()
def _get_clean_reading_lists(self,lists):

View file

@ -23,7 +23,8 @@ from calibre.utils.logging import Log
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (NotGoingToDownload,
OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY)
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
#from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_update_data
# ------------------------------------------------------------------------------
#
@ -110,7 +111,7 @@ def do_download_for_worker(book,options):
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
ffdlconfig.readfp(StringIO(options['personal.ini']))
adapter = adapters.getAdapter(ffdlconfig,book['url'])
adapter = adapters.getAdapter(ffdlconfig,book['url'],options['fileform'])
adapter.is_adult = book['is_adult']
adapter.username = book['username']
adapter.password = book['password']
@ -136,38 +137,44 @@ def do_download_for_worker(book,options):
elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
urlchaptercount = int(story.getMetadata('numChapters'))
## First, get existing epub with titlepage and tocpage stripped.
updateio = StringIO()
(epuburl,chaptercount) = doMerge(updateio,
[book['epub_for_update']],
titlenavpoints=False,
striptitletoc=True,
forceunique=False)
(url,chaptercount,
adapter.oldchapters,
adapter.oldimgs) = get_update_data(book['epub_for_update'])
print("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
print("write to %s"%outfile)
## Get updated title page/metadata by itself in an epub.
## Even if the title page isn't included, this carries the metadata.
titleio = StringIO()
writer.writeStory(outstream=titleio,metaonly=True)
writer.writeStory(outfilename=outfile, forceOverwrite=True)
## First, get existing epub with titlepage and tocpage stripped.
# updateio = StringIO()
# (epuburl,chaptercount) = doMerge(updateio,
# [book['epub_for_update']],
# titlenavpoints=False,
# striptitletoc=True,
# forceunique=False)
# ## Get updated title page/metadata by itself in an epub.
# ## Even if the title page isn't included, this carries the metadata.
# titleio = StringIO()
# writer.writeStory(outstream=titleio,metaonly=True)
newchaptersio = None
if urlchaptercount > chaptercount :
## Go get the new chapters
newchaptersio = StringIO()
adapter.setChaptersRange(chaptercount+1,urlchaptercount)
# newchaptersio = None
# if urlchaptercount > chaptercount :
# ## Go get the new chapters
# newchaptersio = StringIO()
# adapter.setChaptersRange(chaptercount+1,urlchaptercount)
adapter.config.set("overrides",'include_tocpage','false')
adapter.config.set("overrides",'include_titlepage','false')
writer.writeStory(outstream=newchaptersio)
# adapter.config.set("overrides",'include_tocpage','false')
# adapter.config.set("overrides",'include_titlepage','false')
# writer.writeStory(outstream=newchaptersio)
## Merge the three epubs together.
doMerge(outfile,
[titleio,updateio,newchaptersio],
fromfirst=True,
titlenavpoints=False,
striptitletoc=False,
forceunique=False)
# ## Merge the three epubs together.
# doMerge(outfile,
# [titleio,updateio,newchaptersio],
# fromfirst=True,
# titlenavpoints=False,
# striptitletoc=False,
# forceunique=False)
book['comment'] = 'Update %s completed, added %s chapters for %s total.'%\
(options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)

View file

@ -126,7 +126,6 @@ extratags: FanFiction
## number of seconds to sleep between calls to the story site. May be
## useful if pulling large numbers of stories or if the site is slow.
## Primarily for commandline.
#slow_down_sleep_time:0.5
## For use only with stand-alone CLI version--run a command on the
@ -231,6 +230,37 @@ output_css:
.u {text-decoration: underline;}
.bold {font-weight: bold;}
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
#include_images:false
## If not set, the summary will have all html stripped for safety.
## Both this and include_images must be true to get images in the
## summary.
#keep_summary_html:false
## If set, the first image found will be made the cover image. If
## keep_summary_html is true, any images in summary will be before any
## in chapters.
#make_firstimage_cover: false
## If set, and there isn't already a cover image from the adapter or
## from make_firstimage_cover, this image will be made the cover.
## It can be either a 'file:' or 'http:' url.
## Note that if you enable make_firstimage_cover in [epub], but want
## to use default_cover_image for a specific site, use the site:format
## section, for example: [www.ficwad.com:epub]
#default_cover_image:file:///C:/Users/username/Desktop/nook/images/icon.png
#default_cover_image:http://www.somesite.com/someimage.gif
## Resize images down to width, height, preserving aspect ratio.
## Nook size, with margin.
image_max_size: 580, 725
## Change image to grayscale, if graphics library allows, to save
## space.
#grayscale_images: false
[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true

View file

@ -25,19 +25,16 @@ from StringIO import StringIO
from optparse import OptionParser
import getpass
import string
import ConfigParser
from subprocess import call
from epubmerge import doMerge
from fanficdownloader import adapters,writers,exceptions
from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
if sys.version_info < (2, 5):
print "This program requires Python 2.5 or newer."
sys.exit(1)
from fanficdownloader import adapters,writers,exceptions
import ConfigParser
def writeStory(config,adapter,writeformat,metaonly=False,outstream=None):
writer = writers.getWriter(writeformat,config,adapter)
writer.writeStory(outstream=outstream,metaonly=metaonly)
@ -116,19 +113,30 @@ def main():
try:
## Attempt to update an existing epub.
if options.update:
updateio = StringIO()
(url,chaptercount) = doMerge(updateio,
args,
titlenavpoints=False,
striptitletoc=True,
forceunique=False)
# updateio = StringIO()
# (url,chaptercount) = doMerge(updateio,
# args,
# titlenavpoints=False,
# striptitletoc=True,
# forceunique=False)
(url,chaptercount) = get_dcsource_chaptercount(args[0])
print "Updating %s, URL: %s" % (args[0],url)
output_filename = args[0]
config.set("overrides","output_filename",args[0])
else:
url = args[0]
adapter = adapters.getAdapter(config,url)
adapter = adapters.getAdapter(config,url,options.format)
## Check for include_images and absence of PIL, give warning.
if adapter.getConfig('include_images'):
try:
import Image
except:
print "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?"
if not sys.stdin.readline().strip().lower().startswith('y'):
return
## three tries, that's enough if both user/pass & is_adult needed,
## or a couple tries of one or the other
@ -157,17 +165,23 @@ def main():
print "Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)
## Get updated title page/metadata by itself in an epub.
## Even if the title page isn't included, this carries the metadata.
titleio = StringIO()
writeStory(config,adapter,"epub",metaonly=True,outstream=titleio)
# titleio = StringIO()
# writeStory(config,adapter,"epub",metaonly=True,outstream=titleio)
newchaptersio = None
# newchaptersio = None
if not options.metaonly:
(url,chaptercount,
adapter.oldchapters,
adapter.oldimgs) = get_update_data(args[0])
writeStory(config,adapter,"epub")
## Go get the new chapters only in another epub.
newchaptersio = StringIO()
adapter.setChaptersRange(chaptercount+1,urlchaptercount)
config.set("overrides",'include_tocpage','false')
config.set("overrides",'include_titlepage','false')
writeStory(config,adapter,"epub",outstream=newchaptersio)
# newchaptersio = StringIO()
# adapter.setChaptersRange(chaptercount+1,urlchaptercount)
# config.set("overrides",'include_tocpage','false')
# config.set("overrides",'include_titlepage','false')
# writeStory(config,adapter,"epub",outstream=newchaptersio)
# out = open("testing/titleio.epub","wb")
# out.write(titleio.getvalue())
@ -182,12 +196,12 @@ def main():
# out.close()
## Merge the three epubs together.
doMerge(args[0],
[titleio,updateio,newchaptersio],
fromfirst=True,
titlenavpoints=False,
striptitletoc=False,
forceunique=False)
# doMerge(args[0],
# [titleio,updateio,newchaptersio],
# fromfirst=True,
# titlenavpoints=False,
# striptitletoc=False,
# forceunique=False)
else:
# regular download

View file

@ -16,374 +16,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import re
#import StringIO
from optparse import OptionParser
import zlib
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
from time import time
from exceptions import KeyError
from xml.dom.minidom import parse, parseString, getDOMImplementation
def main(argv):
    """Command-line entry point: parse options, then merge the input epubs
    named on the command line into a single output epub via doMerge()."""
    # read in args, anything starting with -- will be treated as --<variable>=<value>
    usage = "usage: %prog [options] <input epub> [<input epub>...]"
    parser = OptionParser(usage)
    parser.add_option("-o", "--output", dest="outputopt", default="merge.epub",
                      help="Set OUTPUT file, Default: merge.epub", metavar="OUTPUT")
    parser.add_option("-t", "--title", dest="titleopt", default=None,
                      help="Use TITLE as the metadata title. Default: '<first epub title> Anthology'", metavar="TITLE")
    parser.add_option("-d", "--description", dest="descopt", default=None,
                      help="Use DESC as the metadata description. Default: '<epub title> by <author>' for each epub.", metavar="DESC")
    parser.add_option("-a", "--author",
                      action="append", dest="authoropts", default=[],
                      help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from epubs>", metavar="AUTHOR")
    parser.add_option("-f", "--first",
                      action="store_true", dest="fromfirst", default=False,
                      help="Take all metadata from first input epub",)
    parser.add_option("-n", "--titles-in-toc",
                      action="store_true", dest="titlenavpoints",
                      help="Put an entry in the TOC for each epub, in addition to each epub's chapters.",)
    parser.add_option("-s", "--strip-title-toc",
                      action="store_true", dest="striptitletoc",
                      help="Strip any title_page.xhtml and toc_page.xhtml files.",)

    (options, args) = parser.parse_args()

    ## Add .epub if not already there.
    if not options.outputopt.lower().endswith(".epub"):
        options.outputopt=options.outputopt+".epub"
    print "output file: "+options.outputopt

    # Positional args are the input epub filenames, merged in order.
    doMerge(options.outputopt,
            args,
            options.authoropts,
            options.titleopt,
            options.descopt,
            options.fromfirst,
            options.titlenavpoints,
            options.striptitletoc)

    # (kept for reference) in-memory variant: read inputs into StringIOs,
    # merge into a StringIO, then write the result out at the end.
    # output = StringIO.StringIO()
    # files = []
    # for file in args:
    #     f = open(file,"rb")
    #     fio = StringIO.StringIO(f.read())
    #     f.close()
    #     files.append(fio)
    # doMerge(output,files,authoropts,titleopt,descopt,fromfirst,titlenavpoints,striptitletoc)
    # out = open(outputopt,"wb")
    # out.write(output.getvalue())
def doMerge(outputio,files,authoropts=[],titleopt=None,descopt=None,
            fromfirst=False,
            titlenavpoints=True,
            striptitletoc=False,
            forceunique=True):
    '''
    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    fromfirst if true, take all metadata (including author, title, desc) from first input
    titlenavpoints if true, put in a new TOC entry for each epub
    striptitletoc if true, strip out any (title|toc)_page.xhtml files
    forceunique if true, guarantee uniqueness of contents by adding a dir for each input

    Returns (source, filecount): the first input epub's dc:source metadata
    value and the number of chapter/file html files copied to the output.
    '''
    ## Python 2.5 ZipFile is rather more primative than later
    ## versions. It can operate on a file, or on a StringIO, but
    ## not on an open stream. OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.
    filecount=0
    source=None

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file. The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))

    ## Process input epubs.
    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.
    filelist = []

    booknum=1
    firstmetadom = None
    for file in files:
        # files list may contain None placeholders (e.g. no new chapters).
        if file == None : continue
        book = "%d" % booknum
        bookdir = ""
        bookid = ""
        if forceunique:
            # Per-book subdirectory and id prefix keep same-named content
            # files from different inputs from colliding.
            bookdir = "%d/" % booknum
            bookid = "a%d" % booknum
        #print "book %d" % booknum
        epub = ZipFile(file, 'r')

        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = os.path.dirname(rootfilename)
        if( len(relpath) > 0 ):
            relpath=relpath+"/"

        metadom = parseString(epub.read(rootfilename))
        if booknum==1:
            # Remember the first epub's metadata: its dc:source becomes the
            # returned source, and (with fromfirst) its whole metadata block
            # is reused for the output.
            firstmetadom = metadom.getElementsByTagName("metadata")[0]
            try:
                source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
            except:
                source=""
            #print "Source:%s"%source

        ## Save indiv book title
        booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)

        ## Save authors.
        authors=[]
        for creator in metadom.getElementsByTagName("dc:creator"):
            if( creator.getAttribute("opf:role") == "aut" ):
                authors.append(creator.firstChild.data)
        allauthors.append(authors)

        for item in metadom.getElementsByTagName("item"):
            if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
                # TOC file is only one with this type--as far as I know.
                # grab the whole navmap, deal with it later.
                tocdom = parseString(epub.read(relpath+item.getAttribute("href")))
                # Prefix ids/srcs so merged navpoints stay unique and
                # point at the relocated content files.
                for navpoint in tocdom.getElementsByTagName("navPoint"):
                    navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))
                for content in tocdom.getElementsByTagName("content"):
                    content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))
                navmaps.append(tocdom.getElementsByTagName("navMap")[0])
            else:
                id=bookid+item.getAttribute("id")
                href=bookdir+relpath+item.getAttribute("href")
                href=href.encode('utf8')
                #print "href:"+href
                if not striptitletoc or not re.match(r'.*/(title|toc)_page\.xhtml',
                                                     item.getAttribute("href")):
                    if href not in filelist:
                        try:
                            # Copy the content file into the output zip.
                            outputepub.writestr(href,
                                                epub.read(relpath+item.getAttribute("href")))
                            if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                                # Count only chapter-content html files.
                                filecount+=1
                            items.append((id,href,item.getAttribute("media-type")))
                            filelist.append(href)
                        except KeyError, ke:
                            pass # Skip missing files.

        for itemref in metadom.getElementsByTagName("itemref"):
            if not striptitletoc or not re.match(r'(title|toc)_page', itemref.getAttribute("idref")):
                itemrefs.append(bookid+itemref.getAttribute("idref"))

        booknum=booknum+1;

        if not forceunique:
            # If not forceunique, it's an epub update.
            # If there's a "calibre_bookmarks.txt", it's from reading
            # in Calibre and should be preserved.
            try:
                fn = "META-INF/calibre_bookmarks.txt"
                outputepub.writestr(fn,epub.read(fn))
            except:
                pass

    ## create content.opf file.
    uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement

    if fromfirst and firstmetadom:
        # Reuse the first input's entire metadata block and package attrs.
        metadata = firstmetadom
        firstpackage = firstmetadom.parentNode
        package.setAttribute("version",firstpackage.getAttribute("version"))
        package.setAttribute("xmlns",firstpackage.getAttribute("xmlns"))
        package.setAttribute("unique-identifier",firstpackage.getAttribute("unique-identifier"))
    else:
        # Build fresh anthology metadata from the collected titles/authors.
        package.setAttribute("version","2.0")
        package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier","epubmerge-id")
        metadata=newTag(contentdom,"metadata",
                        attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                               "xmlns:opf":"http://www.idpf.org/2007/opf"})
        metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"}))

        if( titleopt is None ):
            titleopt = booktitles[0]+" Anthology"
        metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

        # If cmdline authors, use those instead of those collected from the epubs
        # (allauthors kept for TOC & description gen below.
        if( len(authoropts) > 1 ):
            useauthors=[authoropts]
        else:
            useauthors=allauthors

        usedauthors=dict()
        for authorlist in useauthors:
            for author in authorlist:
                # De-dup authors while preserving first-seen order.
                if( not usedauthors.has_key(author) ):
                    usedauthors[author]=author
                    metadata.appendChild(newTag(contentdom,"dc:creator",
                                                attrs={"opf:role":"aut"},
                                                text=author))

        metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"}))
        metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
        metadata.appendChild(newTag(contentdom,"dc:language",text="en"))

        if not descopt:
            # created now, but not filled in until TOC generation to save loops.
            description = newTag(contentdom,"dc:description",text="Anthology containing:\n")
        else:
            description = newTag(contentdom,"dc:description",text=descopt)
        metadata.appendChild(description)

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    for item in items:
        (id,href,type)=item
        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':id,
                                           'href':href,
                                           'media-type':type}))

    spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
    package.appendChild(spine)
    for itemref in itemrefs:
        spine.appendChild(newTag(contentdom,"itemref",
                                 attrs={"idref":itemref,
                                        "linear":"yes"}))

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version","2005-1")
    ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:uid", "content":uniqueid}))
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:depth", "content":"1"}))
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:totalPageCount", "content":"0"}))
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:maxPageNumber", "content":"0"}))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    ## TOC navPoints can be nested, but this flattens them for
    ## simplicity, plus adds a navPoint for each epub.
    booknum=0
    for navmap in navmaps:
        navpoints = navmap.getElementsByTagName("navPoint")
        if titlenavpoints:
            ## Copy first navPoint of each epub, give a different id and
            ## text: bookname by authorname
            newnav = navpoints[0].cloneNode(True)
            newnav.setAttribute("id","book"+newnav.getAttribute("id"))
            ## For purposes of TOC titling & desc, use first book author
            newtext = newTag(tocncxdom,"text",text=booktitles[booknum]+" by "+allauthors[booknum][0])
            text = newnav.getElementsByTagName("text")[0]
            text.parentNode.replaceChild(newtext,text)
            tocnavMap.appendChild(newnav)

            if not descopt and not fromfirst:
                # Grow the generated "Anthology containing:" description here
                # instead of in a separate pass over the books.
                description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n"))

        for navpoint in navpoints:
            #print "navpoint:%s"%navpoint.getAttribute("id")
            if not striptitletoc or not re.match(r'(title|toc)_page',navpoint.getAttribute("id")):
                tocnavMap.appendChild(navpoint)
        booknum=booknum+1;

    ## Force strict ordering of playOrder
    playorder=1
    for navpoint in tocncxdom.getElementsByTagName("navPoint"):
        navpoint.setAttribute("playOrder","%d" % playorder)
        # The per-book "book..." title navpoints share playOrder with the
        # following chapter navpoint; only non-book entries advance it.
        if( not navpoint.getAttribute("id").startswith("book") ):
            playorder = playorder + 1

    ## content.opf written now due to description being filled in
    ## during TOC generation to save loops.
    outputepub.writestr("content.opf",contentdom.toxml('utf-8'))

    outputepub.writestr("toc.ncx",tocncxdom.toxml('utf-8'))

    # declares all the files created by Windows. otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    return (source,filecount)
## Utility method for creating new tags.
def newTag(dom,name,attrs=None,text=None):
    """Create and return a new *name* element owned by *dom*.

    attrs -- optional dict of attribute name/value pairs to set.
    text  -- optional string appended as a text-node child.
    """
    element = dom.createElement(name)
    if attrs is not None:
        for attrname, attrvalue in attrs.items():
            element.setAttribute(attrname, attrvalue)
    if text is not None:
        element.appendChild(dom.createTextNode(text))
    return element
if __name__ == "__main__":
main(sys.argv[1:])
# This module is now only a stub: the merge implementation moved to the
# stand-alone epubmerge project. Print a pointer for anyone still running it.
# (Fixed grammar in the user-facing notice: "The this" -> "This",
# possessive "it's" -> "its".)
print('''
This utility has been split out into its own project.
See: http://code.google.com/p/epubmerge/
...for a CLI epubmerge.py program and calibre plugin.
''')

View file

@ -64,7 +64,7 @@ for x in imports():
#print x
__class_list.append(sys.modules[x].getClass())
def getAdapter(config,url):
def getAdapter(config,url,fileform=None):
## fix up leading protocol.
fixedurl = re.sub(r"(?i)^[htp]+[:/]+","http://",url.strip())
if not fixedurl.startswith("http"):
@ -89,6 +89,7 @@ def getAdapter(config,url):
fixedurl = fixedurl.replace("http://","http://www.")
if cls:
adapter = cls(config,fixedurl) # raises InvalidStoryURL
adapter.setSectionOrder(adapter.getSiteDomain(),fileform)
return adapter
# No adapter found.
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
@ -133,7 +133,8 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
# sometimes poorly formated desc (<p> w/o </p>) leads
# to all labels being included.
svalue=svalue[:svalue.find('<span class="label">')]
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -220,7 +221,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return AdAstraFanficComSiteAdapter

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return ArchiveOfOurOwnOrgAdapter
@ -126,7 +126,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
a = metasoup.find('blockquote',{'class':'userstuff'})
if a != None:
self.story.setMetadata('description',a.text)
self.setDescription(url,a.text)
#self.story.setMetadata('description',a.text)
a = metasoup.find('dd',{'class':"rating tags"})
if a != None:
@ -216,7 +217,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
logging.debug('Getting chapter text from: %s' % url)
chapter=bs.BeautifulSoup('<div class="story"></div>')
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr'))
data = self._fetchUrl(url)
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))
headnotes = soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"})
if headnotes != None:
@ -257,5 +259,5 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
if None == soup:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(chapter)
return self.utf8FromSoup(url,chapter)

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
# By virtue of being recent and requiring both is_adult and user/pass,
# adapter_fanficcastletvnet.py is the best choice for learning to
@ -218,7 +218,8 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -305,4 +306,4 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)

View file

@ -24,7 +24,7 @@ import time
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FanFictionNetSiteAdapter(BaseSiteAdapter):
@ -153,7 +153,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
if 'title_t' in var:
self.story.setMetadata('title', value)
if 'summary' in var:
self.story.setMetadata('description', value)
self.setDescription(url,value)
#self.story.setMetadata('description', value)
if 'datep' in var:
self.story.setMetadata('datePublished',makeDate(value, '%m-%d-%y'))
if 'dateu' in var:
@ -270,7 +271,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
logging.debug('div id=storytext not found. data:%s'%data)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return FanFictionNetSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
@ -201,7 +201,8 @@ class FicBookNetAdapter(BaseSiteAdapter):
break
summary=soup.find('span', {'class' : 'urlize'})
self.story.setMetadata('description', summary.text)
self.setDescription(url,summary.text)
#self.story.setMetadata('description', summary.text)
# grab the text for an individual chapter.
def getChapterText(self, url):
@ -218,4 +219,4 @@ class FicBookNetAdapter(BaseSiteAdapter):
if None == chapter:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(chapter)
return self.utf8FromSoup(url,chapter)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
@ -187,7 +187,8 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
for small in storydd.findAll('small'):
small.extract() ## removes the <small> tags, leaving only the summary.
self.story.setMetadata('description',stripHTML(storydd))
self.setDescription(url,storydd)
#self.story.setMetadata('description',stripHTML(storydd))
return
@ -223,7 +224,7 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
if not data or not text:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(text)
return self.utf8FromSoup(url,text)
def getClass():
return FictionAlleyOrgSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class FicwadComSiteAdapter(BaseSiteAdapter):
@ -124,7 +124,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
# description
storydiv = soup.find("div",{"id":"story"})
self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p.string)
#self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
# most of the meta data is here:
metap = storydiv.find("p",{"class":"meta"})
@ -209,7 +210,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return FicwadComSiteAdapter

View file

@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return FimFictionNetSiteAdapter
@ -141,7 +141,13 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
description_soup.find('a', {"class":"more"}).extract()
except:
pass
self.story.setMetadata('description', description_soup.text)
# fimfic is the first site with an explicit cover image.
story_img = soup.find('img',{'class':'story_image'})
if self.getConfig('include_images') and story_img:
self.story.addImgUrl(self,self.url,story_img['src'],self._fetchUrlRaw,cover=True)
self.setDescription(self.url,description_soup.text)
#self.story.setMetadata('description', description_soup.text)
# Unfortunately, nowhere on the page is the year mentioned.
# Best effort to deal with this:
@ -171,5 +177,5 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr')).find('div', {'id' : 'chapter_container'})
if soup == None:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
def getClass():
return GayAuthorsAdapter
@ -162,7 +162,8 @@ class GayAuthorsAdapter(BaseSiteAdapter):
self.story.setMetadata('rating',rating.text)
summary = msoup.find('span', {'itemprop' : 'description'})
self.story.setMetadata('description',summary.text)
self.setDescription(self.url,summary.text)
#self.story.setMetadata('description',summary.text)
stats = msoup.find('dl',{'class':'info'})
@ -200,4 +201,4 @@ class GayAuthorsAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
@ -125,7 +125,8 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
## Finding the metadata is a bit of a pain. Desc is the only thing this color.
desctable= soup.find('table',{'bgcolor':'#f0e8e8'})
self.story.setMetadata('description',stripHTML(desctable))
self.setDescription(url,desctable)
#self.story.setMetadata('description',stripHTML(desctable))
## Finding the metadata is a bit of a pain. Most of the meta
## data is in a center.table without a bgcolor.
@ -193,7 +194,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return HarryPotterFanFictionComSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
@ -174,7 +174,8 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
# Summary: ....
m = re.match(r".*?Summary: (.*)$",metastr)
if m:
self.story.setMetadata('description', m.group(1))
self.setDescription(url, m.group(1))
#self.story.setMetadata('description', m.group(1))
# completed
m = re.match(r".*?Status: Completed.*?",metastr)
@ -210,7 +211,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
del div['style']
del div['align']
anchor.name='div'
return utf8FromSoup(anchor)
return self.utf8FromSoup(url,anchor)
else:
logging.debug('Using kludgey text find for older mediaminer story.')
@ -226,7 +227,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
soup.findAll('table',{'class':'tbbrdr'}):
tag.extract() # remove tag from soup.
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)
def getClass():

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
@ -131,7 +131,8 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'listbox':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -209,7 +210,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
if None == div:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return PotionsAndSnitchesNetSiteAdapter

View file

@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
# This function is called by the downloader in all adapter_*.py files
# in this dir to register the adapter class. So it needs to be
@ -227,7 +227,8 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if part.startswith("Summary:"):
part = part[part.find(':')+1:]
self.story.setMetadata('description',part)
self.setDescription(url,part)
#self.story.setMetadata('description',part)
# want to get the next tr of the table.
#print("%s"%titlea.parent.parent.findNextSibling('tr'))
@ -295,4 +296,4 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
if None == story:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(story)
return self.utf8FromSoup(url,story)

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
@ -164,7 +164,8 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -238,7 +239,7 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TenhawkPresentsComSiteAdapter

View file

@ -22,7 +22,7 @@ import logging
from .. import BeautifulSoup as bs
from .. import exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TestSiteAdapter(BaseSiteAdapter):
@ -78,7 +78,6 @@ class TestSiteAdapter(BaseSiteAdapter):
Some more longer description. "I suck at summaries!" "Better than it sounds!" "My first fic"
''')
self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
self.story.setMetadata('dateCreated',datetime.datetime.now())
if self.story.getMetadata('storyId') == '669':
self.story.setMetadata('dateUpdated',datetime.datetime.now())
else:
@ -127,7 +126,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
('Chapter 4',self.url+"&chapter=5"),
('Chapter 5',self.url+"&chapter=6"),
('Chapter 6',self.url+"&chapter=6"),
('Chapter 7',self.url+"&chapter=6"),
# ('Chapter 7',self.url+"&chapter=6"),
# ('Chapter 8',self.url+"&chapter=6"),
# ('Chapter 9',self.url+"&chapter=6"),
# ('Chapter 0',self.url+"&chapter=6"),
@ -178,10 +177,12 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
else:
text=u'''
<div>
<h3>Chapter</h3>
<h3>Chapter title from site</h3>
<p><center>Centered text</center></p>
<p>Lorem '''+self.crazystring+''' <i>italics</i>, <b>bold</b>, <u>underline</u> consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
br breaks<br><br>
<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl by Jim &amp; Sarah, on Flickr"><img src="http://i.imgur.com/bo8eD.png"></a><br/>
br breaks<br><br>
<hr>
horizontal rules
@ -191,7 +192,7 @@ horizontal rules
</div>
'''
soup = bs.BeautifulStoneSoup(text,selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
return utf8FromSoup(soup)
return self.utf8FromSoup(url,soup)
def getClass():
return TestSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
@ -166,7 +166,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -245,7 +245,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TheWritersCoffeeShopComSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
@ -127,6 +127,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
else:
raise e
descurl = url
if "<h2>Story Not Found</h2>" in data:
raise exceptions.StoryDoesNotExist(url)
@ -154,12 +156,14 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
# going to pull part of the meta data from author list page.
logging.debug("**AUTHOR** URL: "+self.story.getMetadata('authorUrl'))
authordata = self._fetchUrl(self.story.getMetadata('authorUrl'))
descurl=self.story.getMetadata('authorUrl')
authorsoup = bs.BeautifulSoup(authordata)
# author can have several pages, scan until we find it.
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
nextpage = 'http://'+self.host+authorsoup.find('a', {'class':'arrowf'})['href']
logging.debug("**AUTHOR** nextpage URL: "+nextpage)
authordata = self._fetchUrl(nextpage)
descurl=nextpage
authorsoup = bs.BeautifulSoup(authordata)
except urllib2.HTTPError, e:
if e.code == 404:
@ -168,7 +172,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
raise e
storydiv = authorsoup.find('div', {'id':'st'+self.story.getMetadata('storyId'), 'class':re.compile(r"storylistitem")})
self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
self.setDescription(descurl,storydiv.find('div',{'class':'storydesc'}))
#self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
self.story.setMetadata('title',stripHTML(storydiv.find('a',{'class':'storylink'})))
verticaltable = soup.find('table', {'class':'verticaltable'})
@ -238,7 +243,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
div.find('h3').extract()
except:
pass
return utf8FromSoup(div)
return self.utf8FromSoup(url,div)
def getClass():
return TwistingTheHellmouthSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwilightedNetSiteAdapter(BaseSiteAdapter):
@ -162,7 +162,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -243,7 +243,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TwilightedNetSiteAdapter

View file

@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class TwiwriteNetSiteAdapter(BaseSiteAdapter):
@ -169,7 +169,8 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
while not defaultGetattr(value,'class') == 'label':
svalue += str(value)
value = value.nextSibling
self.story.setMetadata('description',stripHTML(svalue))
self.setDescription(url,svalue)
#self.story.setMetadata('description',stripHTML(svalue))
if 'Rated' in label:
self.story.setMetadata('rating', value)
@ -255,7 +256,7 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return TwiwriteNetSiteAdapter

View file

@ -23,7 +23,7 @@ import urllib2
from .. import BeautifulSoup as bs
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
from base_adapter import BaseSiteAdapter, makeDate
class WhoficComSiteAdapter(BaseSiteAdapter):
@ -120,9 +120,10 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
# link instead to find the appropriate metadata.
a = soup.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
metadata = a.findParent('td')
metadatachunks = utf8FromSoup(metadata).split('<br />')
metadatachunks = self.utf8FromSoup(None,metadata).split('<br />')
# process metadata for this story.
self.story.setMetadata('description', metadatachunks[1])
self.setDescription(url,metadatachunks[1])
#self.story.setMetadata('description', metadatachunks[1])
# First line of the stuff with ' - ' separators
moremeta = metadatachunks[2]
@ -224,7 +225,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
if None == span:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return utf8FromSoup(span)
return self.utf8FromSoup(url,span)
def getClass():
return WhoficComSiteAdapter

View file

@ -22,6 +22,10 @@ import logging
import urllib
import urllib2 as u2
import urlparse as up
from functools import partial
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
try:
from google.appengine.api import apiproxy_stub_map
@ -66,8 +70,9 @@ class BaseSiteAdapter(Configurable):
def __init__(self, config, url):
self.config = config
Configurable.__init__(self, config)
self.addConfigSection(self.getSiteDomain())
self.addConfigSection("overrides")
self.setSectionOrder(self.getSiteDomain())
# self.addConfigSection(self.getSiteDomain())
# self.addConfigSection("overrides")
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
self.password = ""
@ -82,6 +87,8 @@ class BaseSiteAdapter(Configurable):
self.chapterUrls = [] # tuples of (chapter title,chapter url)
self.chapterFirst = None
self.chapterLast = None
self.oldchapters = None
self.oldimgs = None
## order of preference for decoding.
self.decode = ["utf8",
"Windows-1252"] # 1252 is a superset of
@ -150,6 +157,12 @@ class BaseSiteAdapter(Configurable):
headers=headers)
return self._decode(self.opener.open(req).read())
def _fetchUrlRaw(self, url, parameters=None):
if parameters != None:
return self.opener.open(url,urllib.urlencode(parameters)).read()
else:
return self.opener.open(url).read()
# parameters is a dict()
def _fetchUrl(self, url, parameters=None):
if self.getConfig('slow_down_sleep_time'):
@ -159,10 +172,7 @@ class BaseSiteAdapter(Configurable):
for sleeptime in [0, 0.5, 4, 9]:
time.sleep(sleeptime)
try:
if parameters:
return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
else:
return self._decode(self.opener.open(url).read())
return self._decode(self._fetchUrlRaw(url,parameters))
except Exception, e:
excpt=e
logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
@ -182,15 +192,32 @@ class BaseSiteAdapter(Configurable):
def getStory(self):
if not self.storyDone:
self.getStoryMetadataOnly()
for index, (title,url) in enumerate(self.chapterUrls):
if (self.chapterFirst!=None and index < self.chapterFirst) or \
(self.chapterLast!=None and index > self.chapterLast):
self.story.addChapter(removeEntities(title),
None)
else:
if self.oldchapters and index < len(self.oldchapters):
data = self.utf8FromSoup(None,
self.oldchapters[index],
partial(cachedfetch,self._fetchUrlRaw,self.oldimgs))
else:
data = self.getChapterText(url)
self.story.addChapter(removeEntities(title),
removeEntities(self.getChapterText(url)))
removeEntities(data))
self.storyDone = True
# include image, but no cover from story, add default_cover_image cover.
if self.getConfig('include_images') and \
not self.story.cover and \
self.getConfig('default_cover_image'):
self.story.addImgUrl(self,
None,
self.getConfig('default_cover_image'),
self._fetchUrlRaw,
cover=True)
return self.story
def getStoryMetadataOnly(self):
@ -235,17 +262,74 @@ class BaseSiteAdapter(Configurable):
if self.getConfig('collect_series'):
self.story.setMetadata('series','%s [%s]'%(name, num))
def setDescription(self,url,svalue):
    """Record the story description metadata.

    When keep_summary_html is configured, the description is run
    through utf8FromSoup (sanitized markup, images collected);
    otherwise all HTML is stripped to plain text.

    url    -- page the description came from (for resolving images).
    svalue -- markup string or an already-parsed soup node.
    """
    if not self.getConfig('keep_summary_html'):
        self.story.setMetadata('description', stripHTML(svalue))
        return
    # keep html: plain strings need parsing into a soup node first.
    if isinstance(svalue, (str, unicode)):
        svalue = bs.BeautifulSoup(svalue)
    self.story.setMetadata('description', self.utf8FromSoup(url, svalue))
# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
def utf8FromSoup(self,url,soup,fetch=None):
    """Sanitize a BeautifulSoup node and return it as a unicode XHTML fragment.

    Strips every tag attribute except a small whitelist, optionally
    collects <img> tags via story.addImgUrl (when include_images is
    configured), rewrites non-strict-XHTML tags (<u>, <center>) to
    classed <span>/<div>, removes empty paired tags and <body> tags.

    url   -- page the fragment came from; passed to addImgUrl so relative
             image srcs can be resolved.
    fetch -- optional function used to fetch image data; defaults to
             self._fetchUrlRaw.
    """
    if not fetch:
        fetch=self._fetchUrlRaw

    acceptable_attributes = ['href','name']
    #print("include_images:"+self.getConfig('include_images'))
    if self.getConfig('include_images'):
        # 'origsrc' keeps the original URL so images already stored in an
        # epub can be matched on later updates (see epubutils).
        acceptable_attributes.extend(('src','alt','origsrc'))
        for img in soup.findAll('img'):
            # NOTE(review): assumes every <img> has a src attribute --
            # a src-less img would raise KeyError here; confirm.
            img['origsrc']=img['src']
            # addImgUrl appears to return the rewritten local src for the
            # collected image -- presumably; verify against story.addImgUrl.
            img['src']=self.story.addImgUrl(self,url,img['src'],fetch)

    # Strip attributes on the top node itself (findAll below only visits
    # descendants).
    for attr in soup._getAttrMap().keys():
        if attr not in acceptable_attributes:
            del soup[attr] ## strip all tag attributes except href and name

    for t in soup.findAll(recursive=True):
        for attr in t._getAttrMap().keys():
            if attr not in acceptable_attributes:
                del t[attr] ## strip all tag attributes except href and name

        # these are not acceptable strict XHTML.  But we do already have
        # CSS classes of the same names defined in constants.py
        # NOTE(review): ('u') and ('center') are plain strings, not tuples,
        # so these are substring-membership tests; they work here only
        # because the intended tag names match exactly/are single chars.
        if t.name in ('u'):
            t['class']=t.name
            t.name='span'
        if t.name in ('center'):
            t['class']=t.name
            t.name='div'
        # removes paired, but empty tags.
        if t.string != None and len(t.string.strip()) == 0 :
            t.extract()

    # Don't want body tags in chapter html--writers add them.
    return re.sub(r"</?body>\r?\n?","",soup.__str__('utf8').decode('utf-8'))
fullmon = {"January":"01", "February":"02", "March":"03", "April":"04", "May":"05",
"June":"06","July":"07", "August":"08", "September":"09", "October":"10",
"November":"11", "December":"12" }
def cachedfetch(realfetch, cache, url):
    """Return data for url from cache when present, else fetch it.

    Used (via functools.partial) as a drop-in fetch function so images
    already extracted from an existing epub are reused instead of being
    re-downloaded.  A cache miss is NOT stored back into the cache.

    realfetch -- fallback fetch function taking a url.
    cache     -- dict mapping url -> previously fetched data.
    """
    # Fix: removed stray debug print on cache hits, which polluted
    # stdout for every cached image.
    if url in cache:
        return cache[url]
    return realfetch(url)
def makeDate(string,format):
# Surprise! Abstracting this turned out to be more useful than
# just saving bytes.
# fudge english month names for people whose locale is set to
# non-english. All our current sites date in english, even if
# there's non-english content.
# there's non-english content. -- ficbook.net now makes that a
# lie. It has to do something even more complicated to get
# Russian month names correct everywhere.
do_abbrev = "%b" in format
if "%B" in format or do_abbrev:
@ -259,24 +343,3 @@ def makeDate(string,format):
return datetime.datetime.strptime(string,format)
# Attribute whitelist used by the legacy module-level utf8FromSoup below.
acceptable_attributes = ['href','name']

# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
# NOTE(review): legacy module-level version; the BaseSiteAdapter method
# utf8FromSoup(url, soup) adds image handling on top of this.
def utf8FromSoup(soup):
    """Strip non-whitelisted attributes and fix non-XHTML tags in soup;
    return the result as a unicode string."""
    for t in soup.findAll(recursive=True):
        for attr in t._getAttrMap().keys():
            if attr not in acceptable_attributes:
                del t[attr] ## strip all tag attributes except href and name
        # these are not acceptable strict XHTML.  But we do already have
        # CSS classes of the same names defined in constants.py
        # NOTE(review): ('u')/('center') are strings, not tuples -- these are
        # substring-membership tests that happen to work for these tag names.
        if t.name in ('u'):
            t['class']=t.name
            t.name='span'
        if t.name in ('center'):
            t['class']=t.name
            t.name='div'
        # removes paired, but empty tags.
        if t.string != None and len(t.string.strip()) == 0 :
            t.extract()
    return soup.__str__('utf8').decode('utf-8')

View file

@ -21,16 +21,21 @@ import ConfigParser
# inherit from Configurable. The config file(s) uses ini format:
# [sections] with key:value settings.
#
# There's a [defaults] section which is overridden by the writer's
# section [epub], which is overridden by the adapter's section for each
# site.
# writer does [defaults], [www.whofic.com], [epub], [www.whofic.com:epub], [overrides]
#
# Until a writer is created, the adapter only has [defaults], [www.whofic.com], [overrides]
#
# [defaults]
# titlepage_entries: category,genre, status
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com]
# titlepage_entries: category,genre, status,dateUpdated,rating
# [epub]
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
# [www.whofic.com:epub]
# titlepage_entries: category,genre, status,datePublished
# [overrides]
# titlepage_entries: category
class Configurable(object):
@ -38,6 +43,14 @@ class Configurable(object):
self.config = config
self.sectionslist = ['defaults']
def setSectionOrder(self, site, fileform=None):
    """Rebuild the config-section lookup order for a site (and optional
    output format).

    addConfigSection prepends, so the resulting priority (highest first)
    is: overrides, site:fileform, fileform, site, defaults.
    """
    self.sectionslist = ['defaults']
    sections = [site]
    if fileform:
        sections.append(fileform)
        sections.append(site + ":" + fileform)
    sections.append("overrides")
    for section in sections:
        self.addConfigSection(section)
def addConfigSection(self,section):
    """Prepend a section name to the lookup list.

    Sections are consulted most-recently-added first, so inserting at the
    front gives the new section the highest priority.
    """
    self.sectionslist.insert(0,section)

View file

@ -0,0 +1,86 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2012, Jim Miller'
__docformat__ = 'restructuredtext en'
import re, os, traceback
from zipfile import ZipFile
from xml.dom.minidom import parseString
from . import BeautifulSoup as bs
def get_dcsource(inputio):
    """Return only the dc:source URL from the epub at inputio (None if absent)."""
    source, _, _, _ = get_update_data(inputio, getfilecount=False, getsoups=False)
    return source
def get_dcsource_chaptercount(inputio):
    """Return (dc:source URL, chapter-file count) for the epub at inputio."""
    data = get_update_data(inputio, getfilecount=True, getsoups=False)
    return (data[0], data[1])
def get_update_data(inputio,
                    getfilecount=True,
                    getsoups=True):
    """Extract update information from an existing FFDL-produced epub.

    Returns a tuple (source, filecount, soups, images):
      source    -- dc:source value from the OPF metadata (utf-8 encoded),
                   or None if the epub has none.
      filecount -- number of chapter xhtml files (0 unless getfilecount).
      soups     -- list of chapter <body> soups with the FFDL-inserted
                   <h3> title removed (empty unless getsoups).
      images    -- dict mapping original image src -> image bytes.

    Fixes vs. original: the ZipFile handle is now always closed; the bare
    except around dc:source is narrowed to Exception; newsrc/origsrc are
    pre-initialized so the image error report can't raise NameError; two
    pure-debug prints (per-item href, image cache dump) were removed.
    """
    epub = ZipFile(inputio, 'r')
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))
        firstmetadom = contentdom.getElementsByTagName("metadata")[0]
        try:
            source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
        except Exception:
            # best effort: epubs from other tools may lack dc:source.
            source=None

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        filecount = 0
        soups = [] # list of xhtml blocks
        images = {} # dict() origsrc->data
        if getfilecount:
            # spin through the manifest--only place there are item tags.
            for item in contentdom.getElementsByTagName("item"):
                # First, count the 'chapter' files.  FFDL uses file0000.xhtml,
                # but can also update epubs downloaded from Twisting the
                # Hellmouth, which uses chapter0.html.
                if( item.getAttribute("media-type") == "application/xhtml+xml" ):
                    href=relpath+item.getAttribute("href")
                    if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                        if getsoups:
                            soup = bs.BeautifulSoup(epub.read(href).decode("utf-8"))
                            for img in soup.findAll('img'):
                                # pre-bind so the error report below can't
                                # itself raise NameError.
                                newsrc = origsrc = None
                                try:
                                    newsrc=get_path_part(href)+img['src']
                                    # remove all .. and the path part above it, if
                                    # present.  Mostly for epubs edited by Sigil.
                                    newsrc = re.sub(r"([^/]+/\.\./)","",newsrc)
                                    origsrc=img['origsrc']
                                    data = epub.read(newsrc)
                                    images[origsrc] = data
                                    img['src'] = img['origsrc']
                                except Exception as e:
                                    print("Image %s not found!\n(originally:%s)"%(newsrc,origsrc))
                                    print("Exception: %s"%(unicode(e)))
                                    traceback.print_exc()
                            # keep only the chapter body; drop the
                            # FFDL-inserted <h3> chapter title.
                            soup = soup.find('body')
                            soup.find('h3').extract()
                            soups.append(soup)
                        filecount+=1

        return (source,filecount,soups,images)
    finally:
        # don't leak the zip handle (was never closed before).
        epub.close()
def get_path_part(n):
    """Return the directory portion of path *n* with a trailing '/', or '' when *n* has no directory part."""
    dirpart = os.path.dirname(n)
    if dirpart:
        dirpart = dirpart + "/"
    return dirpart

View file

@ -16,9 +16,127 @@
#
import os, re
import urlparse
from math import floor
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
# Create convert_image method depending on which graphics lib we can
# load.  Preferred: calibre, PIL, none.
# All three variants share one signature:
#   convert_image(url, data, sizes, grayscale) -> (data, ext, mime)
try:
    from calibre.utils.magick import Image
    def convert_image(url,data,sizes,grayscale):
        # Running inside calibre: use its ImageMagick wrapper.
        # Re-export as JPEG whenever we resized, grayscaled, or the source
        # wasn't already a jpg; otherwise pass the bytes through untouched.
        export = False
        img = Image()
        img.load(data)
        owidth, oheight = img.size
        nwidth, nheight = sizes
        scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
        if scaled:
            img.size = (nwidth, nheight)
            export = True
        if grayscale and img.type != "GrayscaleType":
            img.type = "GrayscaleType"
            export = True
        if normalize_format_name(img.format) != "jpg":
            export = True
        if export:
            return (img.export('JPG'),'jpg','image/jpeg')
        else:
            print("image used unchanged")
            return (data,'jpg','image/jpeg')
except:
    # No calibre routines, try for PIL for CLI.
    try:
        import Image
        from StringIO import StringIO
        def convert_image(url,data,sizes,grayscale):
            # Same contract as the calibre variant above, via PIL.
            export = False
            img = Image.open(StringIO(data))
            owidth, oheight = img.size
            nwidth, nheight = sizes
            scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
            if scaled:
                img = img.resize((nwidth, nheight),Image.ANTIALIAS)
                export = True
            if grayscale and img.mode != "L":
                # "L" is PIL's 8-bit grayscale mode.
                img = img.convert("L")
                export = True
            if normalize_format_name(img.format) != "jpg":
                export = True
            if export:
                outsio = StringIO()
                img.save(outsio,'JPEG')
                return (outsio.getvalue(),'jpg','image/jpeg')
            else:
                print("image used unchanged")
                return (data,'jpg','image/jpeg')
    except:
        # No calibre or PIL, simple pass through with mimetype.
        imagetypes = {
            'jpg':'image/jpeg',
            'jpeg':'image/jpeg',
            'png':'image/png',
            'gif':'image/gif',
            'svg':'image/svg+xml',
            }
        def convert_image(url,data,sizes,grayscale):
            # No conversion possible: guess the mime type from the url's
            # file extension (KeyError on unknown extensions).
            ext=url[url.rfind('.')+1:].lower()
            return (data,ext,imagetypes[ext])
def normalize_format_name(fmt):
    """Lower-case an image format name, folding 'jpeg' into 'jpg'; falsy input (None, '') is returned unchanged."""
    if not fmt:
        return fmt
    lowered = fmt.lower()
    return 'jpg' if lowered == 'jpeg' else lowered
def fit_image(width, height, pwidth, pheight):
'''
Fit image in box of width pwidth and height pheight.
@param width: Width of image
@param height: Height of image
@param pwidth: Width of box
@param pheight: Height of box
@return: scaled, new_width, new_height. scaled is True iff new_width and/or new_height is different from width or height.
'''
scaled = height > pheight or width > pwidth
if height > pheight:
corrf = pheight/float(height)
width, height = floor(corrf*width), pheight
if width > pwidth:
corrf = pwidth/float(width)
width, height = pwidth, floor(corrf*height)
if height > pheight:
corrf = pheight/float(height)
width, height = floor(corrf*width), pheight
return scaled, int(width), int(height)
try:
    # doesn't really matter what, just checking for appengine.
    # If the google.appengine package imports, we are running as the web
    # service; image handling is disabled there (see Story.addImgUrl).
    from google.appengine.api import apiproxy_stub_map
    is_appengine = True
except:
    is_appengine = False
# The list comes from ffnet, the only multi-language site we support
# at the time of writing. Values are taken largely from pycountry,
# but with some corrections and guesses.
@ -72,7 +190,10 @@ class Story:
self.metadata = {'version':'4.3'}
self.replacements = []
self.chapters = [] # chapters will be tuples of (title,html)
self.imgurls = []
self.imgtuples = []
self.listables = {} # some items (extratags, category, warnings & genres) are also kept as lists.
self.cover=None
def setMetadata(self, key, value):
## still keeps &lt; &lt; and &amp;
@ -153,6 +274,90 @@ class Story:
def getChapters(self):
    """Return the accumulated chapter list; each chapter is a (title, html) tuple."""
    return self.chapters
# pass fetch in from the adapter in case we need the cookies it has
# collected; fetching itself is a base_story class method.
def addImgUrl(self,configurable,parenturl,url,fetch,cover=False):
    """
    Download (and possibly resize/convert via convert_image) the image at
    *url*, register it on the story, and return the epub-internal path
    ('images/...') the img tag should reference.  *url* may be absolute or
    relative to *parenturl*.  When *cover* is True the image is stored as
    images/cover.<ext> at position 0, replacing any previous cover.
    Returns None without doing anything when running on appengine.
    """
    # appengine (web version) isn't allowed to do images--just
    # gets too big too fast and breaks things.
    if is_appengine:
        return

    if url.startswith("http") or url.startswith("file") or parenturl == None:
        imgurl = url
    else:
        parsedUrl = urlparse.urlparse(parenturl)
        if url.startswith("/") :
            # site-absolute path: keep the parent's scheme+host, swap path.
            imgurl = urlparse.urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 url,
                 '','',''))
        else:
            # page-relative path: append to the parent page's path.
            imgurl = urlparse.urlunparse(
                (parsedUrl.scheme,
                 parsedUrl.netloc,
                 parsedUrl.path + url,
                 '','',''))

    # This version, prefixing the images with the creation
    # timestamp, still allows for dup images to be detected and
    # not dup'ed in a single download.  And it prevents 0.jpg from
    # an earlier update being overwritten by the first image in a newer
    # chapter.  It does not, however, prevent dup copies of the
    # same image being d/l'ed and saved in different updates.  A
    # bit of corner case inefficiency I can live with rather than
    # scanning all the pre-existing files on update.  oldsrc is
    # being saved on img tags just in case, however.
    prefix='ffdl' #self.getMetadataRaw('dateCreated').strftime("%Y%m%d%H%M%S")

    if imgurl not in self.imgurls:
        parsedUrl = urlparse.urlparse(imgurl)
        sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ]
        (data,ext,mime) = convert_image(imgurl,
                                        fetch(imgurl),
                                        sizes,
                                        configurable.getConfig('grayscale_images'))
        # explicit cover, make the first image.
        if cover:
            if len(self.imgtuples) > 0 and 'cover' in self.imgtuples[0]['newsrc']:
                # remove existing cover, if there is one.
                del self.imgurls[0]
                del self.imgtuples[0]
            self.imgurls.insert(0,imgurl)
            newsrc = "images/cover.%s"%ext
            self.cover=newsrc
            self.imgtuples.insert(0,{'newsrc':newsrc,'mime':mime,'data':data})
        else:
            self.imgurls.append(imgurl)
            # First image: store a *copy* as the cover (not just a link),
            # because calibre will replace the cover file with its own.
            if (len(self.imgurls)==1 and configurable.getConfig('make_firstimage_cover')):
                newsrc = "images/cover.%s"%ext
                self.cover=newsrc
                self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})
                # register the url a second time so the same image also gets
                # a regular images/ffdl-N entry below.
                self.imgurls.append(imgurl)
            newsrc = "images/%s-%s.%s"%(
                prefix,
                self.imgurls.index(imgurl),
                ext)
            self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})
        print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
    else:
        # Already fetched this url during this download: reuse its entry.
        newsrc = self.imgtuples[self.imgurls.index(imgurl)]['newsrc']

    #print("===============\n%s\nimg url:%s\n============"%(newsrc,self.imgurls[-1]))

    return newsrc
def getImgUrls(self):
    """Return the stored image tuples ({'newsrc','mime','data'} dicts), one per registered image url, in order."""
    return [self.imgtuples[idx] for idx in range(len(self.imgurls))]
def __str__(self):
    """Debug representation: metadata dict plus the listable (multi-valued) entries; chapters are omitted for brevity."""
    parts = ("Metadata: " + str(self.metadata),
             "Listables: " + str(self.listables))
    return "\n".join(parts)

View file

@ -39,10 +39,11 @@ class BaseStoryWriter(Configurable):
def __init__(self, config, adapter):
Configurable.__init__(self, config)
self.addConfigSection(adapter.getSiteDomain())
self.addConfigSection(self.getFormatName())
self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
self.addConfigSection("overrides")
self.setSectionOrder(adapter.getSiteDomain(),self.getFormatName())
# self.addConfigSection(adapter.getSiteDomain())
# self.addConfigSection(self.getFormatName())
# self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
# self.addConfigSection("overrides")
self.adapter = adapter
self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.
@ -144,7 +145,7 @@ class BaseStoryWriter(Configurable):
def _write(self, out, text):
    """Encode *text* as utf-8 and write it to the *out* stream."""
    encoded = text.encode('utf8')
    out.write(encoded)
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None):
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None, NO_TITLE_ENTRY=None):
"""
Write the title page, but only include entries that there's
metadata for. START, ENTRY and END are expected to already by
@ -171,6 +172,12 @@ class BaseStoryWriter(Configurable):
label=self.getConfig(entry+"_label")
else:
label=self.titleLabels[entry]
# If the label for the title entry is empty, use the
# 'no title' option if there is one.
if label == "" and NO_TITLE_ENTRY:
TEMPLATE= NO_TITLE_ENTRY
self._write(out,TEMPLATE.substitute({'label':label,
'value':self.story.getMetadata(entry)}))

View file

@ -20,6 +20,7 @@ import string
import StringIO
import zipfile
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
import urllib
## XML isn't as forgiving as HTML, so rather than generate as strings,
## use DOM to generate the XML files.
@ -57,6 +58,10 @@ class EpubWriter(BaseStoryWriter):
self.EPUB_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')
self.EPUB_NO_TITLE_ENTRY = string.Template('''
${value}<br />
''')
self.EPUB_TITLE_PAGE_END = string.Template('''
@ -84,6 +89,10 @@ class EpubWriter(BaseStoryWriter):
self.EPUB_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')
self.EPUB_TABLE_NO_TITLE_ENTRY = string.Template('''
<tr><td colspan="2">${label}${value}</td></tr>
''')
self.EPUB_TABLE_TITLE_PAGE_END = string.Template('''
@ -252,7 +261,52 @@ class EpubWriter(BaseStoryWriter):
itemrefs = [] # list of strings -- idrefs from .opfs' spines
items.append(("ncx","toc.ncx","application/x-dtbncx+xml",None)) ## we'll generate the toc.ncx file,
## but it needs to be in the items manifest.
if self.getConfig('include_images'):
imgcount=0
for imgmap in self.story.getImgUrls():
imgfile = "OEBPS/"+imgmap['newsrc']
outputepub.writestr(imgfile,imgmap['data'])
items.append(("image%04d"%imgcount,
imgfile,
imgmap['mime'],
None))
imgcount+=1
items.append(("style","OEBPS/stylesheet.css","text/css",None))
guide = None
coverIO = None
if self.story.cover:
items.append(("cover","OEBPS/cover.xhtml","application/xhtml+xml",None))
itemrefs.append("cover")
#
# <meta name="cover" content="cover.jpg"/>
metadata.appendChild(newTag(contentdom,"meta",{"content":"image0000",
"name":"cover"}))
# cover stuff for later:
# at end of <package>:
# <guide>
# <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
# </guide>
guide = newTag(contentdom,"guide")
guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
"title":"Cover",
"href":"OEBPS/cover.xhtml"}))
coverIO = StringIO.StringIO()
coverIO.write('''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style></head><body><div>
<img src="%s" alt="cover"/>
</div></body></html>
'''%self.story.cover)
if self.getConfig("include_titlepage"):
items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
itemrefs.append("title_page")
@ -283,8 +337,17 @@ class EpubWriter(BaseStoryWriter):
spine.appendChild(newTag(contentdom,"itemref",
attrs={"idref":itemref,
"linear":"yes"}))
# guide only exists if there's a cover.
if guide:
package.appendChild(guide)
# write content.opf to zip.
outputepub.writestr("content.opf",contentdom.toxml(encoding='utf-8'))
contentxml = contentdom.toxml(encoding='utf-8')
# tweak for brain damaged Nook STR.
contentxml = contentxml.replace('<meta content="image0000" name="cover"/>',
'<meta name="cover" content="image0000"/>')
outputepub.writestr("content.opf",contentxml)
contentdom.unlink()
del contentdom
@ -320,7 +383,7 @@ class EpubWriter(BaseStoryWriter):
index=0
for item in items:
(id,href,type,title)=item
# only items to be skipped, toc.ncx, stylesheet.css, should have no title.
# only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
if title :
navPoint = newTag(tocncxdom,"navPoint",
attrs={'id':id,
@ -333,7 +396,7 @@ class EpubWriter(BaseStoryWriter):
navPoint.appendChild(newTag(tocncxdom,"content",attrs={"src":href}))
index=index+1
# write toc.ncs to zip file
# write toc.ncx to zip file
outputepub.writestr("toc.ncx",tocncxdom.toxml(encoding='utf-8'))
tocncxdom.unlink()
del tocncxdom
@ -346,19 +409,26 @@ class EpubWriter(BaseStoryWriter):
TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
else:
TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
TITLE_ENTRY = self.EPUB_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY # same, only wide in tables.
NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END
if coverIO:
outputepub.writestr("OEBPS/cover.xhtml",coverIO.getvalue())
coverIO.close()
titlepageIO = StringIO.StringIO()
self.writeTitlePage(out=titlepageIO,
START=TITLE_PAGE_START,
ENTRY=TITLE_ENTRY,
WIDE_ENTRY=WIDE_TITLE_ENTRY,
END=TITLE_PAGE_END)
END=TITLE_PAGE_END,
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
if titlepageIO.getvalue(): # will be false if no title page.
outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
titlepageIO.close()
@ -384,7 +454,7 @@ class EpubWriter(BaseStoryWriter):
fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
del fullhtml
# declares all the files created by Windows. otherwise, when
# it runs in appengine, windows unzips the files as 000 perms.
for zf in outputepub.filelist:

View file

@ -49,6 +49,10 @@ class MobiWriter(BaseStoryWriter):
self.MOBI_TITLE_ENTRY = string.Template('''
<b>${label}:</b> ${value}<br />
''')
self.MOBI_NO_TITLE_ENTRY = string.Template('''
${value}<br />
''')
self.MOBI_TITLE_PAGE_END = string.Template('''
@ -75,6 +79,10 @@ class MobiWriter(BaseStoryWriter):
self.MOBI_TABLE_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
''')
self.MOBI_TABLE_NO_TITLE_WIDE_ENTRY = string.Template('''
<tr><td colspan="2">${value}</td></tr>
''')
self.MOBI_TABLE_TITLE_PAGE_END = string.Template('''
@ -129,11 +137,13 @@ class MobiWriter(BaseStoryWriter):
TITLE_PAGE_START = self.MOBI_TABLE_TITLE_PAGE_START
TITLE_ENTRY = self.MOBI_TABLE_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.MOBI_TABLE_TITLE_WIDE_ENTRY
NO_TITLE_ENTRY = self.MOBI_TABLE_NO_TITLE_ENTRY
TITLE_PAGE_END = self.MOBI_TABLE_TITLE_PAGE_END
else:
TITLE_PAGE_START = self.MOBI_TITLE_PAGE_START
TITLE_ENTRY = self.MOBI_TITLE_ENTRY
WIDE_TITLE_ENTRY = self.MOBI_TITLE_ENTRY # same, only wide in tables.
NO_TITLE_ENTRY = self.MOBI_NO_TITLE_ENTRY
TITLE_PAGE_END = self.MOBI_TITLE_PAGE_END
titlepageIO = StringIO.StringIO()
@ -141,7 +151,8 @@ class MobiWriter(BaseStoryWriter):
START=TITLE_PAGE_START,
ENTRY=TITLE_ENTRY,
WIDE_ENTRY=WIDE_TITLE_ENTRY,
END=TITLE_PAGE_END)
END=TITLE_PAGE_END,
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
if titlepageIO.getvalue(): # will be false if no title page.
files.append(titlepageIO.getvalue())
titlepageIO.close()

View file

@ -339,7 +339,7 @@ class FanfictionDownloader(UserConfigServer):
self.redirect("/?error=custom&errtext=%s"%urlEscape("There's an error in your User Configuration: "+str(e)))
return
adapter = adapters.getAdapter(config,url)
adapter = adapters.getAdapter(config,url,format)
logging.info('Created an adaper: %s' % adapter)
if len(login) > 1:
@ -442,7 +442,7 @@ class FanfictionDownloaderTask(UserConfigServer):
try:
config = self.getUserConfig(user)
adapter = adapters.getAdapter(config,url)
adapter = adapters.getAdapter(config,url,format)
logging.info('Created an adapter: %s' % adapter)

View file

@ -108,10 +108,6 @@ extratags: FanFiction
## useful if pulling large numbers of stories or if the site is slow.
#slow_down_sleep_time:0.5
## output background color--only used by html and epub (and ignored in
## epub by many readers). Must be hex code, # will be added.
background_color: ffffff
## Use regular expressions to find and replace (or remove) metadata.
## For example, you could change Sci-Fi=>SF, remove *-Centered tags,
## etc. See http://docs.python.org/library/re.html (look for re.sub)
@ -160,13 +156,6 @@ titlepage_entries: series,category,genre,language,status,datePublished,dateUpdat
## use \r\n for line endings, the windows convention. text output only.
windows_eol: true
[txt]
## Add URLs since there aren't links.
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
## use \r\n for line endings, the windows convention. text output only.
windows_eol: true
[epub]
## epub carries the TOC in metadata.
@ -213,11 +202,41 @@ output_css:
.u {text-decoration: underline;}
.bold {font-weight: bold;}
## include images from img tags in the body and summary of
## stories. Images will be converted to jpg for size if possible.
#include_images:false
## If not set, the summary will have all html stripped for safety.
## Both this and include_images must be true to get images in the
## summary.
#keep_summary_html:false
## If set, the first image found will be made the cover image. If
## keep_summary_html is true, any images in summary will be before any
## in chapters.
#make_firstimage_cover: false
## If set, and there isn't already a cover image from the adapter or
## from make_firstimage_cover, this image will be made the cover.
## It can be either a 'file:' or 'http:' url.
## Note that if you enable make_firstimage_cover in [epub], but want
## to use default_cover_image for a specific site, use the site:format
## section, for example: [www.ficwad.com:epub]
#default_cover_image:file:///C:/Users/username/Desktop/nook/images/icon.png
#default_cover_image:http://www.somesite.com/someimage.gif
## Resize images down to width, height, preserving aspect ratio.
## Nook size, with margin.
image_max_size: 580, 725
## Change image to grayscale, if graphics library allows, to save
## space.
#grayscale_images: false
[mobi]
## mobi TOC cannot be turned off right now.
#include_tocpage: true
## Each site has a section that overrides [defaults] *and* the format
## sections.  test1.com specifically is not a real story site. Instead,
## it is a fake site for testing configuration and output. It uses