mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-04-23 07:23:23 +02:00
Merge changes from trunk.
This commit is contained in:
commit
4306bfc301
39 changed files with 786 additions and 599 deletions
4
app.yaml
4
app.yaml
|
|
@ -1,6 +1,6 @@
|
|||
# ffd-retief-hrd fanfictiondownloader
|
||||
application: fanfictiondownloader
|
||||
version: 4-3-2
|
||||
application: ffd-retief-hrd
|
||||
version: 4-3-3
|
||||
runtime: python27
|
||||
api_version: 1
|
||||
threadsafe: true
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class FanFictionDownLoaderBase(InterfaceActionBase):
|
|||
description = 'UI plugin to download FanFiction stories from various sites.'
|
||||
supported_platforms = ['windows', 'osx', 'linux']
|
||||
author = 'Jim Miller'
|
||||
version = (1, 4, 6)
|
||||
version = (1, 5, 0)
|
||||
minimum_calibre_version = (0, 8, 30)
|
||||
|
||||
#: This field defines the GUI plugin class that contains all the code
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ all_prefs = JSONConfig('plugins/fanfictiondownloader_plugin')
|
|||
# take from here.
|
||||
all_prefs.defaults['personal.ini'] = get_resources('plugin-example.ini')
|
||||
all_prefs.defaults['updatemeta'] = True
|
||||
all_prefs.defaults['updatecover'] = False
|
||||
all_prefs.defaults['keeptags'] = False
|
||||
all_prefs.defaults['urlsfromclip'] = True
|
||||
all_prefs.defaults['updatedefault'] = True
|
||||
|
|
@ -53,6 +54,7 @@ all_prefs.defaults['custom_cols'] = {}
|
|||
# when config is called for the first time on a library.
|
||||
copylist = ['personal.ini',
|
||||
'updatemeta',
|
||||
'updatecover',
|
||||
'keeptags',
|
||||
'urlsfromclip',
|
||||
'updatedefault',
|
||||
|
|
@ -144,6 +146,7 @@ class ConfigWidget(QWidget):
|
|||
prefs['fileform'] = unicode(self.basic_tab.fileform.currentText())
|
||||
prefs['collision'] = unicode(self.basic_tab.collision.currentText())
|
||||
prefs['updatemeta'] = self.basic_tab.updatemeta.isChecked()
|
||||
prefs['updatecover'] = self.basic_tab.updatecover.isChecked()
|
||||
prefs['keeptags'] = self.basic_tab.keeptags.isChecked()
|
||||
prefs['urlsfromclip'] = self.basic_tab.urlsfromclip.isChecked()
|
||||
prefs['updatedefault'] = self.basic_tab.updatedefault.isChecked()
|
||||
|
|
@ -234,6 +237,11 @@ class BasicTab(QWidget):
|
|||
self.updatemeta.setChecked(prefs['updatemeta'])
|
||||
self.l.addWidget(self.updatemeta)
|
||||
|
||||
self.updatecover = QCheckBox('Update Cover when Updating Metadata?',self)
|
||||
self.updatecover.setToolTip('Update cover image when metadata is updated. EPUB only.')
|
||||
self.updatecover.setChecked(prefs['updatecover'])
|
||||
self.l.addWidget(self.updatecover)
|
||||
|
||||
self.keeptags = QCheckBox('Keep Existing Tags when Updating Metadata?',self)
|
||||
self.keeptags.setToolTip('Existing tags will be kept and any new tags added.\nCompleted and In-Progress tags will be still be updated, if known.\nLast Updated tags will be updated if lastupdate in include_subject_tags.')
|
||||
self.keeptags.setChecked(prefs['keeptags'])
|
||||
|
|
|
|||
|
|
@ -1,30 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Jim Miller'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from zipfile import ZipFile
|
||||
|
||||
from xml.dom.minidom import parseString
|
||||
|
||||
def get_dcsource(inputio):
    """Return the dc:source value stored in an epub's OPF metadata.

    inputio -- a file name or file-like object that ZipFile can open
               for reading.

    Returns the text of the first dc:source element found inside the
    first metadata element of the OPF file, encoded as UTF-8.  Returns
    None when there is no dc:source element or it has no text.
    """
    epub = ZipFile(inputio, 'r')
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        metadom = parseString(epub.read(rootfilename))
    finally:
        ## Always release the archive, even if the epub is malformed.
        epub.close()

    firstmetadom = metadom.getElementsByTagName("metadata")[0]
    try:
        source = firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
    except (IndexError, AttributeError):
        ## IndexError: no dc:source element at all.
        ## AttributeError: dc:source present but empty (firstChild is
        ## None) or its first child is not a text node.
        source = None

    return source
|
||||
|
|
@ -15,6 +15,10 @@ from datetime import datetime
|
|||
|
||||
from PyQt4.Qt import (QApplication, QMenu, QToolButton)
|
||||
|
||||
from PyQt4.Qt import QPixmap, Qt
|
||||
from PyQt4.QtCore import QBuffer
|
||||
|
||||
|
||||
from calibre.ptempfile import PersistentTemporaryFile, PersistentTemporaryDirectory, remove_dir
|
||||
from calibre.ebooks.metadata import MetaInformation, authors_to_string
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
|
|
@ -30,8 +34,9 @@ from calibre_plugins.fanfictiondownloader_plugin.common_utils import (set_plugin
|
|||
create_menu_action_unique, get_library_uuid)
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
|
||||
from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
|
||||
from calibre_plugins.fanfictiondownloader_plugin.dcsource import get_dcsource
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.htmlcleanup import stripHTML
|
||||
#from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_dcsource, get_dcsource_chaptercount
|
||||
|
||||
from calibre_plugins.fanfictiondownloader_plugin.config import (prefs, permitted_values)
|
||||
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (
|
||||
|
|
@ -93,6 +98,8 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
# are not found in the zip file will result in null QIcons.
|
||||
icon = get_icon('images/icon.png')
|
||||
|
||||
#self.qaction.setText('FFDL')
|
||||
|
||||
# The qaction is automatically created from the action_spec defined
|
||||
# above
|
||||
self.qaction.setIcon(icon)
|
||||
|
|
@ -408,7 +415,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
ffdlconfig = SafeConfigParser()
|
||||
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
|
||||
ffdlconfig.readfp(StringIO(prefs['personal.ini']))
|
||||
adapter = adapters.getAdapter(ffdlconfig,url)
|
||||
adapter = adapters.getAdapter(ffdlconfig,url,fileform)
|
||||
|
||||
options['personal.ini'] = prefs['personal.ini']
|
||||
|
||||
|
|
@ -440,7 +447,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
book['author_sort'] = book['author'] = story.getMetadata("author", removeallentities=True)
|
||||
book['publisher'] = story.getMetadata("site")
|
||||
book['tags'] = writer.getTags()
|
||||
book['comments'] = story.getMetadata("description") #, removeallentities=True) comments handles entities better.
|
||||
book['comments'] = stripHTML(story.getMetadata("description")) #, removeallentities=True) comments handles entities better.
|
||||
book['series'] = story.getMetadata("series")
|
||||
|
||||
# adapter.opener is the element with a threadlock. But del
|
||||
|
|
@ -517,13 +524,15 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
# 'book' can exist without epub. If there's no existing epub,
|
||||
# let it go and it will download it.
|
||||
if db.has_format(book_id,fileform,index_is_id=True):
|
||||
toupdateio = StringIO()
|
||||
(epuburl,chaptercount) = doMerge(toupdateio,
|
||||
[StringIO(db.format(book_id,'EPUB',
|
||||
index_is_id=True))],
|
||||
titlenavpoints=False,
|
||||
striptitletoc=True,
|
||||
forceunique=False)
|
||||
#toupdateio = StringIO()
|
||||
(epuburl,chaptercount) = get_dcsource_chaptercount(StringIO(db.format(book_id,'EPUB',
|
||||
index_is_id=True)))
|
||||
# (epuburl,chaptercount) = doMerge(toupdateio,
|
||||
# [StringIO(db.format(book_id,'EPUB',
|
||||
# index_is_id=True))],
|
||||
# titlenavpoints=False,
|
||||
# striptitletoc=True,
|
||||
# forceunique=False)
|
||||
urlchaptercount = int(story.getMetadata('numChapters'))
|
||||
if chaptercount == urlchaptercount:
|
||||
if collision == UPDATE:
|
||||
|
|
@ -630,7 +639,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
|
||||
if options['collision'] == CALIBREONLY or \
|
||||
(options['updatemeta'] and book['good']):
|
||||
self._update_metadata(db, book['calibre_id'], book, mi)
|
||||
self._update_metadata(db, book['calibre_id'], book, mi, options)
|
||||
|
||||
def _update_books_completed(self, book_list, options={}):
|
||||
|
||||
|
|
@ -649,6 +658,9 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
self.gui.library_view.model().current_changed(current, self.previous)
|
||||
self.gui.tags_view.recount()
|
||||
|
||||
if self.gui.cover_flow:
|
||||
self.gui.cover_flow.dataChanged()
|
||||
|
||||
self.gui.status_bar.show_message(_('Finished Adding/Updating %d books.'%(len(update_list) + len(add_list))), 3000)
|
||||
|
||||
if len(update_list) + len(add_list) != len(book_list):
|
||||
|
|
@ -729,7 +741,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
|
||||
return book_id
|
||||
|
||||
def _update_metadata(self, db, book_id, book, mi):
|
||||
def _update_metadata(self, db, book_id, book, mi, options):
|
||||
if prefs['keeptags']:
|
||||
old_tags = db.get_tags(book_id)
|
||||
# remove old Completed/In-Progress only if there's a new one.
|
||||
|
|
@ -748,6 +760,13 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
oldmi = db.get_metadata(book_id,index_is_id=True)
|
||||
if not oldmi.languages:
|
||||
mi.languages=['eng']
|
||||
|
||||
if options['fileform'] == 'epub' and prefs['updatecover']:
|
||||
existingepub = db.format(book_id,'EPUB',index_is_id=True, as_file=True)
|
||||
epubmi = get_metadata(existingepub,'EPUB')
|
||||
if epubmi.cover_data[1] is not None:
|
||||
db.set_cover(book_id, epubmi.cover_data[1])
|
||||
#mi.cover = epubmi.cover_data[1]
|
||||
|
||||
db.set_metadata(book_id,mi)
|
||||
|
||||
|
|
@ -780,7 +799,7 @@ class FanFictionDownLoaderPlugin(InterfaceAction):
|
|||
if meta == 'status-I':
|
||||
val = book['all_metadata']['status'] == 'In-Progress'
|
||||
db.set_custom(book_id, val, label=label, commit=False)
|
||||
|
||||
|
||||
db.commit()
|
||||
|
||||
def _get_clean_reading_lists(self,lists):
|
||||
|
|
|
|||
|
|
@ -23,7 +23,8 @@ from calibre.utils.logging import Log
|
|||
from calibre_plugins.fanfictiondownloader_plugin.dialogs import (NotGoingToDownload,
|
||||
OVERWRITE, OVERWRITEALWAYS, UPDATE, UPDATEALWAYS, ADDNEW, SKIP, CALIBREONLY)
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader import adapters, writers, exceptions
|
||||
from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
|
||||
#from calibre_plugins.fanfictiondownloader_plugin.epubmerge import doMerge
|
||||
from calibre_plugins.fanfictiondownloader_plugin.fanficdownloader.epubutils import get_update_data
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
#
|
||||
|
|
@ -110,7 +111,7 @@ def do_download_for_worker(book,options):
|
|||
ffdlconfig.readfp(StringIO(get_resources("plugin-defaults.ini")))
|
||||
ffdlconfig.readfp(StringIO(options['personal.ini']))
|
||||
|
||||
adapter = adapters.getAdapter(ffdlconfig,book['url'])
|
||||
adapter = adapters.getAdapter(ffdlconfig,book['url'],options['fileform'])
|
||||
adapter.is_adult = book['is_adult']
|
||||
adapter.username = book['username']
|
||||
adapter.password = book['password']
|
||||
|
|
@ -136,38 +137,44 @@ def do_download_for_worker(book,options):
|
|||
elif 'epub_for_update' in book and options['collision'] in (UPDATE, UPDATEALWAYS):
|
||||
|
||||
urlchaptercount = int(story.getMetadata('numChapters'))
|
||||
## First, get existing epub with titlepage and tocpage stripped.
|
||||
updateio = StringIO()
|
||||
(epuburl,chaptercount) = doMerge(updateio,
|
||||
[book['epub_for_update']],
|
||||
titlenavpoints=False,
|
||||
striptitletoc=True,
|
||||
forceunique=False)
|
||||
(url,chaptercount,
|
||||
adapter.oldchapters,
|
||||
adapter.oldimgs) = get_update_data(book['epub_for_update'])
|
||||
|
||||
print("Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount))
|
||||
print("write to %s"%outfile)
|
||||
|
||||
## Get updated title page/metadata by itself in an epub.
|
||||
## Even if the title page isn't included, this carries the metadata.
|
||||
titleio = StringIO()
|
||||
writer.writeStory(outstream=titleio,metaonly=True)
|
||||
writer.writeStory(outfilename=outfile, forceOverwrite=True)
|
||||
|
||||
## First, get existing epub with titlepage and tocpage stripped.
|
||||
# updateio = StringIO()
|
||||
# (epuburl,chaptercount) = doMerge(updateio,
|
||||
# [book['epub_for_update']],
|
||||
# titlenavpoints=False,
|
||||
# striptitletoc=True,
|
||||
# forceunique=False)
|
||||
# ## Get updated title page/metadata by itself in an epub.
|
||||
# ## Even if the title page isn't included, this carries the metadata.
|
||||
# titleio = StringIO()
|
||||
# writer.writeStory(outstream=titleio,metaonly=True)
|
||||
|
||||
newchaptersio = None
|
||||
if urlchaptercount > chaptercount :
|
||||
## Go get the new chapters
|
||||
newchaptersio = StringIO()
|
||||
adapter.setChaptersRange(chaptercount+1,urlchaptercount)
|
||||
# newchaptersio = None
|
||||
# if urlchaptercount > chaptercount :
|
||||
# ## Go get the new chapters
|
||||
# newchaptersio = StringIO()
|
||||
# adapter.setChaptersRange(chaptercount+1,urlchaptercount)
|
||||
|
||||
adapter.config.set("overrides",'include_tocpage','false')
|
||||
adapter.config.set("overrides",'include_titlepage','false')
|
||||
writer.writeStory(outstream=newchaptersio)
|
||||
# adapter.config.set("overrides",'include_tocpage','false')
|
||||
# adapter.config.set("overrides",'include_titlepage','false')
|
||||
# writer.writeStory(outstream=newchaptersio)
|
||||
|
||||
## Merge the three epubs together.
|
||||
doMerge(outfile,
|
||||
[titleio,updateio,newchaptersio],
|
||||
fromfirst=True,
|
||||
titlenavpoints=False,
|
||||
striptitletoc=False,
|
||||
forceunique=False)
|
||||
# ## Merge the three epubs together.
|
||||
# doMerge(outfile,
|
||||
# [titleio,updateio,newchaptersio],
|
||||
# fromfirst=True,
|
||||
# titlenavpoints=False,
|
||||
# striptitletoc=False,
|
||||
# forceunique=False)
|
||||
|
||||
book['comment'] = 'Update %s completed, added %s chapters for %s total.'%\
|
||||
(options['fileform'],(urlchaptercount-chaptercount),urlchaptercount)
|
||||
|
|
|
|||
32
defaults.ini
32
defaults.ini
|
|
@ -126,7 +126,6 @@ extratags: FanFiction
|
|||
|
||||
## number of seconds to sleep between calls to the story site. May be
|
||||
## useful if pulling large numbers of stories or if the site is slow.
|
||||
## Primarily for commandline.
|
||||
#slow_down_sleep_time:0.5
|
||||
|
||||
## For use only with stand-alone CLI version--run a command on the
|
||||
|
|
@ -231,6 +230,37 @@ output_css:
|
|||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
|
||||
## include images from img tags in the body and summary of
|
||||
## stories. Images will be converted to jpg for size if possible.
|
||||
#include_images:false
|
||||
|
||||
## If not set, the summary will have all html stripped for safety.
|
||||
## Both this and include_images must be true to get images in the
|
||||
## summary.
|
||||
#keep_summary_html:false
|
||||
|
||||
## If set, the first image found will be made the cover image. If
|
||||
## keep_summary_html is true, any images in summary will be before any
|
||||
## in chapters.
|
||||
#make_firstimage_cover: false
|
||||
|
||||
## If set, and there isn't already a cover image from the adapter or
|
||||
## from make_firstimage_cover, this image will be made the cover.
|
||||
## It can be either a 'file:' or 'http:' url.
|
||||
## Note that if you enable make_firstimage_cover in [epub], but want
|
||||
## to use default_cover_image for a specific site, use the site:format
|
||||
## section, for example: [www.ficwad.com:epub]
|
||||
#default_cover_image:file:///C:/Users/username/Desktop/nook/images/icon.png
|
||||
#default_cover_image:http://www.somesite.com/someimage.gif
|
||||
|
||||
## Resize images down to width, height, preserving aspect ratio.
|
||||
## Nook size, with margin.
|
||||
image_max_size: 580, 725
|
||||
|
||||
## Change image to grayscale, if graphics library allows, to save
|
||||
## space.
|
||||
#grayscale_images: false
|
||||
|
||||
[mobi]
|
||||
## mobi TOC cannot be turned off right now.
|
||||
#include_tocpage: true
|
||||
|
|
|
|||
|
|
@ -25,19 +25,16 @@ from StringIO import StringIO
|
|||
from optparse import OptionParser
|
||||
import getpass
|
||||
import string
|
||||
|
||||
import ConfigParser
|
||||
from subprocess import call
|
||||
|
||||
from epubmerge import doMerge
|
||||
from fanficdownloader import adapters,writers,exceptions
|
||||
from fanficdownloader.epubutils import get_dcsource_chaptercount, get_update_data
|
||||
|
||||
if sys.version_info < (2, 5):
|
||||
print "This program requires Python 2.5 or newer."
|
||||
sys.exit(1)
|
||||
|
||||
from fanficdownloader import adapters,writers,exceptions
|
||||
|
||||
import ConfigParser
|
||||
|
||||
def writeStory(config,adapter,writeformat,metaonly=False,outstream=None):
    """Write the adapter's story using the writer for writeformat.

    The writer is obtained from writers.getWriter(writeformat, config,
    adapter); metaonly and outstream are passed straight through to
    that writer's writeStory().
    """
    story_writer = writers.getWriter(writeformat, config, adapter)
    story_writer.writeStory(metaonly=metaonly, outstream=outstream)
|
||||
|
|
@ -116,19 +113,30 @@ def main():
|
|||
try:
|
||||
## Attempt to update an existing epub.
|
||||
if options.update:
|
||||
updateio = StringIO()
|
||||
(url,chaptercount) = doMerge(updateio,
|
||||
args,
|
||||
titlenavpoints=False,
|
||||
striptitletoc=True,
|
||||
forceunique=False)
|
||||
# updateio = StringIO()
|
||||
# (url,chaptercount) = doMerge(updateio,
|
||||
# args,
|
||||
# titlenavpoints=False,
|
||||
# striptitletoc=True,
|
||||
# forceunique=False)
|
||||
(url,chaptercount) = get_dcsource_chaptercount(args[0])
|
||||
print "Updating %s, URL: %s" % (args[0],url)
|
||||
output_filename = args[0]
|
||||
config.set("overrides","output_filename",args[0])
|
||||
else:
|
||||
url = args[0]
|
||||
|
||||
adapter = adapters.getAdapter(config,url)
|
||||
adapter = adapters.getAdapter(config,url,options.format)
|
||||
|
||||
## Check for include_images and absence of PIL, give warning.
|
||||
if adapter.getConfig('include_images'):
|
||||
try:
|
||||
import Image
|
||||
except:
|
||||
print "You have include_images enabled, but Python Image Library(PIL) isn't found.\nImages will be included full size in original format.\nContinue? (y/n)?"
|
||||
if not sys.stdin.readline().strip().lower().startswith('y'):
|
||||
return
|
||||
|
||||
|
||||
## three tries, that's enough if both user/pass & is_adult needed,
|
||||
## or a couple tries of one or the other
|
||||
|
|
@ -157,17 +165,23 @@ def main():
|
|||
print "Do update - epub(%d) vs url(%d)" % (chaptercount, urlchaptercount)
|
||||
## Get updated title page/metadata by itself in an epub.
|
||||
## Even if the title page isn't included, this carries the metadata.
|
||||
titleio = StringIO()
|
||||
writeStory(config,adapter,"epub",metaonly=True,outstream=titleio)
|
||||
# titleio = StringIO()
|
||||
# writeStory(config,adapter,"epub",metaonly=True,outstream=titleio)
|
||||
|
||||
newchaptersio = None
|
||||
# newchaptersio = None
|
||||
if not options.metaonly:
|
||||
(url,chaptercount,
|
||||
adapter.oldchapters,
|
||||
adapter.oldimgs) = get_update_data(args[0])
|
||||
|
||||
writeStory(config,adapter,"epub")
|
||||
|
||||
## Go get the new chapters only in another epub.
|
||||
newchaptersio = StringIO()
|
||||
adapter.setChaptersRange(chaptercount+1,urlchaptercount)
|
||||
config.set("overrides",'include_tocpage','false')
|
||||
config.set("overrides",'include_titlepage','false')
|
||||
writeStory(config,adapter,"epub",outstream=newchaptersio)
|
||||
# newchaptersio = StringIO()
|
||||
# adapter.setChaptersRange(chaptercount+1,urlchaptercount)
|
||||
# config.set("overrides",'include_tocpage','false')
|
||||
# config.set("overrides",'include_titlepage','false')
|
||||
# writeStory(config,adapter,"epub",outstream=newchaptersio)
|
||||
|
||||
# out = open("testing/titleio.epub","wb")
|
||||
# out.write(titleio.getvalue())
|
||||
|
|
@ -182,12 +196,12 @@ def main():
|
|||
# out.close()
|
||||
|
||||
## Merge the three epubs together.
|
||||
doMerge(args[0],
|
||||
[titleio,updateio,newchaptersio],
|
||||
fromfirst=True,
|
||||
titlenavpoints=False,
|
||||
striptitletoc=False,
|
||||
forceunique=False)
|
||||
# doMerge(args[0],
|
||||
# [titleio,updateio,newchaptersio],
|
||||
# fromfirst=True,
|
||||
# titlenavpoints=False,
|
||||
# striptitletoc=False,
|
||||
# forceunique=False)
|
||||
|
||||
else:
|
||||
# regular download
|
||||
|
|
|
|||
374
epubmerge.py
374
epubmerge.py
|
|
@ -16,374 +16,10 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
#import StringIO
|
||||
from optparse import OptionParser
|
||||
|
||||
import zlib
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
from time import time
|
||||
|
||||
from exceptions import KeyError
|
||||
|
||||
from xml.dom.minidom import parse, parseString, getDOMImplementation
|
||||
|
||||
def main(argv):
    """Command-line entry point: parse the options and merge the input epubs.

    NOTE(review): argv is accepted but never consulted; parse_args() is
    called without arguments, so optparse reads sys.argv[1:] itself.
    Confirm against the caller before changing this.
    """
    parser = OptionParser("usage: %prog [options] <input epub> [<input epub>...]")

    # (flags, keyword settings) for every supported option, registered
    # in the same order they appear in --help.
    option_specs = [
        (("-o", "--output"),
         dict(dest="outputopt", default="merge.epub",
              help="Set OUTPUT file, Default: merge.epub", metavar="OUTPUT")),
        (("-t", "--title"),
         dict(dest="titleopt", default=None,
              help="Use TITLE as the metadata title. Default: '<first epub title> Anthology'", metavar="TITLE")),
        (("-d", "--description"),
         dict(dest="descopt", default=None,
              help="Use DESC as the metadata description. Default: '<epub title> by <author>' for each epub.", metavar="DESC")),
        (("-a", "--author"),
         dict(action="append", dest="authoropts", default=[],
              help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from epubs>", metavar="AUTHOR")),
        (("-f", "--first"),
         dict(action="store_true", dest="fromfirst", default=False,
              help="Take all metadata from first input epub")),
        (("-n", "--titles-in-toc"),
         dict(action="store_true", dest="titlenavpoints",
              help="Put an entry in the TOC for each epub, in addition to each epub's chapters.")),
        (("-s", "--strip-title-toc"),
         dict(action="store_true", dest="striptitletoc",
              help="Strip any title_page.xhtml and toc_page.xhtml files.")),
    ]
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    (options, args) = parser.parse_args()

    ## Add .epub if not already there.
    output_name = options.outputopt
    if not output_name.lower().endswith(".epub"):
        output_name = output_name + ".epub"

    # Single-argument parenthesised form prints the same text under the
    # Python 2 print statement.
    print("output file: "+output_name)

    doMerge(output_name,
            args,
            options.authoropts,
            options.titleopt,
            options.descopt,
            options.fromfirst,
            options.titlenavpoints,
            options.striptitletoc)
|
||||
|
||||
# output = StringIO.StringIO()
|
||||
# files = []
|
||||
# for file in args:
|
||||
# f = open(file,"rb")
|
||||
# fio = StringIO.StringIO(f.read())
|
||||
# f.close()
|
||||
# files.append(fio)
|
||||
|
||||
# doMerge(output,files,authoropts,titleopt,descopt,fromfirst,titlenavpoints,striptitletoc)
|
||||
|
||||
# out = open(outputopt,"wb")
|
||||
# out.write(output.getvalue())
|
||||
|
||||
def doMerge(outputio,files,authoropts=[],titleopt=None,descopt=None,
|
||||
fromfirst=False,
|
||||
titlenavpoints=True,
|
||||
striptitletoc=False,
|
||||
forceunique=True):
|
||||
'''
|
||||
outputio = output file name or StringIO.
|
||||
files = list of input file names or StringIOs.
|
||||
authoropts = list of authors to use, otherwise add from all input
|
||||
titleopt = title, otherwise '<first title> Anthology'
|
||||
descopt = description, otherwise '<title> by <author>' list for all input
|
||||
fromfirst if true, take all metadata (including author, title, desc) from first input
|
||||
titlenavpoints if true, put in a new TOC entry for each epub
|
||||
striptitletoc if true, strip out any (title|toc)_page.xhtml files
|
||||
forceunique if true, guarantee uniqueness of contents by adding a dir for each input
|
||||
'''
|
||||
## Python 2.5 ZipFile is rather more primitive than later
|
||||
## versions. It can operate on a file, or on a StringIO, but
|
||||
## not on an open stream. OTOH, I suspect we would have had
|
||||
## problems with closing and opening again to change the
|
||||
## compression type anyway.
|
||||
|
||||
filecount=0
|
||||
source=None
|
||||
|
||||
## Write mimetype file, must be first and uncompressed.
|
||||
## Older versions of python(2.4/5) don't allow you to specify
|
||||
## compression by individual file.
|
||||
## Overwrite if existing output file.
|
||||
outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
|
||||
outputepub.debug = 3
|
||||
outputepub.writestr("mimetype", "application/epub+zip")
|
||||
outputepub.close()
|
||||
|
||||
## Re-open file for content.
|
||||
outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
|
||||
outputepub.debug = 3
|
||||
|
||||
## Create META-INF/container.xml file. The only thing it does is
|
||||
## point to content.opf
|
||||
containerdom = getDOMImplementation().createDocument(None, "container", None)
|
||||
containertop = containerdom.documentElement
|
||||
containertop.setAttribute("version","1.0")
|
||||
containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
|
||||
rootfiles = containerdom.createElement("rootfiles")
|
||||
containertop.appendChild(rootfiles)
|
||||
rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
|
||||
"media-type":"application/oebps-package+xml"}))
|
||||
outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))
|
||||
|
||||
## Process input epubs.
|
||||
|
||||
items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
|
||||
items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
|
||||
## but it needs to be in the items manifest.
|
||||
itemrefs = [] # list of strings -- idrefs from .opfs' spines
|
||||
navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
|
||||
|
||||
booktitles = [] # list of strings -- Each book's title
|
||||
allauthors = [] # list of lists of strings -- Each book's list of authors.
|
||||
|
||||
filelist = []
|
||||
|
||||
booknum=1
|
||||
firstmetadom = None
|
||||
for file in files:
|
||||
if file == None : continue
|
||||
|
||||
book = "%d" % booknum
|
||||
bookdir = ""
|
||||
bookid = ""
|
||||
if forceunique:
|
||||
bookdir = "%d/" % booknum
|
||||
bookid = "a%d" % booknum
|
||||
#print "book %d" % booknum
|
||||
|
||||
epub = ZipFile(file, 'r')
|
||||
|
||||
## Find the .opf file.
|
||||
container = epub.read("META-INF/container.xml")
|
||||
containerdom = parseString(container)
|
||||
rootfilenodelist = containerdom.getElementsByTagName("rootfile")
|
||||
rootfilename = rootfilenodelist[0].getAttribute("full-path")
|
||||
|
||||
## Save the path to the .opf file--hrefs inside it are relative to it.
|
||||
relpath = os.path.dirname(rootfilename)
|
||||
if( len(relpath) > 0 ):
|
||||
relpath=relpath+"/"
|
||||
|
||||
metadom = parseString(epub.read(rootfilename))
|
||||
if booknum==1:
|
||||
firstmetadom = metadom.getElementsByTagName("metadata")[0]
|
||||
try:
|
||||
source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
|
||||
except:
|
||||
source=""
|
||||
#print "Source:%s"%source
|
||||
|
||||
## Save indiv book title
|
||||
booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
|
||||
|
||||
## Save authors.
|
||||
authors=[]
|
||||
for creator in metadom.getElementsByTagName("dc:creator"):
|
||||
if( creator.getAttribute("opf:role") == "aut" ):
|
||||
authors.append(creator.firstChild.data)
|
||||
allauthors.append(authors)
|
||||
|
||||
for item in metadom.getElementsByTagName("item"):
|
||||
if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
|
||||
# TOC file is only one with this type--as far as I know.
|
||||
# grab the whole navmap, deal with it later.
|
||||
tocdom = parseString(epub.read(relpath+item.getAttribute("href")))
|
||||
|
||||
for navpoint in tocdom.getElementsByTagName("navPoint"):
|
||||
navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))
|
||||
|
||||
for content in tocdom.getElementsByTagName("content"):
|
||||
content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))
|
||||
|
||||
navmaps.append(tocdom.getElementsByTagName("navMap")[0])
|
||||
else:
|
||||
id=bookid+item.getAttribute("id")
|
||||
href=bookdir+relpath+item.getAttribute("href")
|
||||
href=href.encode('utf8')
|
||||
#print "href:"+href
|
||||
if not striptitletoc or not re.match(r'.*/(title|toc)_page\.xhtml',
|
||||
item.getAttribute("href")):
|
||||
if href not in filelist:
|
||||
try:
|
||||
outputepub.writestr(href,
|
||||
epub.read(relpath+item.getAttribute("href")))
|
||||
if re.match(r'.*/(file|chapter)\d+\.x?html',href):
|
||||
filecount+=1
|
||||
items.append((id,href,item.getAttribute("media-type")))
|
||||
filelist.append(href)
|
||||
except KeyError, ke:
|
||||
pass # Skip missing files.
|
||||
|
||||
for itemref in metadom.getElementsByTagName("itemref"):
|
||||
|
||||
if not striptitletoc or not re.match(r'(title|toc)_page', itemref.getAttribute("idref")):
|
||||
itemrefs.append(bookid+itemref.getAttribute("idref"))
|
||||
|
||||
booknum=booknum+1;
|
||||
if not forceunique:
|
||||
# If not forceunique, it's an epub update.
|
||||
# If there's a "calibre_bookmarks.txt", it's from reading
|
||||
# in Calibre and should be preserved.
|
||||
try:
|
||||
fn = "META-INF/calibre_bookmarks.txt"
|
||||
outputepub.writestr(fn,epub.read(fn))
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
## create content.opf file.
|
||||
uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme.
|
||||
contentdom = getDOMImplementation().createDocument(None, "package", None)
|
||||
package = contentdom.documentElement
|
||||
if fromfirst and firstmetadom:
|
||||
metadata = firstmetadom
|
||||
firstpackage = firstmetadom.parentNode
|
||||
package.setAttribute("version",firstpackage.getAttribute("version"))
|
||||
package.setAttribute("xmlns",firstpackage.getAttribute("xmlns"))
|
||||
package.setAttribute("unique-identifier",firstpackage.getAttribute("unique-identifier"))
|
||||
else:
|
||||
package.setAttribute("version","2.0")
|
||||
package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
|
||||
package.setAttribute("unique-identifier","epubmerge-id")
|
||||
metadata=newTag(contentdom,"metadata",
|
||||
attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
|
||||
"xmlns:opf":"http://www.idpf.org/2007/opf"})
|
||||
metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"}))
|
||||
if( titleopt is None ):
|
||||
titleopt = booktitles[0]+" Anthology"
|
||||
metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))
|
||||
|
||||
# If cmdline authors, use those instead of those collected from the epubs
|
||||
# (allauthors kept for TOC & description gen below.
|
||||
if( len(authoropts) > 1 ):
|
||||
useauthors=[authoropts]
|
||||
else:
|
||||
useauthors=allauthors
|
||||
|
||||
usedauthors=dict()
|
||||
for authorlist in useauthors:
|
||||
for author in authorlist:
|
||||
if( not usedauthors.has_key(author) ):
|
||||
usedauthors[author]=author
|
||||
metadata.appendChild(newTag(contentdom,"dc:creator",
|
||||
attrs={"opf:role":"aut"},
|
||||
text=author))
|
||||
|
||||
metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"}))
|
||||
metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
|
||||
metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
|
||||
|
||||
if not descopt:
|
||||
# created now, but not filled in until TOC generation to save loops.
|
||||
description = newTag(contentdom,"dc:description",text="Anthology containing:\n")
|
||||
else:
|
||||
description = newTag(contentdom,"dc:description",text=descopt)
|
||||
metadata.appendChild(description)
|
||||
|
||||
package.appendChild(metadata)
|
||||
|
||||
manifest = contentdom.createElement("manifest")
|
||||
package.appendChild(manifest)
|
||||
for item in items:
|
||||
(id,href,type)=item
|
||||
manifest.appendChild(newTag(contentdom,"item",
|
||||
attrs={'id':id,
|
||||
'href':href,
|
||||
'media-type':type}))
|
||||
|
||||
spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
|
||||
package.appendChild(spine)
|
||||
for itemref in itemrefs:
|
||||
spine.appendChild(newTag(contentdom,"itemref",
|
||||
attrs={"idref":itemref,
|
||||
"linear":"yes"}))
|
||||
|
||||
## create toc.ncx file
|
||||
tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
|
||||
ncx = tocncxdom.documentElement
|
||||
ncx.setAttribute("version","2005-1")
|
||||
ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
|
||||
head = tocncxdom.createElement("head")
|
||||
ncx.appendChild(head)
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:uid", "content":uniqueid}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:depth", "content":"1"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:totalPageCount", "content":"0"}))
|
||||
head.appendChild(newTag(tocncxdom,"meta",
|
||||
attrs={"name":"dtb:maxPageNumber", "content":"0"}))
|
||||
|
||||
docTitle = tocncxdom.createElement("docTitle")
|
||||
docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt))
|
||||
ncx.appendChild(docTitle)
|
||||
|
||||
tocnavMap = tocncxdom.createElement("navMap")
|
||||
ncx.appendChild(tocnavMap)
|
||||
|
||||
## TOC navPoints can be nested, but this flattens them for
|
||||
## simplicity, plus adds a navPoint for each epub.
|
||||
booknum=0
|
||||
for navmap in navmaps:
|
||||
navpoints = navmap.getElementsByTagName("navPoint")
|
||||
if titlenavpoints:
|
||||
## Copy first navPoint of each epub, give a different id and
|
||||
## text: bookname by authorname
|
||||
newnav = navpoints[0].cloneNode(True)
|
||||
newnav.setAttribute("id","book"+newnav.getAttribute("id"))
|
||||
## For purposes of TOC titling & desc, use first book author
|
||||
newtext = newTag(tocncxdom,"text",text=booktitles[booknum]+" by "+allauthors[booknum][0])
|
||||
text = newnav.getElementsByTagName("text")[0]
|
||||
text.parentNode.replaceChild(newtext,text)
|
||||
tocnavMap.appendChild(newnav)
|
||||
|
||||
if not descopt and not fromfirst:
|
||||
description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n"))
|
||||
|
||||
for navpoint in navpoints:
|
||||
#print "navpoint:%s"%navpoint.getAttribute("id")
|
||||
if not striptitletoc or not re.match(r'(title|toc)_page',navpoint.getAttribute("id")):
|
||||
tocnavMap.appendChild(navpoint)
|
||||
booknum=booknum+1;
|
||||
|
||||
## Force strict ordering of playOrder
|
||||
playorder=1
|
||||
for navpoint in tocncxdom.getElementsByTagName("navPoint"):
|
||||
navpoint.setAttribute("playOrder","%d" % playorder)
|
||||
if( not navpoint.getAttribute("id").startswith("book") ):
|
||||
playorder = playorder + 1
|
||||
|
||||
## content.opf written now due to description being filled in
|
||||
## during TOC generation to save loops.
|
||||
outputepub.writestr("content.opf",contentdom.toxml('utf-8'))
|
||||
outputepub.writestr("toc.ncx",tocncxdom.toxml('utf-8'))
|
||||
|
||||
# declares all the files created by Windows. otherwise, when
|
||||
# it runs in appengine, windows unzips the files as 000 perms.
|
||||
for zf in outputepub.filelist:
|
||||
zf.create_system = 0
|
||||
outputepub.close()
|
||||
|
||||
return (source,filecount)
|
||||
|
||||
## Utility method for creating new tags.
|
||||
def newTag(dom,name,attrs=None,text=None):
    """Build a new element named `name` using `dom` as the node factory.

    dom   -- document object; its createElement() and createTextNode()
             methods are used to construct the nodes.
    name  -- tag name for the new element.
    attrs -- optional mapping of attribute name -> value.  Each pair is
             applied to the element with setAttribute().
    text  -- optional string.  When given it is wrapped in a text node
             and appended as a child of the element.  Only None means
             "no text": an empty string still yields an (empty) text
             node.

    Returns the newly created element.  This function does not insert
    it anywhere; the caller appends it where it belongs.
    """
    tag = dom.createElement(name)
    if attrs is not None:
        # Walk the pairs directly rather than looking every key up again.
        for attr, value in attrs.items():
            tag.setAttribute(attr, value)
    if text is not None:
        tag.appendChild(dom.createTextNode(text))
    return tag
|
||||
|
||||
if __name__ == "__main__":
    # Pass every argument after the script name to the command-line driver.
    main(sys.argv[1:])
    # Notice pointing users at the separate epubmerge project.
    print('''
The this utility has been split out into it's own project.
See: http://code.google.com/p/epubmerge/
...for a CLI epubmerge.py program and calibre plugin.
''')
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ for x in imports():
|
|||
#print x
|
||||
__class_list.append(sys.modules[x].getClass())
|
||||
|
||||
def getAdapter(config,url):
|
||||
def getAdapter(config,url,fileform=None):
|
||||
## fix up leading protocol.
|
||||
fixedurl = re.sub(r"(?i)^[htp]+[:/]+","http://",url.strip())
|
||||
if not fixedurl.startswith("http"):
|
||||
|
|
@ -89,6 +89,7 @@ def getAdapter(config,url):
|
|||
fixedurl = fixedurl.replace("http://","http://www.")
|
||||
if cls:
|
||||
adapter = cls(config,fixedurl) # raises InvalidStoryURL
|
||||
adapter.setSectionOrder(adapter.getSiteDomain(),fileform)
|
||||
return adapter
|
||||
# No adapter found.
|
||||
raise exceptions.UnknownSite( url, [cls.getSiteDomain() for cls in __class_list] )
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -133,7 +133,8 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
|||
# sometimes poorly formated desc (<p> w/o </p>) leads
|
||||
# to all labels being included.
|
||||
svalue=svalue[:svalue.find('<span class="label">')]
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -220,7 +221,7 @@ class AdAstraFanficComSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return AdAstraFanficComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return ArchiveOfOurOwnOrgAdapter
|
||||
|
|
@ -126,7 +126,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
|||
|
||||
a = metasoup.find('blockquote',{'class':'userstuff'})
|
||||
if a != None:
|
||||
self.story.setMetadata('description',a.text)
|
||||
self.setDescription(url,a.text)
|
||||
#self.story.setMetadata('description',a.text)
|
||||
|
||||
a = metasoup.find('dd',{'class':"rating tags"})
|
||||
if a != None:
|
||||
|
|
@ -216,7 +217,8 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
|||
logging.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
chapter=bs.BeautifulSoup('<div class="story"></div>')
|
||||
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr'))
|
||||
data = self._fetchUrl(url)
|
||||
soup = bs.BeautifulSoup(data,selfClosingTags=('br','hr'))
|
||||
|
||||
headnotes = soup.find('div', {'class' : "preface group"}).find('div', {'class' : "notes module"})
|
||||
if headnotes != None:
|
||||
|
|
@ -257,5 +259,5 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
|
|||
|
||||
if None == soup:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(chapter)
|
||||
|
||||
return self.utf8FromSoup(url,chapter)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# By virtue of being recent and requiring both is_adult and user/pass,
|
||||
# adapter_fanficcastletvnet.py is the best choice for learning to
|
||||
|
|
@ -218,7 +218,8 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
|
|||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -305,4 +306,4 @@ class CastleFansOrgAdapter(BaseSiteAdapter): # XXX
|
|||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ import time
|
|||
from .. import BeautifulSoup as bs
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -153,7 +153,8 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
if 'title_t' in var:
|
||||
self.story.setMetadata('title', value)
|
||||
if 'summary' in var:
|
||||
self.story.setMetadata('description', value)
|
||||
self.setDescription(url,value)
|
||||
#self.story.setMetadata('description', value)
|
||||
if 'datep' in var:
|
||||
self.story.setMetadata('datePublished',makeDate(value, '%m-%d-%y'))
|
||||
if 'dateu' in var:
|
||||
|
|
@ -270,7 +271,7 @@ class FanFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
logging.debug('div id=storytext not found. data:%s'%data)
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
|
||||
return FanFictionNetSiteAdapter
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
|
||||
def getClass():
|
||||
|
|
@ -201,7 +201,8 @@ class FicBookNetAdapter(BaseSiteAdapter):
|
|||
break
|
||||
|
||||
summary=soup.find('span', {'class' : 'urlize'})
|
||||
self.story.setMetadata('description', summary.text)
|
||||
self.setDescription(url,summary.text)
|
||||
#self.story.setMetadata('description', summary.text)
|
||||
|
||||
# grab the text for an individual chapter.
|
||||
def getChapterText(self, url):
|
||||
|
|
@ -218,4 +219,4 @@ class FicBookNetAdapter(BaseSiteAdapter):
|
|||
if None == chapter:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(chapter)
|
||||
return self.utf8FromSoup(url,chapter)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -187,7 +187,8 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
for small in storydd.findAll('small'):
|
||||
small.extract() ## removes the <small> tags, leaving only the summary.
|
||||
self.story.setMetadata('description',stripHTML(storydd))
|
||||
self.setDescription(url,storydd)
|
||||
#self.story.setMetadata('description',stripHTML(storydd))
|
||||
|
||||
return
|
||||
|
||||
|
|
@ -223,7 +224,7 @@ class FictionAlleyOrgSiteAdapter(BaseSiteAdapter):
|
|||
if not data or not text:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(text)
|
||||
return self.utf8FromSoup(url,text)
|
||||
|
||||
def getClass():
|
||||
return FictionAlleyOrgSiteAdapter
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
|
|||
from .. import exceptions as exceptions
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class FicwadComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -124,7 +124,8 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
# description
|
||||
storydiv = soup.find("div",{"id":"story"})
|
||||
self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
|
||||
self.setDescription(url,storydiv.find("blockquote",{'class':'summary'}).p.string)
|
||||
#self.story.setMetadata('description', storydiv.find("blockquote",{'class':'summary'}).p.string)
|
||||
|
||||
# most of the meta data is here:
|
||||
metap = storydiv.find("p",{"class":"meta"})
|
||||
|
|
@ -209,7 +210,7 @@ class FicwadComSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return FicwadComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return FimFictionNetSiteAdapter
|
||||
|
|
@ -141,7 +141,13 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
description_soup.find('a', {"class":"more"}).extract()
|
||||
except:
|
||||
pass
|
||||
self.story.setMetadata('description', description_soup.text)
|
||||
|
||||
# fimfic is the first site with an explicit cover image.
|
||||
story_img = soup.find('img',{'class':'story_image'})
|
||||
if self.getConfig('include_images') and story_img:
|
||||
self.story.addImgUrl(self,self.url,story_img['src'],self._fetchUrlRaw,cover=True)
|
||||
self.setDescription(self.url,description_soup.text)
|
||||
#self.story.setMetadata('description', description_soup.text)
|
||||
|
||||
# Unfortunately, nowhere on the page is the year mentioned.
|
||||
# Best effort to deal with this:
|
||||
|
|
@ -171,5 +177,5 @@ class FimFictionNetSiteAdapter(BaseSiteAdapter):
|
|||
soup = bs.BeautifulSoup(self._fetchUrl(url),selfClosingTags=('br','hr')).find('div', {'id' : 'chapter_container'})
|
||||
if soup == None:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
return utf8FromSoup(soup)
|
||||
return self.utf8FromSoup(url,soup)
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return GayAuthorsAdapter
|
||||
|
|
@ -162,7 +162,8 @@ class GayAuthorsAdapter(BaseSiteAdapter):
|
|||
self.story.setMetadata('rating',rating.text)
|
||||
|
||||
summary = msoup.find('span', {'itemprop' : 'description'})
|
||||
self.story.setMetadata('description',summary.text)
|
||||
self.setDescription(self.url,summary.text)
|
||||
#self.story.setMetadata('description',summary.text)
|
||||
|
||||
|
||||
stats = msoup.find('dl',{'class':'info'})
|
||||
|
|
@ -200,4 +201,4 @@ class GayAuthorsAdapter(BaseSiteAdapter):
|
|||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -125,7 +125,8 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
|||
|
||||
## Finding the metadata is a bit of a pain. Desc is the only thing this color.
|
||||
desctable= soup.find('table',{'bgcolor':'#f0e8e8'})
|
||||
self.story.setMetadata('description',stripHTML(desctable))
|
||||
self.setDescription(url,desctable)
|
||||
#self.story.setMetadata('description',stripHTML(desctable))
|
||||
|
||||
## Finding the metadata is a bit of a pain. Most of the meta
|
||||
## data is in a center.table without a bgcolor.
|
||||
|
|
@ -193,7 +194,7 @@ class HarryPotterFanFictionComSiteAdapter(BaseSiteAdapter):
|
|||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
|
||||
return HarryPotterFanFictionComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -174,7 +174,8 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
# Summary: ....
|
||||
m = re.match(r".*?Summary: (.*)$",metastr)
|
||||
if m:
|
||||
self.story.setMetadata('description', m.group(1))
|
||||
self.setDescription(url, m.group(1))
|
||||
#self.story.setMetadata('description', m.group(1))
|
||||
|
||||
# completed
|
||||
m = re.match(r".*?Status: Completed.*?",metastr)
|
||||
|
|
@ -210,7 +211,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
del div['style']
|
||||
del div['align']
|
||||
anchor.name='div'
|
||||
return utf8FromSoup(anchor)
|
||||
return self.utf8FromSoup(url,anchor)
|
||||
|
||||
else:
|
||||
logging.debug('Using kludgey text find for older mediaminer story.')
|
||||
|
|
@ -226,7 +227,7 @@ class MediaMinerOrgSiteAdapter(BaseSiteAdapter):
|
|||
soup.findAll('table',{'class':'tbbrdr'}):
|
||||
tag.extract() # remove tag from soup.
|
||||
|
||||
return utf8FromSoup(soup)
|
||||
return self.utf8FromSoup(url,soup)
|
||||
|
||||
|
||||
def getClass():
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -131,7 +131,8 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
|
|||
while not defaultGetattr(value,'class') == 'listbox':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -209,7 +210,7 @@ class PotionsAndSnitchesNetSiteAdapter(BaseSiteAdapter):
|
|||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
|
||||
return PotionsAndSnitchesNetSiteAdapter
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
# This function is called by the downloader in all adapter_*.py files
|
||||
# in this dir to register the adapter class. So it needs to be
|
||||
|
|
@ -227,7 +227,8 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
|
|||
|
||||
if part.startswith("Summary:"):
|
||||
part = part[part.find(':')+1:]
|
||||
self.story.setMetadata('description',part)
|
||||
self.setDescription(url,part)
|
||||
#self.story.setMetadata('description',part)
|
||||
|
||||
# want to get the next tr of the table.
|
||||
#print("%s"%titlea.parent.parent.findNextSibling('tr'))
|
||||
|
|
@ -295,4 +296,4 @@ class SiyeCoUkAdapter(BaseSiteAdapter): # XXX
|
|||
if None == story:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(story)
|
||||
return self.utf8FromSoup(url,story)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -164,7 +164,8 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
|
|||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -238,7 +239,7 @@ class TenhawkPresentsComSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return TenhawkPresentsComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import logging
|
|||
from .. import BeautifulSoup as bs
|
||||
from .. import exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TestSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -78,7 +78,6 @@ class TestSiteAdapter(BaseSiteAdapter):
|
|||
Some more longer description. "I suck at summaries!" "Better than it sounds!" "My first fic"
|
||||
''')
|
||||
self.story.setMetadata('datePublished',makeDate("1975-03-15","%Y-%m-%d"))
|
||||
self.story.setMetadata('dateCreated',datetime.datetime.now())
|
||||
if self.story.getMetadata('storyId') == '669':
|
||||
self.story.setMetadata('dateUpdated',datetime.datetime.now())
|
||||
else:
|
||||
|
|
@ -127,7 +126,7 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
|
|||
('Chapter 4',self.url+"&chapter=5"),
|
||||
('Chapter 5',self.url+"&chapter=6"),
|
||||
('Chapter 6',self.url+"&chapter=6"),
|
||||
('Chapter 7',self.url+"&chapter=6"),
|
||||
# ('Chapter 7',self.url+"&chapter=6"),
|
||||
# ('Chapter 8',self.url+"&chapter=6"),
|
||||
# ('Chapter 9',self.url+"&chapter=6"),
|
||||
# ('Chapter 0',self.url+"&chapter=6"),
|
||||
|
|
@ -178,10 +177,12 @@ Some more longer description. "I suck at summaries!" "Better than it sounds!"
|
|||
else:
|
||||
text=u'''
|
||||
<div>
|
||||
<h3>Chapter</h3>
|
||||
<h3>Chapter title from site</h3>
|
||||
<p><center>Centered text</center></p>
|
||||
<p>Lorem '''+self.crazystring+''' <i>italics</i>, <b>bold</b>, <u>underline</u> consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>
|
||||
br breaks<br><br>
|
||||
|
||||
<a href="http://code.google.com/p/fanficdownloader/wiki/FanFictionDownLoaderPluginWithReadingList" title="Tilt-a-Whirl by Jim & Sarah, on Flickr"><img src="http://i.imgur.com/bo8eD.png"></a><br/>
|
||||
br breaks<br><br>
|
||||
<hr>
|
||||
horizontal rules
|
||||
|
|
@ -191,7 +192,7 @@ horizontal rules
|
|||
</div>
|
||||
'''
|
||||
soup = bs.BeautifulStoneSoup(text,selfClosingTags=('br','hr')) # otherwise soup eats the br/hr tags.
|
||||
return utf8FromSoup(soup)
|
||||
return self.utf8FromSoup(url,soup)
|
||||
|
||||
def getClass():
|
||||
return TestSiteAdapter
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -166,7 +166,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
|
|||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -245,7 +245,7 @@ class TheWritersCoffeeShopComSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return TheWritersCoffeeShopComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -127,6 +127,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
else:
|
||||
raise e
|
||||
|
||||
descurl = url
|
||||
|
||||
if "<h2>Story Not Found</h2>" in data:
|
||||
raise exceptions.StoryDoesNotExist(url)
|
||||
|
||||
|
|
@ -154,12 +156,14 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
# going to pull part of the meta data from author list page.
|
||||
logging.debug("**AUTHOR** URL: "+self.story.getMetadata('authorUrl'))
|
||||
authordata = self._fetchUrl(self.story.getMetadata('authorUrl'))
|
||||
descurl=self.story.getMetadata('authorUrl')
|
||||
authorsoup = bs.BeautifulSoup(authordata)
|
||||
# author can have several pages, scan until we find it.
|
||||
while( not authorsoup.find('a', href=re.compile(r"^/Story-"+self.story.getMetadata('storyId'))) ):
|
||||
nextpage = 'http://'+self.host+authorsoup.find('a', {'class':'arrowf'})['href']
|
||||
logging.debug("**AUTHOR** nextpage URL: "+nextpage)
|
||||
authordata = self._fetchUrl(nextpage)
|
||||
descurl=nextpage
|
||||
authorsoup = bs.BeautifulSoup(authordata)
|
||||
except urllib2.HTTPError, e:
|
||||
if e.code == 404:
|
||||
|
|
@ -168,7 +172,8 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
raise e
|
||||
|
||||
storydiv = authorsoup.find('div', {'id':'st'+self.story.getMetadata('storyId'), 'class':re.compile(r"storylistitem")})
|
||||
self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
|
||||
self.setDescription(descurl,storydiv.find('div',{'class':'storydesc'}))
|
||||
#self.story.setMetadata('description',stripHTML(storydiv.find('div',{'class':'storydesc'})))
|
||||
self.story.setMetadata('title',stripHTML(storydiv.find('a',{'class':'storylink'})))
|
||||
|
||||
verticaltable = soup.find('table', {'class':'verticaltable'})
|
||||
|
|
@ -238,7 +243,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
|
|||
div.find('h3').extract()
|
||||
except:
|
||||
pass
|
||||
return utf8FromSoup(div)
|
||||
return self.utf8FromSoup(url,div)
|
||||
|
||||
def getClass():
|
||||
return TwistingTheHellmouthSiteAdapter
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TwilightedNetSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -162,7 +162,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
|
|||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -243,7 +243,7 @@ class TwilightedNetSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return TwilightedNetSiteAdapter
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ from .. import BeautifulSoup as bs
|
|||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class TwiwriteNetSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -169,7 +169,8 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
|
|||
while not defaultGetattr(value,'class') == 'label':
|
||||
svalue += str(value)
|
||||
value = value.nextSibling
|
||||
self.story.setMetadata('description',stripHTML(svalue))
|
||||
self.setDescription(url,svalue)
|
||||
#self.story.setMetadata('description',stripHTML(svalue))
|
||||
|
||||
if 'Rated' in label:
|
||||
self.story.setMetadata('rating', value)
|
||||
|
|
@ -255,7 +256,7 @@ class TwiwriteNetSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return TwiwriteNetSiteAdapter
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import urllib2
|
|||
from .. import BeautifulSoup as bs
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
from base_adapter import BaseSiteAdapter, utf8FromSoup, makeDate
|
||||
from base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
class WhoficComSiteAdapter(BaseSiteAdapter):
|
||||
|
||||
|
|
@ -120,9 +120,10 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
|
|||
# link instead to find the appropriate metadata.
|
||||
a = soup.find('a', href=re.compile(r'reviews.php\?sid='+self.story.getMetadata('storyId')))
|
||||
metadata = a.findParent('td')
|
||||
metadatachunks = utf8FromSoup(metadata).split('<br />')
|
||||
metadatachunks = self.utf8FromSoup(None,metadata).split('<br />')
|
||||
# process metadata for this story.
|
||||
self.story.setMetadata('description', metadatachunks[1])
|
||||
self.setDescription(url,metadatachunks[1])
|
||||
#self.story.setMetadata('description', metadatachunks[1])
|
||||
|
||||
# First line of the stuff with ' - ' separators
|
||||
moremeta = metadatachunks[2]
|
||||
|
|
@ -224,7 +225,7 @@ class WhoficComSiteAdapter(BaseSiteAdapter):
|
|||
if None == span:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return utf8FromSoup(span)
|
||||
return self.utf8FromSoup(url,span)
|
||||
|
||||
def getClass():
|
||||
return WhoficComSiteAdapter
|
||||
|
|
|
|||
|
|
@ -22,6 +22,10 @@ import logging
|
|||
import urllib
|
||||
import urllib2 as u2
|
||||
import urlparse as up
|
||||
from functools import partial
|
||||
|
||||
from .. import BeautifulSoup as bs
|
||||
from ..htmlcleanup import stripHTML
|
||||
|
||||
try:
|
||||
from google.appengine.api import apiproxy_stub_map
|
||||
|
|
@ -66,8 +70,9 @@ class BaseSiteAdapter(Configurable):
|
|||
def __init__(self, config, url):
|
||||
self.config = config
|
||||
Configurable.__init__(self, config)
|
||||
self.addConfigSection(self.getSiteDomain())
|
||||
self.addConfigSection("overrides")
|
||||
self.setSectionOrder(self.getSiteDomain())
|
||||
# self.addConfigSection(self.getSiteDomain())
|
||||
# self.addConfigSection("overrides")
|
||||
|
||||
self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
|
||||
self.password = ""
|
||||
|
|
@ -82,6 +87,8 @@ class BaseSiteAdapter(Configurable):
|
|||
self.chapterUrls = [] # tuples of (chapter title,chapter url)
|
||||
self.chapterFirst = None
|
||||
self.chapterLast = None
|
||||
self.oldchapters = None
|
||||
self.oldimgs = None
|
||||
## order of preference for decoding.
|
||||
self.decode = ["utf8",
|
||||
"Windows-1252"] # 1252 is a superset of
|
||||
|
|
@ -150,6 +157,12 @@ class BaseSiteAdapter(Configurable):
|
|||
headers=headers)
|
||||
return self._decode(self.opener.open(req).read())
|
||||
|
||||
def _fetchUrlRaw(self, url, parameters=None):
|
||||
if parameters != None:
|
||||
return self.opener.open(url,urllib.urlencode(parameters)).read()
|
||||
else:
|
||||
return self.opener.open(url).read()
|
||||
|
||||
# parameters is a dict()
|
||||
def _fetchUrl(self, url, parameters=None):
|
||||
if self.getConfig('slow_down_sleep_time'):
|
||||
|
|
@ -159,10 +172,7 @@ class BaseSiteAdapter(Configurable):
|
|||
for sleeptime in [0, 0.5, 4, 9]:
|
||||
time.sleep(sleeptime)
|
||||
try:
|
||||
if parameters:
|
||||
return self._decode(self.opener.open(url,urllib.urlencode(parameters)).read())
|
||||
else:
|
||||
return self._decode(self.opener.open(url).read())
|
||||
return self._decode(self._fetchUrlRaw(url,parameters))
|
||||
except Exception, e:
|
||||
excpt=e
|
||||
logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
|
||||
|
|
@ -182,15 +192,32 @@ class BaseSiteAdapter(Configurable):
|
|||
def getStory(self):
|
||||
if not self.storyDone:
|
||||
self.getStoryMetadataOnly()
|
||||
|
||||
for index, (title,url) in enumerate(self.chapterUrls):
|
||||
if (self.chapterFirst!=None and index < self.chapterFirst) or \
|
||||
(self.chapterLast!=None and index > self.chapterLast):
|
||||
self.story.addChapter(removeEntities(title),
|
||||
None)
|
||||
else:
|
||||
if self.oldchapters and index < len(self.oldchapters):
|
||||
data = self.utf8FromSoup(None,
|
||||
self.oldchapters[index],
|
||||
partial(cachedfetch,self._fetchUrlRaw,self.oldimgs))
|
||||
else:
|
||||
data = self.getChapterText(url)
|
||||
self.story.addChapter(removeEntities(title),
|
||||
removeEntities(self.getChapterText(url)))
|
||||
removeEntities(data))
|
||||
self.storyDone = True
|
||||
|
||||
# include image, but no cover from story, add default_cover_image cover.
|
||||
if self.getConfig('include_images') and \
|
||||
not self.story.cover and \
|
||||
self.getConfig('default_cover_image'):
|
||||
self.story.addImgUrl(self,
|
||||
None,
|
||||
self.getConfig('default_cover_image'),
|
||||
self._fetchUrlRaw,
|
||||
cover=True)
|
||||
return self.story
|
||||
|
||||
def getStoryMetadataOnly(self):
|
||||
|
|
@ -235,17 +262,74 @@ class BaseSiteAdapter(Configurable):
|
|||
if self.getConfig('collect_series'):
|
||||
self.story.setMetadata('series','%s [%s]'%(name, num))
|
||||
|
||||
def setDescription(self,url,svalue):
    """Store the story summary in the 'description' metadata entry.

    url    -- page the summary came from; passed on to utf8FromSoup.
    svalue -- the summary, either as a string of HTML or as an already
              parsed soup object.

    With the keep_summary_html setting on, the summary is kept as
    cleaned-up HTML (strings are parsed into soup first).  Otherwise the
    markup is stripped and only the text is stored.
    """
    if not self.getConfig('keep_summary_html'):
        self.story.setMetadata('description',stripHTML(svalue))
        return

    if isinstance(svalue,(str,unicode)):
        svalue = bs.BeautifulSoup(svalue)
    cleaned = self.utf8FromSoup(url,svalue)
    self.story.setMetadata('description',cleaned)
|
||||
|
||||
# this gives us a unicode object, not just a string containing bytes.
# (I gave soup a unicode string, you'd think it could give it back...)
def utf8FromSoup(self,url,soup,fetch=None):
    """Clean up a soup fragment and return it as a unicode string.

    url   -- URL of the page the fragment came from; handed to
             story.addImgUrl along with each image src.
    soup  -- the soup tag to clean.  It is modified in place.
    fetch -- callable used to download image data; defaults to
             self._fetchUrlRaw.

    All tag attributes except href and name are stripped (src, alt and
    origsrc are kept too when include_images is set), <u> and <center>
    are rewritten as classed span/div, tags containing only whitespace
    are dropped, and <body> wrappers are removed from the result.
    """
    if not fetch:
        fetch=self._fetchUrlRaw

    acceptable_attributes = ['href','name']
    if self.getConfig('include_images'):
        acceptable_attributes.extend(('src','alt','origsrc'))
        for img in soup.findAll('img'):
            src = img.get('src')
            if not src:
                # an <img> without a src has nothing to download; leave
                # it alone rather than failing the chapter on a KeyError.
                continue
            # remember where the image originally came from.
            img['origsrc']=src
            img['src']=self.story.addImgUrl(self,url,src,fetch)

    # the top tag itself isn't returned by findAll, so strip it separately.
    for attr in list(soup._getAttrMap().keys()):
        if attr not in acceptable_attributes:
            del soup[attr] ## strip all tag attributes except the acceptable ones

    for t in soup.findAll(recursive=True):
        for attr in list(t._getAttrMap().keys()):
            if attr not in acceptable_attributes:
                del t[attr] ## strip all tag attributes except the acceptable ones

        # these are not acceptable strict XHTML.  But we do already have
        # CSS classes of the same names defined in constants.py
        # (compare with ==: "in ('u')" is a substring test on a string,
        # not membership in a tuple.)
        if t.name == 'u':
            t['class']=t.name
            t.name='span'
        if t.name == 'center':
            t['class']=t.name
            t.name='div'
        # removes paired, but empty tags.
        if t.string != None and len(t.string.strip()) == 0 :
            t.extract()
    # Don't want body tags in chapter html--writers add them.
    return re.sub(r"</?body>\r?\n?","",soup.__str__('utf8').decode('utf-8'))
|
||||
|
||||
fullmon = {"January":"01", "February":"02", "March":"03", "April":"04", "May":"05",
|
||||
"June":"06","July":"07", "August":"08", "September":"09", "October":"10",
|
||||
"November":"11", "December":"12" }
|
||||
|
||||
def cachedfetch(realfetch,cache,url):
    """Return the data for url from cache, falling back to realfetch(url).

    realfetch -- callable taking a url and returning its data.
    cache     -- dict of url -> data, or None when there is no cache.
    url       -- the url wanted.

    Meant to be bound with functools.partial so the result can be used
    anywhere a plain fetch(url) callable is expected.
    """
    # cache may be None (no previously saved images); treat as a miss.
    if cache and url in cache:
        logging.debug("cachedfetch: cache hit for %s", url)
        return cache[url]
    return realfetch(url)
|
||||
|
||||
|
||||
def makeDate(string,format):
|
||||
# Surprise! Abstracting this turned out to be more useful than
|
||||
# just saving bytes.
|
||||
|
||||
# fudge english month names for people whose locale is set to
|
||||
# non-english. All our current sites date in english, even if
|
||||
# there's non-english content.
|
||||
# there's non-english content. -- ficbook.net now makes that a
|
||||
# lie. It has to do something even more complicated to get
|
||||
# Russian month names correct everywhere.
|
||||
do_abbrev = "%b" in format
|
||||
|
||||
if "%B" in format or do_abbrev:
|
||||
|
|
@ -259,24 +343,3 @@ def makeDate(string,format):
|
|||
|
||||
return datetime.datetime.strptime(string,format)
|
||||
|
||||
acceptable_attributes = ['href','name']
|
||||
|
||||
# this gives us a unicode object, not just a string containing bytes.
|
||||
# (I gave soup a unicode string, you'd think it could give it back...)
|
||||
def utf8FromSoup(soup):
|
||||
for t in soup.findAll(recursive=True):
|
||||
for attr in t._getAttrMap().keys():
|
||||
if attr not in acceptable_attributes:
|
||||
del t[attr] ## strip all tag attributes except href and name
|
||||
# these are not acceptable strict XHTML. But we do already have
|
||||
# CSS classes of the same names defined in constants.py
|
||||
if t.name in ('u'):
|
||||
t['class']=t.name
|
||||
t.name='span'
|
||||
if t.name in ('center'):
|
||||
t['class']=t.name
|
||||
t.name='div'
|
||||
# removes paired, but empty tags.
|
||||
if t.string != None and len(t.string.strip()) == 0 :
|
||||
t.extract()
|
||||
return soup.__str__('utf8').decode('utf-8')
|
||||
|
|
|
|||
|
|
@ -21,16 +21,21 @@ import ConfigParser
|
|||
# inherit from Configurable. The config file(s) uses ini format:
|
||||
# [sections] with key:value settings.
|
||||
#
|
||||
# There's a [defaults] section which is overriden by the writer's
|
||||
# section [epub], which is overriden by the adapter's section for each
|
||||
# site.
|
||||
# writer does [defaults], [www.whofic.com], [epub], [www.whofic.com:epub], [overrides]
|
||||
#
|
||||
# Until a writer is created, the adapter only has [defaults], [www.whofic.com], [overrides]
|
||||
#
|
||||
# [defaults]
|
||||
# titlepage_entries: category,genre, status
|
||||
# [epub]
|
||||
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
|
||||
# [www.whofic.com]
|
||||
# titlepage_entries: category,genre, status,dateUpdated,rating
|
||||
# [epub]
|
||||
# titlepage_entries: category,genre, status,datePublished,dateUpdated,dateCreated
|
||||
# [www.whofic.com:epub]
|
||||
# titlepage_entries: category,genre, status,datePublished
|
||||
# [overrides]
|
||||
# titlepage_entries: category
|
||||
|
||||
|
||||
class Configurable(object):
|
||||
|
||||
|
|
@ -38,6 +43,14 @@ class Configurable(object):
|
|||
self.config = config
|
||||
self.sectionslist = ['defaults']
|
||||
|
||||
def setSectionOrder(self,site,fileform=None):
    """Rebuild self.sectionslist for the given site and output format.

    site     -- config section name for the site, e.g. www.whofic.com
    fileform -- optional output format name, e.g. epub

    The list is reset to ['defaults'] and the remaining sections are
    added through addConfigSection in this order: site, fileform,
    site:fileform (the last two only when fileform is given), overrides.
    """
    ordered = [site]
    if fileform:
        ordered.extend([fileform, site+":"+fileform])
    ordered.append("overrides")

    self.sectionslist = ['defaults']
    for section in ordered:
        self.addConfigSection(section)
|
||||
|
||||
def addConfigSection(self,section):
|
||||
self.sectionslist.insert(0,section)
|
||||
|
||||
|
|
|
|||
86
fanficdownloader/epubutils.py
Normal file
86
fanficdownloader/epubutils.py
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
#!/usr/bin/env python
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import (unicode_literals, division, absolute_import,
|
||||
print_function)
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Jim Miller'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, os, traceback
|
||||
from zipfile import ZipFile
|
||||
from xml.dom.minidom import parseString
|
||||
|
||||
from . import BeautifulSoup as bs
|
||||
|
||||
def get_dcsource(inputio):
    """Return only the dc:source value of the epub readable from inputio.

    Chapter counting and chapter parsing are both switched off.
    """
    update_data = get_update_data(inputio,getfilecount=False,getsoups=False)
    return update_data[0]
|
||||
|
||||
def get_dcsource_chaptercount(inputio):
    """Return (source, filecount) for the epub readable from inputio.

    Chapter files are counted but not parsed.
    """
    update_data = get_update_data(inputio,getfilecount=True,getsoups=False)
    source, filecount = update_data[0], update_data[1]
    return (source,filecount)
|
||||
|
||||
def get_update_data(inputio,
                    getfilecount=True,
                    getsoups=True):
    """Collect what is needed to update an existing epub.

    inputio      -- file name or file-like object of the epub.
    getfilecount -- when true, count the chapter files listed in the
                    manifest (file####.xhtml or chapter#.html).
    getsoups     -- when true (and getfilecount is true), also parse each
                    chapter and collect the images it references.

    Returns (source, filecount, soups, images):
      source    -- dc:source text from the .opf metadata, utf-8 encoded,
                   or None when there isn't one.
      filecount -- number of chapter files found.
      soups     -- one parsed <body> per chapter, with its first <h3>
                   removed when it has one.
      images    -- dict of original image src -> image data.
    """
    epub = ZipFile(inputio, 'r')
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))
        firstmetadom = contentdom.getElementsByTagName("metadata")[0]
        try:
            source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
        except (IndexError, AttributeError):
            # no dc:source element, or it is empty.
            source=None

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        filecount = 0
        soups = [] # list of xhtml blocks
        images = {} # dict() origsrc->data
        if getfilecount:
            # spin through the manifest--only place there are item tags.
            for item in contentdom.getElementsByTagName("item"):
                # First, count the 'chapter' files.  FFDL uses file0000.xhtml,
                # but can also update epubs downloaded from Twisting the
                # Hellmouth, which uses chapter0.html.
                if item.getAttribute("media-type") != "application/xhtml+xml":
                    continue
                href=relpath+item.getAttribute("href")
                print("---- item href:%s path part: %s"%(href,get_path_part(href)))
                if not re.match(r'.*/(file|chapter)\d+\.x?html',href):
                    continue

                if getsoups:
                    soup = bs.BeautifulSoup(epub.read(href).decode("utf-8"))
                    for img in soup.findAll('img'):
                        # set before the try so the error report below can
                        # always print them, whichever lookup failed.
                        newsrc = None
                        origsrc = None
                        try:
                            newsrc=get_path_part(href)+img['src']
                            # remove all .. and the path part above it, if present.
                            # Mostly for epubs edited by Sigil.
                            newsrc = re.sub(r"([^/]+/\.\./)","",newsrc)
                            origsrc=img['origsrc']
                            data = epub.read(newsrc)
                            images[origsrc] = data
                            img['src'] = img['origsrc']
                        except Exception as e:
                            print("Image %s not found!\n(originally:%s)"%(newsrc,origsrc))
                            print("Exception: %s"%(e,))
                            traceback.print_exc()
                    soup = soup.find('body')
                    # drop the chapter heading, if the chapter has one.
                    heading = soup.find('h3')
                    if heading is not None:
                        heading.extract()
                    soups.append(soup)

                filecount+=1
    finally:
        # always release the zip file, even when the epub is malformed.
        epub.close()

    for k in images.keys():
        print("\torigsrc:%s\n\tData len:%s\n"%(k,len(images[k])))
    return (source,filecount,soups,images)

def get_path_part(n):
    """Return the directory part of path n with a trailing '/'.

    Returns '' when n has no directory part.
    """
    relpath = os.path.dirname(n)
    if( len(relpath) > 0 ):
        relpath=relpath+"/"
    return relpath
|
||||
|
|
@ -16,9 +16,127 @@
|
|||
#
|
||||
|
||||
import os, re
|
||||
import urlparse
|
||||
from math import floor
|
||||
|
||||
from htmlcleanup import conditionalRemoveEntities, removeAllEntities
|
||||
|
||||
# Create convert_image method depending on which graphics lib we can
|
||||
# load. Preferred: calibre, PIL, none
|
||||
try:
    from calibre.utils.magick import Image

    def convert_image(url,data,sizes,grayscale):
        """Convert image data to a jpeg that fits sizes, using calibre.

        url       -- where the image came from (not used here).
        data      -- the raw image data.
        sizes     -- (max width, max height).
        grayscale -- when true, convert the image to grayscale.

        Returns (data, 'jpg', 'image/jpeg').  The data is re-exported
        only if it was scaled, made grayscale, or wasn't a jpeg already.
        """
        export = False
        img = Image()
        img.load(data)

        owidth, oheight = img.size
        nwidth, nheight = sizes
        scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
        if scaled:
            img.size = (nwidth, nheight)
            export = True

        if grayscale and img.type != "GrayscaleType":
            img.type = "GrayscaleType"
            export = True

        if normalize_format_name(img.format) != "jpg":
            export = True

        if export:
            return (img.export('JPG'),'jpg','image/jpeg')
        else:
            print("image used unchanged")
            return (data,'jpg','image/jpeg')

except:

    # No calibre routines, try for PIL for CLI.
    try:
        import Image
        from StringIO import StringIO
        def convert_image(url,data,sizes,grayscale):
            """Convert image data to a jpeg that fits sizes, using PIL.

            Same arguments and return value as the calibre version.
            """
            export = False
            img = Image.open(StringIO(data))

            owidth, oheight = img.size
            nwidth, nheight = sizes
            scaled, nwidth, nheight = fit_image(owidth, oheight, nwidth, nheight)
            if scaled:
                img = img.resize((nwidth, nheight),Image.ANTIALIAS)
                export = True

            if grayscale and img.mode != "L":
                img = img.convert("L")
                export = True

            if normalize_format_name(img.format) != "jpg":
                export = True

            if export:
                outsio = StringIO()
                img.save(outsio,'JPEG')
                return (outsio.getvalue(),'jpg','image/jpeg')
            else:
                print("image used unchanged")
                return (data,'jpg','image/jpeg')

    except:

        # No calibre or PIL, simple pass through with mimetype.
        imagetypes = {
            'jpg':'image/jpeg',
            'jpeg':'image/jpeg',
            'png':'image/png',
            'gif':'image/gif',
            'svg':'image/svg+xml',
            }

        def convert_image(url,data,sizes,grayscale):
            """Pass image data through unchanged, typed by its URL.

            sizes and grayscale are ignored.  Returns (data, extension,
            mimetype); raises KeyError when the extension isn't one of
            those listed in imagetypes.
            """
            # look only at the path part of the URL so a query string
            # or fragment (a.jpg?size=large) doesn't hide the extension.
            path = urlparse.urlparse(url).path
            ext=path[path.rfind('.')+1:].lower()
            return (data,ext,imagetypes[ext])
|
||||
|
||||
def normalize_format_name(fmt):
    """Return fmt lower-cased, with 'jpeg' spelled 'jpg'.

    Empty or None values are returned unchanged.
    """
    if not fmt:
        return fmt
    lowered = fmt.lower()
    return 'jpg' if lowered == 'jpeg' else lowered
|
||||
|
||||
def fit_image(width, height, pwidth, pheight):
    '''
    Fit image in box of width pwidth and height pheight.
    @param width: Width of image
    @param height: Height of image
    @param pwidth: Width of box
    @param pheight: Height of box
    @return: scaled, new_width, new_height. scaled is True iff the image
             did not fit in the box and so had to be shrunk. A shrunk
             image is never given a dimension smaller than 1.
    '''
    scaled = height > pheight or width > pwidth
    if height > pheight:
        corrf = pheight/float(height)
        width, height = floor(corrf*width), pheight
    if width > pwidth:
        corrf = pwidth/float(width)
        width, height = pwidth, floor(corrf*height)
    if height > pheight:
        corrf = pheight/float(height)
        width, height = floor(corrf*width), pheight

    if scaled:
        # a very long or very tall image can have its short side floored
        # to 0, which is not a size an image can be resized to.
        width, height = max(width, 1), max(height, 1)

    return scaled, int(width), int(height)
|
||||
|
||||
try:
|
||||
# doesn't really matter what, just checking for appengine.
|
||||
from google.appengine.api import apiproxy_stub_map
|
||||
|
||||
is_appengine = True
|
||||
except:
|
||||
is_appengine = False
|
||||
|
||||
|
||||
# The list comes from ffnet, the only multi-language site we support
|
||||
# at the time of writing. Values are taken largely from pycountry,
|
||||
# but with some corrections and guesses.
|
||||
|
|
@ -72,7 +190,10 @@ class Story:
|
|||
self.metadata = {'version':'4.3'}
|
||||
self.replacements = []
|
||||
self.chapters = [] # chapters will be tuples of (title,html)
|
||||
self.imgurls = []
|
||||
self.imgtuples = []
|
||||
self.listables = {} # some items (extratags, category, warnings & genres) are also kept as lists.
|
||||
self.cover=None
|
||||
|
||||
def setMetadata(self, key, value):
|
||||
## still keeps < < and &
|
||||
|
|
@ -153,6 +274,90 @@ class Story:
|
|||
def getChapters(self):
|
||||
"Chapters will be tuples of (title,html)"
|
||||
return self.chapters
|
||||
|
||||
# pass fetch in from adapter in case we need the cookies collected
# as well as it's a base_story class method.
def addImgUrl(self,configurable,parenturl,url,fetch,cover=False):
    """Download (once) the image at url and register it with the story.

    configurable -- object providing getConfig()/getConfigList(); read
                    for image_max_size, grayscale_images and
                    make_firstimage_cover.
    parenturl    -- URL of the page the image was found on, used to
                    resolve relative image URLs; None means use url as is.
    url          -- the image URL as found in the page.
    fetch        -- callable taking the image URL and returning its data.
    cover        -- True to make this image the explicit cover.

    Returns the path the image will have inside the book
    ("images/..."), or None under appengine, where images are skipped.
    """

    # appengine (web version) isn't allowed to do images--just
    # gets too big too fast and breaks things.
    if is_appengine:
        return

    if url.startswith("http") or url.startswith("file") or parenturl is None:
        imgurl = url
    else:
        # resolve against the page the image was found on.  urljoin
        # handles both /rooted and relative paths; simply appending a
        # relative url to the parent's path gives things like
        # /viewstory.phpimages/pic.jpg.
        imgurl = urlparse.urljoin(parenturl,url)

    # This version, prefixing the images with the creation
    # timestamp, still allows for dup images to be detected and
    # not dup'ed in a single download.  And it prevents 0.jpg from
    # earlier update being overwritten by the first image in newer
    # chapter.  It does not, however, prevent dup copies of the
    # same image being d/l'ed and saved in different updates.  A
    # bit of corner case inefficiency I can live with rather than
    # scanning all the pre-existing files on update.  oldsrc is
    # being saved on img tags just in case, however.
    prefix='ffdl' #self.getMetadataRaw('dateCreated').strftime("%Y%m%d%H%M%S")

    if imgurl not in self.imgurls:
        sizes = [ int(x) for x in configurable.getConfigList('image_max_size') ]
        (data,ext,mime) = convert_image(imgurl,
                                        fetch(imgurl),
                                        sizes,
                                        configurable.getConfig('grayscale_images'))
        # explicit cover, make the first image.
        if cover:
            if len(self.imgtuples) > 0 and 'cover' in self.imgtuples[0]['newsrc']:
                # remove existing cover, if there is one.
                del self.imgurls[0]
                del self.imgtuples[0]
            self.imgurls.insert(0,imgurl)
            newsrc = "images/cover.%s"%ext
            self.cover=newsrc
            self.imgtuples.insert(0,{'newsrc':newsrc,'mime':mime,'data':data})
        else:
            self.imgurls.append(imgurl)
            # First image, copy not link because calibre will replace with its cover.
            if (len(self.imgurls)==1 and configurable.getConfig('make_firstimage_cover')):
                newsrc = "images/cover.%s"%ext
                self.cover=newsrc
                self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})
                # second entry for the same url keeps imgurls and
                # imgtuples the same length.
                self.imgurls.append(imgurl)

            newsrc = "images/%s-%s.%s"%(
                prefix,
                self.imgurls.index(imgurl),
                ext)
            self.imgtuples.append({'newsrc':newsrc,'mime':mime,'data':data})

        print("\nimgurl:%s\nnewsrc:%s\nimage size:%d\n"%(imgurl,newsrc,len(data)))
    else:
        # already have this image; reuse it.
        newsrc = self.imgtuples[self.imgurls.index(imgurl)]['newsrc']

    return newsrc
|
||||
|
||||
def getImgUrls(self):
    """Return a new list with the imgtuples entry for each collected image url.

    Entries are the dicts stored by addImgUrl, in the same order as
    self.imgurls.
    """
    return [ self.imgtuples[position] for position, _ in enumerate(self.imgurls) ]
|
||||
|
||||
def __str__(self):
|
||||
return "Metadata: " +str(self.metadata) + "\nListables: " +str(self.listables) #+ "\nChapters: "+str(self.chapters)
|
||||
|
|
|
|||
|
|
@ -39,10 +39,11 @@ class BaseStoryWriter(Configurable):
|
|||
|
||||
def __init__(self, config, adapter):
|
||||
Configurable.__init__(self, config)
|
||||
self.addConfigSection(adapter.getSiteDomain())
|
||||
self.addConfigSection(self.getFormatName())
|
||||
self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
|
||||
self.addConfigSection("overrides")
|
||||
self.setSectionOrder(adapter.getSiteDomain(),self.getFormatName())
|
||||
# self.addConfigSection(adapter.getSiteDomain())
|
||||
# self.addConfigSection(self.getFormatName())
|
||||
# self.addConfigSection(adapter.getSiteDomain()+":"+self.getFormatName())
|
||||
# self.addConfigSection("overrides")
|
||||
|
||||
self.adapter = adapter
|
||||
self.story = adapter.getStoryMetadataOnly() # only cache the metadata initially.
|
||||
|
|
@ -144,7 +145,7 @@ class BaseStoryWriter(Configurable):
|
|||
def _write(self, out, text):
|
||||
out.write(text.encode('utf8'))
|
||||
|
||||
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None):
|
||||
def writeTitlePage(self, out, START, ENTRY, END, WIDE_ENTRY=None, NO_TITLE_ENTRY=None):
|
||||
"""
|
||||
Write the title page, but only include entries that there's
|
||||
metadata for. START, ENTRY and END are expected to already by
|
||||
|
|
@ -171,6 +172,12 @@ class BaseStoryWriter(Configurable):
|
|||
label=self.getConfig(entry+"_label")
|
||||
else:
|
||||
label=self.titleLabels[entry]
|
||||
|
||||
# If the label for the title entry is empty, use the
|
||||
# 'no title' option if there is one.
|
||||
if label == "" and NO_TITLE_ENTRY:
|
||||
TEMPLATE= NO_TITLE_ENTRY
|
||||
|
||||
self._write(out,TEMPLATE.substitute({'label':label,
|
||||
'value':self.story.getMetadata(entry)}))
|
||||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ import string
|
|||
import StringIO
|
||||
import zipfile
|
||||
from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
|
||||
import urllib
|
||||
|
||||
## XML isn't as forgiving as HTML, so rather than generate as strings,
|
||||
## use DOM to generate the XML files.
|
||||
|
|
@ -57,6 +58,10 @@ class EpubWriter(BaseStoryWriter):
|
|||
|
||||
self.EPUB_TITLE_ENTRY = string.Template('''
|
||||
<b>${label}:</b> ${value}<br />
|
||||
''')
|
||||
|
||||
self.EPUB_NO_TITLE_ENTRY = string.Template('''
|
||||
${value}<br />
|
||||
''')
|
||||
|
||||
self.EPUB_TITLE_PAGE_END = string.Template('''
|
||||
|
|
@ -84,6 +89,10 @@ class EpubWriter(BaseStoryWriter):
|
|||
|
||||
self.EPUB_TABLE_TITLE_WIDE_ENTRY = string.Template('''
|
||||
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_NO_TITLE_ENTRY = string.Template('''
|
||||
<tr><td colspan="2">${label}${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.EPUB_TABLE_TITLE_PAGE_END = string.Template('''
|
||||
|
|
@ -252,7 +261,52 @@ class EpubWriter(BaseStoryWriter):
|
|||
itemrefs = [] # list of strings -- idrefs from .opfs' spines
|
||||
items.append(("ncx","toc.ncx","application/x-dtbncx+xml",None)) ## we'll generate the toc.ncx file,
|
||||
## but it needs to be in the items manifest.
|
||||
|
||||
if self.getConfig('include_images'):
|
||||
imgcount=0
|
||||
for imgmap in self.story.getImgUrls():
|
||||
imgfile = "OEBPS/"+imgmap['newsrc']
|
||||
outputepub.writestr(imgfile,imgmap['data'])
|
||||
items.append(("image%04d"%imgcount,
|
||||
imgfile,
|
||||
imgmap['mime'],
|
||||
None))
|
||||
imgcount+=1
|
||||
|
||||
|
||||
items.append(("style","OEBPS/stylesheet.css","text/css",None))
|
||||
|
||||
guide = None
|
||||
coverIO = None
|
||||
|
||||
if self.story.cover:
|
||||
items.append(("cover","OEBPS/cover.xhtml","application/xhtml+xml",None))
|
||||
itemrefs.append("cover")
|
||||
#
|
||||
# <meta name="cover" content="cover.jpg"/>
|
||||
metadata.appendChild(newTag(contentdom,"meta",{"content":"image0000",
|
||||
"name":"cover"}))
|
||||
# cover stuff for later:
|
||||
# at end of <package>:
|
||||
# <guide>
|
||||
# <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
|
||||
# </guide>
|
||||
guide = newTag(contentdom,"guide")
|
||||
guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
|
||||
"title":"Cover",
|
||||
"href":"OEBPS/cover.xhtml"}))
|
||||
|
||||
coverIO = StringIO.StringIO()
|
||||
coverIO.write('''
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
|
||||
@page {padding: 0pt; margin:0pt}
|
||||
body { text-align: center; padding:0pt; margin: 0pt; }
|
||||
div { margin: 0pt; padding: 0pt; }
|
||||
</style></head><body><div>
|
||||
<img src="%s" alt="cover"/>
|
||||
</div></body></html>
|
||||
'''%self.story.cover)
|
||||
|
||||
if self.getConfig("include_titlepage"):
|
||||
items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
|
||||
itemrefs.append("title_page")
|
||||
|
|
@ -283,8 +337,17 @@ class EpubWriter(BaseStoryWriter):
|
|||
spine.appendChild(newTag(contentdom,"itemref",
|
||||
attrs={"idref":itemref,
|
||||
"linear":"yes"}))
|
||||
# guide only exists if there's a cover.
|
||||
if guide:
|
||||
package.appendChild(guide)
|
||||
|
||||
# write content.opf to zip.
|
||||
outputepub.writestr("content.opf",contentdom.toxml(encoding='utf-8'))
|
||||
contentxml = contentdom.toxml(encoding='utf-8')
|
||||
# tweak for brain damaged Nook STR.
|
||||
contentxml = contentxml.replace('<meta content="image0000" name="cover"/>',
|
||||
'<meta name="cover" content="image0000"/>')
|
||||
outputepub.writestr("content.opf",contentxml)
|
||||
|
||||
contentdom.unlink()
|
||||
del contentdom
|
||||
|
||||
|
|
@ -320,7 +383,7 @@ class EpubWriter(BaseStoryWriter):
|
|||
index=0
|
||||
for item in items:
|
||||
(id,href,type,title)=item
|
||||
# only items to be skipped, toc.ncx, stylesheet.css, should have no title.
|
||||
# only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
|
||||
if title :
|
||||
navPoint = newTag(tocncxdom,"navPoint",
|
||||
attrs={'id':id,
|
||||
|
|
@ -333,7 +396,7 @@ class EpubWriter(BaseStoryWriter):
|
|||
navPoint.appendChild(newTag(tocncxdom,"content",attrs={"src":href}))
|
||||
index=index+1
|
||||
|
||||
# write toc.ncs to zip file
|
||||
# write toc.ncx to zip file
|
||||
outputepub.writestr("toc.ncx",tocncxdom.toxml(encoding='utf-8'))
|
||||
tocncxdom.unlink()
|
||||
del tocncxdom
|
||||
|
|
@ -346,19 +409,26 @@ class EpubWriter(BaseStoryWriter):
|
|||
TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
|
||||
NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
|
||||
TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
|
||||
else:
|
||||
TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.EPUB_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY # same, only wide in tables.
|
||||
NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
|
||||
TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END
|
||||
|
||||
|
||||
if coverIO:
|
||||
outputepub.writestr("OEBPS/cover.xhtml",coverIO.getvalue())
|
||||
coverIO.close()
|
||||
|
||||
titlepageIO = StringIO.StringIO()
|
||||
self.writeTitlePage(out=titlepageIO,
|
||||
START=TITLE_PAGE_START,
|
||||
ENTRY=TITLE_ENTRY,
|
||||
WIDE_ENTRY=WIDE_TITLE_ENTRY,
|
||||
END=TITLE_PAGE_END)
|
||||
END=TITLE_PAGE_END,
|
||||
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
|
||||
if titlepageIO.getvalue(): # will be false if no title page.
|
||||
outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
|
||||
titlepageIO.close()
|
||||
|
|
@ -384,7 +454,7 @@ class EpubWriter(BaseStoryWriter):
|
|||
fullhtml = fullhtml.replace('</p>','</p>\n').replace('<br />','<br />\n')
|
||||
outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
|
||||
del fullhtml
|
||||
|
||||
|
||||
# declares all the files created by Windows. otherwise, when
|
||||
# it runs in appengine, windows unzips the files as 000 perms.
|
||||
for zf in outputepub.filelist:
|
||||
|
|
|
|||
|
|
@ -49,6 +49,10 @@ class MobiWriter(BaseStoryWriter):
|
|||
|
||||
self.MOBI_TITLE_ENTRY = string.Template('''
|
||||
<b>${label}:</b> ${value}<br />
|
||||
''')
|
||||
|
||||
self.MOBI_NO_TITLE_ENTRY = string.Template('''
|
||||
${value}<br />
|
||||
''')
|
||||
|
||||
self.MOBI_TITLE_PAGE_END = string.Template('''
|
||||
|
|
@ -75,6 +79,10 @@ class MobiWriter(BaseStoryWriter):
|
|||
|
||||
self.MOBI_TABLE_TITLE_WIDE_ENTRY = string.Template('''
|
||||
<tr><td colspan="2"><b>${label}:</b> ${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.MOBI_TABLE_NO_TITLE_WIDE_ENTRY = string.Template('''
|
||||
<tr><td colspan="2">${value}</td></tr>
|
||||
''')
|
||||
|
||||
self.MOBI_TABLE_TITLE_PAGE_END = string.Template('''
|
||||
|
|
@ -129,11 +137,13 @@ class MobiWriter(BaseStoryWriter):
|
|||
TITLE_PAGE_START = self.MOBI_TABLE_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.MOBI_TABLE_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.MOBI_TABLE_TITLE_WIDE_ENTRY
|
||||
NO_TITLE_ENTRY = self.MOBI_TABLE_NO_TITLE_ENTRY
|
||||
TITLE_PAGE_END = self.MOBI_TABLE_TITLE_PAGE_END
|
||||
else:
|
||||
TITLE_PAGE_START = self.MOBI_TITLE_PAGE_START
|
||||
TITLE_ENTRY = self.MOBI_TITLE_ENTRY
|
||||
WIDE_TITLE_ENTRY = self.MOBI_TITLE_ENTRY # same, only wide in tables.
|
||||
NO_TITLE_ENTRY = self.MOBI_NO_TITLE_ENTRY
|
||||
TITLE_PAGE_END = self.MOBI_TITLE_PAGE_END
|
||||
|
||||
titlepageIO = StringIO.StringIO()
|
||||
|
|
@ -141,7 +151,8 @@ class MobiWriter(BaseStoryWriter):
|
|||
START=TITLE_PAGE_START,
|
||||
ENTRY=TITLE_ENTRY,
|
||||
WIDE_ENTRY=WIDE_TITLE_ENTRY,
|
||||
END=TITLE_PAGE_END)
|
||||
END=TITLE_PAGE_END,
|
||||
NO_TITLE_ENTRY=NO_TITLE_ENTRY)
|
||||
if titlepageIO.getvalue(): # will be false if no title page.
|
||||
files.append(titlepageIO.getvalue())
|
||||
titlepageIO.close()
|
||||
|
|
|
|||
4
main.py
4
main.py
|
|
@ -339,7 +339,7 @@ class FanfictionDownloader(UserConfigServer):
|
|||
self.redirect("/?error=custom&errtext=%s"%urlEscape("There's an error in your User Configuration: "+str(e)))
|
||||
return
|
||||
|
||||
adapter = adapters.getAdapter(config,url)
|
||||
adapter = adapters.getAdapter(config,url,format)
|
||||
logging.info('Created an adaper: %s' % adapter)
|
||||
|
||||
if len(login) > 1:
|
||||
|
|
@ -442,7 +442,7 @@ class FanfictionDownloaderTask(UserConfigServer):
|
|||
|
||||
try:
|
||||
config = self.getUserConfig(user)
|
||||
adapter = adapters.getAdapter(config,url)
|
||||
adapter = adapters.getAdapter(config,url,format)
|
||||
|
||||
logging.info('Created an adapter: %s' % adapter)
|
||||
|
||||
|
|
|
|||
|
|
@ -108,10 +108,6 @@ extratags: FanFiction
|
|||
## useful if pulling large numbers of stories or if the site is slow.
|
||||
#slow_down_sleep_time:0.5
|
||||
|
||||
## output background color--only used by html and epub (and ignored in
|
||||
## epub by many readers). Must be hex code, # will be added.
|
||||
background_color: ffffff
|
||||
|
||||
## Use regular expressions to find and replace (or remove) metadata.
|
||||
## For example, you could change Sci-Fi=>SF, remove *-Centered tags,
|
||||
## etc. See http://docs.python.org/library/re.html (look for re.sub)
|
||||
|
|
@ -160,13 +156,6 @@ titlepage_entries: series,category,genre,language,status,datePublished,dateUpdat
|
|||
## use \r\n for line endings, the windows convention. text output only.
|
||||
windows_eol: true
|
||||
|
||||
[txt]
|
||||
## Add URLs since there aren't links.
|
||||
titlepage_entries: series,category,genre,status,datePublished,dateUpdated,dateCreated,rating,warnings,numChapters,numWords,site,storyUrl, authorUrl, description
|
||||
|
||||
## use \r\n for line endings, the windows convention. text output only.
|
||||
windows_eol: true
|
||||
|
||||
[epub]
|
||||
|
||||
## epub carries the TOC in metadata.
|
||||
|
|
@ -213,11 +202,41 @@ output_css:
|
|||
.u {text-decoration: underline;}
|
||||
.bold {font-weight: bold;}
|
||||
|
||||
## include images from img tags in the body and summary of
|
||||
## stories. Images will be converted to jpg for size if possible.
|
||||
#include_images:false
|
||||
|
||||
## If not set, the summary will have all html stripped for safety.
|
||||
## Both this and include_images must be true to get images in the
|
||||
## summary.
|
||||
#keep_summary_html:false
|
||||
|
||||
## If set, the first image found will be made the cover image. If
|
||||
## keep_summary_html is true, any images in summary will be before any
|
||||
## in chapters.
|
||||
#make_firstimage_cover: false
|
||||
|
||||
## If set, and there isn't already a cover image from the adapter or
|
||||
## from make_firstimage_cover, this image will be made the cover.
|
||||
## It can be either a 'file:' or 'http:' url.
|
||||
## Note that if you enable make_firstimage_cover in [epub], but want
|
||||
## to use default_cover_image for a specific site, use the site:format
|
||||
## section, for example: [www.ficwad.com:epub]
|
||||
#default_cover_image:file:///C:/Users/username/Desktop/nook/images/icon.png
|
||||
#default_cover_image:http://www.somesite.com/someimage.gif
|
||||
|
||||
## Resize images down to width, height, preserving aspect ratio.
|
||||
## Nook size, with margin.
|
||||
image_max_size: 580, 725
|
||||
|
||||
## Change image to grayscale, if graphics library allows, to save
|
||||
## space.
|
||||
#grayscale_images: false
|
||||
|
||||
[mobi]
|
||||
## mobi TOC cannot be turned off right now.
|
||||
#include_tocpage: true
|
||||
|
||||
|
||||
## Each site has a section that overrides [defaults] *and* the format
|
||||
## sections. test1.com specifically is not a real story site. Instead,
|
||||
## it is a fake site for testing configuration and output. It uses
|
||||
|
|
|
|||
Loading…
Reference in a new issue