Handle recoverable OPF errors more gracefully, with logging. Change LIT-writing logging to share logger with OEB-processing.

This commit is contained in:
Marshall T. Vandegrift 2008-12-16 18:55:44 -05:00
parent c82d2d5d13
commit 24e5133c3b
2 changed files with 57 additions and 22 deletions

View file

@ -13,7 +13,9 @@
from itertools import izip, count
from urlparse import urldefrag, urlparse, urlunparse
from urllib import unquote as urlunquote
import logging
from lxml import etree
from calibre import LoggingInterface
XML_PARSER = etree.XMLParser(recover=True, resolve_entities=False)
XML_NS = 'http://www.w3.org/XML/1998/namespace'
@ -82,6 +84,13 @@ def urlnormalize(href):
return urlunparse(parts)
class FauxLogger(object):
    """Minimal stand-in logger that prints every message to stdout.

    Any ordinary attribute lookup returns the logger itself, so a call such
    as ``logger.log_warn(msg)`` resolves to ``logger(msg)`` and simply prints
    the message.  It is used as the default ``logger`` argument where no
    real logging interface is supplied.
    """

    def __getattr__(self, name):
        # Do not masquerade as special methods: protocols such as copy and
        # pickle probe for optional dunder hooks (e.g. ``__deepcopy__``) and
        # would otherwise end up calling the logger itself.
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        return self

    def __call__(self, message, *args, **kwargs):
        """Print *message*, %-formatted with *args* when any are given.

        Extra keyword arguments are accepted and ignored so that
        logging-style calls do not fail.
        """
        if args:
            message = message % args
        # The parenthesised single-argument form behaves identically as a
        # Python 2 print statement and as a Python 3 function call.
        print(message)
class AbstractContainer(object):
def read_xml(self, path):
return etree.fromstring(
@ -102,6 +111,10 @@ def write(self, path, data):
with open(urlunquote(path), 'wb') as f:
return f.write(data)
def exists(self, path):
    """Return True if *path* names a regular file under ``self.rootdir``.

    *path* is URL-unquoted before the filesystem lookup, matching how
    ``write`` opens ``urlunquote(path)``; without this, a percent-encoded
    href (e.g. ``my%20file.html``) would be reported missing even though
    the file exists on disk.
    """
    path = os.path.join(self.rootdir, urlunquote(path))
    return os.path.isfile(path)
class Metadata(object):
TERMS = set(['contributor', 'coverage', 'creator', 'date', 'description',
@ -287,7 +300,7 @@ def items(self):
yield id, items
def __contains__(self, key):
    """Return True if an item with id *key* is present.

    ``items`` is a generator method yielding ``(id, item)`` pairs, so it
    must be called and iterated; applying ``in`` to the bound method itself
    raises TypeError, and testing the builtin ``id`` ignores *key* entirely.
    """
    # NOTE(review): this is a linear scan; if the class keeps an id-keyed
    # mapping, a direct lookup on it would be preferable -- confirm.
    return any(key == item_id for item_id, _item in self.items())
def to_opf1(self, parent=None):
elem = element(parent, 'manifest')
@ -473,13 +486,14 @@ def to_ncx(self, parent, playorder=None, depth=1):
node.to_ncx(point, playorder, depth+1)
return parent
class OEBBook(object):
def __init__(self, opfpath, container=None):
def __init__(self, opfpath, container=None, logger=FauxLogger()):
if not container:
container = DirContainer(os.path.dirname(opfpath))
opfpath = os.path.basename(opfpath)
self.container = container
self.logger = logger
opf = self._read_opf(opfpath)
self._all_from_opf(opf)
@ -533,17 +547,28 @@ def _metadata_from_opf(self, opf):
if item.id == uid:
self.uid = item
break
else:
self.logger.log_warn(u'Unique-identifier %r not found.' % uid)
self.uid = metadata.identifier[0]
def _manifest_from_opf(self, opf):
    """Build ``self.manifest`` from the ``<manifest>`` items of *opf*.

    Items whose ``href`` is missing, empty, or does not exist in the
    container are skipped with a logged warning rather than added, so a
    broken manifest entry does not abort processing.
    """
    self.manifest = manifest = Manifest(self)
    for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
        href = elem.get('href')
        # Check the href before touching the container: an <item> without
        # an href attribute yields None, which the container cannot resolve.
        if not href or not self.container.exists(href):
            self.logger.log_warn(u'Manifest item %r not found.' % href)
            continue
        manifest.add(elem.get('id'), href, elem.get('media-type'),
                     elem.get('fallback'))
def _spine_from_opf(self, opf):
self.spine = spine = Spine(self)
for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
item = self.manifest[elem.get('idref')]
idref = elem.get('idref')
if idref not in self.manifest:
self.logger.log_warn(u'Spine item %r not found.' % idref)
continue
item = self.manifest[idref]
spine.add(item, elem.get('linear'))
extras = []
for item in self.manifest.values():
@ -557,7 +582,11 @@ def _spine_from_opf(self, opf):
def _guide_from_opf(self, opf):
    """Build ``self.guide`` from the ``<guide>`` references of *opf*.

    A reference is kept only when the document part of its ``href`` (the
    part before any ``#fragment``) is a known manifest href; otherwise a
    warning is logged and the reference is skipped.  The original href,
    fragment included, is what gets stored in the guide.
    """
    self.guide = guide = Guide(self)
    for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        href = elem.get('href')
        # Guide references commonly point at an anchor inside a document,
        # so strip the fragment before looking the document up.
        path = urldefrag(href)[0] if href else None
        if not path or path not in self.manifest.hrefs:
            self.logger.log_warn(u'Guide reference %r not found' % href)
            continue
        guide.add(elem.get('type'), elem.get('title'), href)
def _toc_from_navpoint(self, toc, navpoint):
children = xpath(navpoint, 'ncx:navPoint')

View file

@ -26,10 +26,11 @@
from calibre.ebooks.lit.oeb import OEB_DOCS, OEB_STYLES, OEB_CSS_MIME, \
CSS_MIME, OPF_MIME, XML_NS, XML
from calibre.ebooks.lit.oeb import namespace, barename, urlnormalize, xpath
from calibre.ebooks.lit.oeb import OEBBook
from calibre.ebooks.lit.oeb import FauxLogger, OEBBook
from calibre.ebooks.lit.stylizer import Stylizer
from calibre.ebooks.lit.lzx import Compressor
import calibre
from calibre import LoggingInterface
from calibre import plugins
msdes, msdeserror = plugins['msdes']
import calibre.ebooks.lit.mssha1 as mssha1
@ -141,9 +142,9 @@ def warn(x):
class ReBinary(object):
NSRMAP = {'': None, XML_NS: 'xml'}
def __init__(self, root, path, oeb, map=HTML_MAP, warn=warn):
def __init__(self, root, path, oeb, map=HTML_MAP, logger=FauxLogger()):
self.path = path
self.log_warn = warn
self.logger = logger
self.dir = os.path.dirname(path)
self.manifest = oeb.manifest
self.tags, self.tattrs = map
@ -272,7 +273,7 @@ def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[],
def build_ahc(self):
if len(self.anchors) > 6:
self.log_warn("More than six anchors in file %r. " \
self.logger.log_warn("More than six anchors in file %r. " \
"Some links may not work properly." % self.path)
data = StringIO()
data.write(unichr(len(self.anchors)).encode('utf-8'))
@ -296,11 +297,10 @@ def wrapper(self, *args, **kwargs):
functools.update_wrapper(wrapper, function)
return wrapper
class LitWriter(object, calibre.LoggingInterface):
def __init__(self, oeb, verbose=0):
calibre.LoggingInterface.__init__(self, logging.getLogger('oeb2lit'))
self.setup_cli_handler(verbose)
class LitWriter(object):
def __init__(self, oeb, logger=FauxLogger()):
self._oeb = oeb
self._logger = logger
self._litize_oeb()
def _litize_oeb(self):
@ -325,7 +325,7 @@ def _litize_oeb(self):
if type not in oeb.guide:
oeb.guide.add(type, title, cover.href)
else:
self.log_warn('No suitable cover image found.')
self._logger.log_warn('No suitable cover image found.')
def dump(self, stream):
self._stream = stream
@ -467,7 +467,7 @@ def _build_data(self):
self._add_folder('/data')
for item in self._oeb.manifest.values():
if item.media_type not in LIT_MIMES:
self.log_warn("File %r of unknown media-type %r " \
self._logger.log_warn("File %r of unknown media-type %r " \
"excluded from output." % (item.href, item.media_type))
continue
name = '/data/' + item.id
@ -475,7 +475,8 @@ def _build_data(self):
secnum = 0
if not isinstance(data, basestring):
self._add_folder(name)
rebin = ReBinary(data, item.href, self._oeb, warn=self.log_warn)
rebin = ReBinary(data, item.href, self._oeb, map=HTML_MAP,
logger=self._logger)
self._add_file(name + '/ahc', rebin.ahc, 0)
self._add_file(name + '/aht', rebin.aht, 0)
item.page_breaks = rebin.page_breaks
@ -554,7 +555,8 @@ def _build_meta(self):
meta.attrib['ms--minimum_level'] = '0'
meta.attrib['ms--attr5'] = '1'
meta.attrib['ms--guid'] = '{%s}' % str(uuid.uuid4()).upper()
rebin = ReBinary(meta, 'content.opf', self._oeb, map=OPF_MAP, warn=self.log_warn)
rebin = ReBinary(meta, 'content.opf', self._oeb, map=OPF_MAP,
logger=self._logger)
meta = rebin.content
self._meta = meta
self._add_file('/meta', meta)
@ -713,19 +715,23 @@ def option_parser():
parser.add_option(
'-o', '--output', default=None,
help=_('Output file. Default is derived from input filename.'))
parser.add_option(
'--verbose', default=False, action='store_true',
help=_('Useful for debugging.'))
return parser
def oeb2lit(opts, opfpath):
    """Convert the OEB book described by *opfpath* into a LIT file.

    The output path is ``opts.output`` or, when that is None, the input
    file's base name with a ``.lit`` extension in the current directory.
    A single logging interface is created here and shared by both the OEB
    reader and the LIT writer, so their warnings go through the same
    CLI handler configured from ``opts.verbose``.
    """
    logger = LoggingInterface(logging.getLogger('oeb2lit'))
    logger.setup_cli_handler(opts.verbose)
    litpath = opts.output
    if litpath is None:
        litpath = os.path.basename(opfpath)
        litpath = os.path.splitext(litpath)[0] + '.lit'
    litpath = os.path.abspath(litpath)
    # Pass the logger explicitly; otherwise both objects fall back to their
    # default FauxLogger and bypass the handler set up above.
    oeb = OEBBook(opfpath, logger=logger)
    lit = LitWriter(oeb, logger=logger)
    with open(litpath, 'wb') as f:
        lit.dump(f)
    logger.log_info(_('Output written to ')+litpath)
def main(argv=sys.argv):