ebook-polish: Roundtripping for both epub and azw3.

This commit is contained in:
Kovid Goyal 2013-02-03 22:36:31 +05:30
parent d661b15ae2
commit 09429e3c1c
2 changed files with 130 additions and 26 deletions

View file

@ -337,6 +337,24 @@ def xml2unicode(root, pretty_print=False):
def xml2text(elem):
    '''
    Return the text content of elem as a unicode string.

    The element is serialized with lxml's ``text`` method and the element's
    own tail text is left out (``with_tail=False``).
    '''
    text = etree.tostring(elem, method='text', encoding=unicode, with_tail=False)
    return text
def serialize(data, media_type, pretty_print=False):
    '''
    Convert a parsed item into the bytestring that should be written to disk.

    :param data: An lxml element, a unicode string, an object exposing a
        ``cssText`` attribute, or anything else accepted by ``bytes()``.
    :param media_type: Mime type of the item; only consulted for lxml elements,
        to decide whether self closing tags must be expanded.
    :param pretty_print: Passed through to xml2str() for lxml elements.
    :return: The serialized data.
    '''
    if isinstance(data, etree._Element):
        raw = xml2str(data, pretty_print=pretty_print)
        # Convert self closing div|span|a|video|audio|iframe|etc tags
        # to normally closed ones, as they are interpreted
        # incorrectly by some browser based renderers
        return close_self_closing_tags(raw) if media_type in OEB_DOCS else raw

    if isinstance(data, unicode):
        return data.encode('utf-8')

    if not hasattr(data, 'cssText'):
        return bytes(data)

    # Stylesheet-like object: serialize its text, terminated by a newline
    css = data.cssText
    if isinstance(css, unicode):
        css = css.encode('utf-8')
    return css + b'\n'
# The single-character strings for ordinals 0-127
ASCII_CHARS = set(map(chr, xrange(128)))
# The single-character strings for ordinals 0-255
UNIBYTE_CHARS = set(map(chr, xrange(256)))
URL_SAFE = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
@ -960,23 +978,7 @@ def loader2(*args):
self._data = None
def __str__(self):
    '''
    Return this item's data serialized to a bytestring.

    All the type specific handling (lxml trees, unicode text, stylesheet
    objects, everything else) lives in the module level serialize()
    function; this method only supplies the item's media type and the
    book's pretty_print setting. Keeping a second inline copy of that logic
    here would make the delegating call unreachable and let the two
    implementations drift apart.
    '''
    return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print)
def __unicode__(self):
data = self.data

View file

@ -13,17 +13,20 @@
from lxml import etree
from calibre import guess_type, CurrentDir
from calibre.customize.ui import (plugin_for_input_format,
plugin_for_output_format)
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.conversion.plugins.epub_input import (
ADOBE_OBFUSCATION, IDPF_OBFUSCATION, decrypt_font)
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor, CSSPreProcessor
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.ebooks.oeb.base import OEB_DOCS, _css_logger, OEB_STYLES, OPF2_NS
from calibre.ebooks.mobi.tweak import set_cover
from calibre.ebooks.oeb.base import (serialize, OEB_DOCS, _css_logger,
OEB_STYLES, OPF2_NS)
from calibre.ebooks.oeb.polish.errors import InvalidBook, DRMError
from calibre.ebooks.oeb.parse_utils import NotHTML, parse_html, RECOVER_PARSER
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.fonts.sfnt.container import Sfnt
from calibre.ptempfile import PersistentTemporaryDirectory, PersistentTemporaryFile
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile
@ -43,6 +46,7 @@ def __init__(self, rootpath, opfpath, log):
self.parsed_cache = {}
self.mime_map = {}
self.name_path_map = {}
self.dirtied = set()
# Map of relative paths with '/' separators from root of unzipped ePub
# to absolute paths on filesystem with os-specific separators
@ -141,8 +145,6 @@ def parse(self, path, mime):
data = self.parse_xml(data)
elif mime in OEB_STYLES:
data = self.parse_css(data, self.relpath(path))
elif mime in OEB_FONTS or path.rpartition('.')[-1].lower() in {'ttf', 'otf'}:
data = Sfnt(data)
return data
def parse_css(self, data, fname):
@ -189,6 +191,64 @@ def spine_items(self):
for path in non_linear:
yield path
def remove_item(self, name):
    '''
    Remove the item identified by name from this container. This removes all
    references to the item in the OPF manifest, guide and spine as well as from
    any internal caches.

    A name that has no file recorded in name_path_map is tolerated: the OPF
    references and cache entries are still cleaned up and no file is deleted.

    :param name: The name of the item to remove, in the form returned by
        href_to_name().
    '''
    ns = {'opf': OPF2_NS}

    # Drop the manifest entries, remembering their ids so that the spine
    # (which refers to items by id rather than by href) can be cleaned up.
    removed = set()
    for elem in self.opf.xpath('//opf:manifest/opf:item[@href]', namespaces=ns):
        if self.href_to_name(elem.get('href')) == name:
            id_ = elem.get('id', None)
            if id_ is not None:
                removed.add(id_)
            elem.getparent().remove(elem)

    if removed:
        for item in self.opf.xpath('//opf:spine/opf:itemref[@idref]',
                                   namespaces=ns):
            if item.get('idref') in removed:
                item.getparent().remove(item)

    # Guide references point at the item by href, not by id
    for item in self.opf.xpath('//opf:guide/opf:reference[@href]',
                               namespaces=ns):
        if self.href_to_name(item.get('href')) == name:
            item.getparent().remove(item)

    # Use a default, like the other cache lookups below, so that an unknown
    # name does not raise KeyError after the OPF has already been modified.
    path = self.name_path_map.pop(name, None)
    if path is not None and os.path.exists(path):
        os.remove(path)
    self.mime_map.pop(name, None)
    self.parsed_cache.pop(name, None)
    self.dirtied.discard(name)
def dirty(self, name):
    '''
    Record name in the set of dirtied items. Marking the same name more
    than once has no additional effect.
    '''
    pending = self.dirtied
    pending.add(name)
def commit(self, outpath=None):
    '''
    Write every dirtied item back to its file on disk.

    For each name in self.dirtied the name is removed from the dirty set,
    its parsed object is popped from self.parsed_cache, serialized according
    to the mime type recorded in self.mime_map and written to the path
    recorded in self.name_path_map.

    :param outpath: Not used by this implementation.
    :raises KeyError: If a dirtied name is missing from parsed_cache,
        mime_map or name_path_map.
    '''
    # Iterate over a snapshot: removing entries from a set while iterating
    # over that same set raises RuntimeError.
    for name in tuple(self.dirtied):
        self.dirtied.remove(name)
        data = self.parsed_cache.pop(name)
        # serialize() requires the media type to pick the output form
        data = serialize(data, self.mime_map[name])
        with open(self.name_path_map[name], 'wb') as f:
            f.write(data)
def compare_to(self, other):
    '''
    Compare the files of this container, byte for byte, with those of other.

    :param other: Another container exposing a name_path_map.
    :return: A one element list if the two containers do not hold the same
        set of names, otherwise a string with one line per differing file
        (empty when nothing differs).
        NOTE(review): the two return types differ (list vs. string).
    '''
    if set(self.name_path_map) != set(other.name_path_map):
        return ['Set of files is not the same']

    report = []
    for name, path in self.name_path_map.iteritems():
        opath = other.name_path_map[name]
        with open(path, 'rb') as mine, open(opath, 'rb') as theirs:
            identical = mine.read() == theirs.read()
            if not identical:
                report.append('The file %s is not the same'%name)
                # Launch the external kompare program on the two files
                import subprocess
                subprocess.call(['kompare', path, opath])
    return '\n'.join(report)
# EPUB {{{
class InvalidEpub(InvalidBook):
    '''
    InvalidBook subclass that adds no behavior of its own.

    NOTE(review): presumably raised for malformed EPUB files; the places
    that raise it are not visible here -- confirm.
    '''
    pass
@ -294,8 +354,24 @@ def process_encryption(self):
if not tkey:
raise InvalidBook('Failed to find obfuscation key')
decrypt_font(tkey, path, alg)
self.obfuscated_fonts[name] = (alg, tkey)
self.obfuscated_fonts[font] = (alg, tkey)
def commit(self, outpath=None):
    '''
    Write out dirtied items, re-obfuscate fonts and rebuild the EPUB file.

    :param outpath: Path of the file to create. When None, self.pathtoepub
        is used.
    '''
    super(EpubContainer, self).commit()

    for name in self.obfuscated_fonts:
        if name in self.name_path_map:
            alg, key = self.obfuscated_fonts[name]
            # Decrypting and encrypting are the same operation (XOR with key)
            decrypt_font(key, self.name_path_map[name], alg)

    target = self.pathtoepub if outpath is None else outpath
    from calibre.ebooks.tweak import zip_rebuilder
    zip_rebuilder(self.root, target)
# }}}
# AZW3 {{{
class InvalidMobi(InvalidBook):
    '''
    InvalidBook subclass that adds no behavior of its own.

    NOTE(review): presumably raised for malformed MOBI/AZW3 files; the
    places that raise it are not visible here -- confirm.
    '''
    pass
@ -357,14 +433,40 @@ def __init__(self, pathtoazw3, log):
super(AZW3Container, self).__init__(tdir, opf_path, log)
self.obfuscated_fonts = {x.replace(os.sep, '/') for x in obfuscated_fonts}
def commit(self, outpath=None):
    '''
    Write out dirtied items and regenerate the AZW3 file from the OPF.

    An OEB book is created from the container's OPF using the default
    conversion options with mobi_passthrough switched on, its cover is set
    with set_cover() and it is then written by the azw3 output plugin.

    :param outpath: Path of the file to create. When None, self.pathtoazw3
        is used.
    '''
    super(AZW3Container, self).commit()
    target = self.pathtoazw3 if outpath is None else outpath

    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf_path = self.name_path_map[self.opf_name]

    # The Plumber is only needed here to build the conversion options
    plumber = Plumber(opf_path, target, self.log)
    plumber.setup_options()
    input_plugin = plugin_for_input_format('azw3')
    output_plugin = plugin_for_output_format('azw3')
    plumber.opts.mobi_passthrough = True

    oeb = create_oebbook(default_log, opf_path, plumber.opts)
    set_cover(oeb)
    output_plugin.convert(oeb, target, input_plugin, plumber.opts, default_log)
# }}}
def get_container(path, log=None):
    '''
    Create the container object for the book at path.

    :param path: Path to the book. A file extension of azw3 or mobi (case
        insensitive) selects AZW3Container, anything else EpubContainer.
    :param log: Log passed to the container; default_log when None.
    :return: The newly created container.
    '''
    if log is None:
        log = default_log
    extension = path.rpartition('.')[-1].lower()
    if extension in {'azw3', 'mobi'}:
        container_class = AZW3Container
    else:
        container_class = EpubContainer
    return container_class(path, log)
if __name__ == '__main__':
    def test_roundtrip():
        '''
        Roundtrip the book named by the last command line argument.

        The book is opened, its spine items are listed, it is committed to
        a temporary file with the same extension, and the result is compared
        with the original book. Any differences found are printed.
        '''
        source = sys.argv[-1]
        ebook = get_container(source)
        for s in ebook.spine_items:
            print (ebook.relpath(s))
        p = PersistentTemporaryFile(suffix='.'+source.rpartition('.')[-1])
        p.close()
        ebook.commit(outpath=p.name)
        # Compare the roundtripped book against a fresh copy of the
        # original, not against itself. The original is re-opened because
        # commit() can modify the first container's working files (for
        # example EpubContainer.commit() re-obfuscates fonts in place).
        ebook2 = get_container(source)
        ebook3 = get_container(p.name)
        diff = ebook3.compare_to(ebook2)
        # compare_to() never returns None; an empty result means no
        # differences were found
        if diff:
            print (diff)

    test_roundtrip()