diff --git a/src/calibre/ebooks/oeb/base.py b/src/calibre/ebooks/oeb/base.py index 76a6648e8d..ed7981df4f 100644 --- a/src/calibre/ebooks/oeb/base.py +++ b/src/calibre/ebooks/oeb/base.py @@ -272,11 +272,26 @@ def XPath(expr): def xpath(elem, expr): return elem.xpath(expr, namespaces=XPNSMAP) -def xml2str(root, pretty_print=False): - return etree.tostring(root, encoding='utf-8', xml_declaration=True, +def _prepare_xml_for_serialization(root): + root.set('xmlns', XHTML_NS) + root.set('{%s}xlink'%XHTML_NS, XLINK_NS) + for x in root.iter(): + if hasattr(x.tag, 'rpartition') and x.tag.rpartition('}')[-1].lower() == 'svg': + x.set('xmlns', SVG_NS) + +def xml2str(root, pretty_print=False, strip_comments=False): + _prepare_xml_for_serialization(root) + ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, pretty_print=pretty_print) + if strip_comments: + ans = re.compile(r'', re.DOTALL).sub('', ans) + + return ans + + def xml2unicode(root, pretty_print=False): + _prepare_xml_for_serialization(root) return etree.tostring(root, pretty_print=pretty_print) ASCII_CHARS = set(chr(x) for x in xrange(128)) @@ -826,6 +841,11 @@ def __str__(self): return xml2str(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data.encode('utf-8') + if hasattr(data, 'cssText'): + data = data.cssText + if isinstance(data, unicode): + data = data.encode('utf-8') + return data return str(data) def __unicode__(self): @@ -834,6 +854,8 @@ def __unicode__(self): return xml2unicode(data, pretty_print=self.oeb.pretty_print) if isinstance(data, unicode): return data + if hasattr(data, 'cssText'): + return data.cssText return unicode(data) def __eq__(self, other): @@ -1044,6 +1066,12 @@ def remove(self, item): self.items[i].spine_position = i item.spine_position = None + def index(self, item): + for i, x in enumerate(self): + if item == x: + return i + return -1 + def __iter__(self): for item in self.items: yield item diff --git a/src/calibre/ebooks/oeb/iterator.py 
b/src/calibre/ebooks/oeb/iterator.py index ec0eda908a..8672d42e2b 100644 --- a/src/calibre/ebooks/oeb/iterator.py +++ b/src/calibre/ebooks/oeb/iterator.py @@ -162,7 +162,6 @@ def __enter__(self): s.pages = p start = 1 - for s in self.spine: s.start_page = start start += s.pages diff --git a/src/calibre/ebooks/oeb/output.py b/src/calibre/ebooks/oeb/output.py index ea986f49fa..480ca3776e 100644 --- a/src/calibre/ebooks/oeb/output.py +++ b/src/calibre/ebooks/oeb/output.py @@ -22,7 +22,6 @@ def convert(self, oeb_book, output_path, input_plugin, opts, log): if not os.path.exists(output_path): os.makedirs(output_path) from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME - from calibre.ebooks.html import tostring as html_tostring with CurrentDir(output_path): results = oeb_book.to_opf2(page_map=True) for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME): @@ -38,16 +37,7 @@ def convert(self, oeb_book, output_path, input_plugin, opts, log): dir = os.path.dirname(path) if not os.path.exists(dir): os.makedirs(dir) - raw = item.data - if not isinstance(raw, basestring): - if hasattr(raw, 'cssText'): - raw = raw.cssText - else: - raw = html_tostring(raw, - pretty_print=opts.pretty_print) - if isinstance(raw, unicode): - raw = raw.encode('utf-8') with open(path, 'wb') as f: - f.write(raw) + f.write(str(item)) diff --git a/src/calibre/ebooks/epub/split.py b/src/calibre/ebooks/oeb/transforms/split.py similarity index 51% rename from src/calibre/ebooks/epub/split.py rename to src/calibre/ebooks/oeb/transforms/split.py index 8ff62a1c4b..20205e9c6d 100644 --- a/src/calibre/ebooks/epub/split.py +++ b/src/calibre/ebooks/oeb/transforms/split.py @@ -4,21 +4,25 @@ __docformat__ = 'restructuredtext en' ''' -Split the flows in an epub file to conform to size limitations. +Splitting of the XHTML flows. Splitting can happen on page boundaries or can be +forces at "likely" locations to conform to size limitations. This transform +assumes a prior call to the flatcss transform. 
''' -import os, math, functools, collections, re, copy, sys +import os, math, functools, collections, re, copy from lxml.etree import XPath as _XPath from lxml import etree, html from lxml.cssselect import CSSSelector -from calibre.ebooks.metadata.opf2 import OPF +from calibre.ebooks.oeb.base import OEB_STYLES, XPNSMAP, urldefrag, \ + rewrite_links from calibre.ebooks.epub import tostring, rules -from calibre import CurrentDir -XPath = functools.partial(_XPath, namespaces={'re':'http://exslt.org/regular-expressions'}) -content = functools.partial(os.path.join, 'content') +NAMESPACES = dict(XPNSMAP) +NAMESPACES['re'] = 'http://exslt.org/regular-expressions' + +XPath = functools.partial(_XPath, namespaces=NAMESPACES) SPLIT_ATTR = 'cs' SPLIT_POINT_ATTR = 'csp' @@ -27,149 +31,166 @@ class SplitError(ValueError): def __init__(self, path, root): size = len(tostring(root))/1024. - ValueError.__init__(self, _('Could not find reasonable point at which to split: %s Sub-tree size: %d KB')% - (os.path.basename(path), size)) + ValueError.__init__(self, + _('Could not find reasonable point at which to split: ' + '%s Sub-tree size: %d KB')% + (path, size)) + +class Split(object): + + def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None, + max_flow_size=0): + self.split_on_page_breaks = split_on_page_breaks + self.page_breaks_xpath = page_breaks_xpath + self.max_flow_size = max_flow_size + if self.page_breaks_xpath is not None: + self.page_breaks_xpath = XPath(self.page_breaks_xpath) + + def __call__(self, oeb, context): + self.oeb = oeb + self.log = oeb.log + self.map = {} + self.page_break_selectors = None + for item in self.oeb.manifest.items: + if etree.iselement(item.data): + self.split_item(item) + + self.fix_links() + + def split_item(self, item): + if self.split_on_page_breaks: + if self.page_breaks_xpath is None: + page_breaks, page_break_ids = self.find_page_breaks(item) + else: + page_breaks, page_break_ids = self.page_breaks_xpath(item.data) + + 
splitter = FlowSplitter(item, page_breaks, page_break_ids, + self.max_flow_size, self.oeb) + if splitter.was_split: + self.map[item.href] = dict(splitter.anchor_map) + + def find_page_breaks(self, item): + if self.page_break_selectors is None: + self.page_break_selectors = set([]) + stylesheets = [x.data for x in self.oeb.manifest if x.media_type in + OEB_STYLES] + page_break_selectors = set([]) + for rule in rules(stylesheets): + before = getattr(rule.style.getPropertyCSSValue( + 'page-break-before'), 'cssText', '').strip().lower() + after = getattr(rule.style.getPropertyCSSValue( + 'page-break-after'), 'cssText', '').strip().lower() + try: + if before and before != 'avoid': + page_break_selectors.add((CSSSelector(rule.selectorText), + True)) + except: + pass + try: + if after and after != 'avoid': + page_break_selectors.add((CSSSelector(rule.selectorText), + False)) + except: + pass + + page_breaks = set([]) + for selector, before in page_break_selectors: + for elem in selector(item.data): + elem.pb_before = before + page_breaks.add(elem) + + for i, elem in enumerate(item.data.iter()): + elem.pb_order = i + + page_breaks = list(page_breaks) + page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order)) + page_break_ids, page_breaks_ = [], [] + for i, x in enumerate(page_breaks): + x.set('id', x.get('id', 'calibre_pb_%d'%i)) + id = x.get('id') + page_breaks_.append((XPath('//*[@id="%s"]'%id), x.pb_before)) + page_break_ids.append(id) + + return page_breaks_, page_break_ids + + def fix_links(self, opf): + ''' + Fix references to the split files in other content files. 
+ ''' + for item in self.oeb.manifest: + if etree.iselement(item.data): + self.current_item = item + rewrite_links(item.data, self.rewrite_links) + + def rewrite_links(self, url): + href, frag = urldefrag(url) + href = self.current_item.abshref(href) + if href in self.map: + anchor_map = self.map[href] + nhref = anchor_map[frag if frag else None] + if frag: + nhref = '#'.joinn(href, frag) + return nhref + return url -class Splitter(object): +class FlowSplitter(object): - def __init__(self, path, opts, stylesheet_map, opf): - self.setup_cli_handler(opts.verbose) - self.path = path - self.always_remove = not opts.preserve_tag_structure or \ - os.stat(content(path)).st_size > 5*opts.profile.flow_size - self.base = (os.path.splitext(path)[0].replace('%', '%%') + '_split_%d.html') - self.opts = opts - self.orig_size = os.stat(content(path)).st_size - self.log_info('\tSplitting %s (%d KB)', path, self.orig_size/1024.) - root = html.fromstring(open(content(path)).read()) + def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb): + self.item = item + self.oeb = oeb + self.log = oeb.log + self.page_breaks = page_breaks + self.page_break_ids = page_break_ids + self.max_flow_size = max_flow_size + self.base = item.abshref(item.href) - self.page_breaks, self.trees = [], [] - self.split_size = 0 + base, ext = os.path.splitext(self.base) + self.base = base.replace('%', '%%')+'_split_%d'+ext - # Split on page breaks + self.trees = [self.item.data] self.splitting_on_page_breaks = True - if not opts.dont_split_on_page_breaks: - self.log_info('\tSplitting on page breaks...') - if self.path in stylesheet_map: - self.find_page_breaks(stylesheet_map[self.path], root) - self.split_on_page_breaks(root.getroottree()) - trees = list(self.trees) - else: - self.trees = [root.getroottree()] - trees = list(self.trees) - - # Split any remaining over-sized trees + if self.page_breaks: + self.split_on_page_breaks(self.item.data) self.splitting_on_page_breaks = False - if 
self.opts.profile.flow_size < sys.maxint: + + if self.max_flow_size > 0: lt_found = False - self.log_info('\tLooking for large trees...') - for i, tree in enumerate(list(trees)): + self.log('\tLooking for large trees...') + trees = list(self.trees) + for i, tree in enumerate(list(self.trees)): self.trees = [] size = len(tostring(tree.getroot())) if size > self.opts.profile.flow_size: lt_found = True - try: - self.split_to_size(tree) - except (SplitError, RuntimeError): # Splitting fails - if not self.always_remove: - self.always_remove = True - self.split_to_size(tree) - else: - raise + self.split_to_size(tree) trees[i:i+1] = list(self.trees) if not lt_found: self.log_info('\tNo large trees found') + self.trees = trees - self.trees = trees self.was_split = len(self.trees) > 1 - if self.was_split: - self.commit() - self.log_info('\t\tSplit into %d parts.', len(self.trees)) - if self.opts.verbose: - for f in self.files: - self.log_info('\t\t\t%s - %d KB', f, os.stat(content(f)).st_size/1024.) - self.fix_opf(opf) + self.commit() - self.trees = None + def split_on_page_breaks(self, orig_tree): + ordered_ids = [] + for elem in orig_tree.xpath('//*[@id]'): + id = elem.get('id') + if id in self.page_break_ids: + ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)]) - - def split_text(self, text, root, size): - self.log_debug('\t\t\tSplitting text of length: %d'%len(text)) - rest = text.replace('\r', '') - parts = re.split('\n\n', rest) - self.log_debug('\t\t\t\tFound %d parts'%len(parts)) - if max(map(len, parts)) > size: - raise SplitError('Cannot split as file contains a
tag with a very large paragraph', root)
- ans = []
- buf = ''
- for part in parts:
- if len(buf) + len(part) < size:
- buf += '\n\n'+part
- else:
- ans.append(buf)
- buf = part
- return ans
-
-
- def split_to_size(self, tree):
- self.log_debug('\t\tSplitting...')
- root = tree.getroot()
- # Split large tags
- for pre in list(root.xpath('//pre')):
- text = u''.join(pre.xpath('descendant::text()'))
- pre.text = text
- for child in list(pre.iterchildren()):
- pre.remove(child)
- if len(pre.text) > self.opts.profile.flow_size*0.5:
- frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
- new_pres = []
- for frag in frags:
- pre2 = copy.copy(pre)
- pre2.text = frag
- pre2.tail = u''
- new_pres.append(pre2)
- new_pres[-1].tail = pre.tail
- p = pre.getparent()
- i = p.index(pre)
- p[i:i+1] = new_pres
-
- split_point, before = self.find_split_point(root)
- if split_point is None or self.split_size > 6*self.orig_size:
- if not self.always_remove:
- self.log_warn(_('\t\tToo much markup. Re-splitting without '
- 'structure preservation. This may cause '
- 'incorrect rendering.'))
- raise SplitError(self.path, root)
-
- for t in self.do_split(tree, split_point, before):
- r = t.getroot()
- if self.is_page_empty(r):
- continue
- size = len(tostring(r))
- if size <= self.opts.profile.flow_size:
- self.trees.append(t)
- #print tostring(t.getroot(), pretty_print=True)
- self.log_debug('\t\t\tCommitted sub-tree #%d (%d KB)',
- len(self.trees), size/1024.)
- self.split_size += size
- else:
- self.split_to_size(t)
-
- def is_page_empty(self, root):
- body = root.find('body')
- if body is None:
- return False
- txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
- if len(txt) > 4:
- #if len(txt) < 100:
- # print 1111111, html.tostring(body, method='html', encoding=unicode)
- return False
- for img in root.xpath('//img'):
- if img.get('style', '') != 'display:none':
- return False
- return True
+ self.trees = []
+ tree = orig_tree
+ for pattern, before in ordered_ids:
+ self.log.debug('\t\tSplitting on page-break')
+ elem = pattern(tree)
+ if elem:
+ before, after = self.do_split(tree, elem[0], before)
+ self.trees.append(before)
+ tree = after
+ self.trees.append(tree)
+ self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
def do_split(self, tree, split_point, before):
'''
@@ -190,7 +211,7 @@ def do_split(self, tree, split_point, before):
split_point2 = root2.xpath(path)[0]
def nix_element(elem, top=True):
- if self.always_remove:
+ if True:
parent = elem.getparent()
index = parent.index(elem)
if top:
@@ -198,7 +219,6 @@ def nix_element(elem, top=True):
else:
index = parent.index(elem)
parent[index:index+1] = list(elem.iterchildren())
-
else:
elem.text = u''
elem.tail = u''
@@ -241,67 +261,76 @@ def fix_split_point(sp):
return tree, tree2
+ def is_page_empty(self, root):
+ body = root.find('body')
+ if body is None:
+ return False
+ txt = re.sub(r'\s+', '', html.tostring(body, method='text', encoding=unicode))
+ if len(txt) > 4:
+ return False
+ for img in root.xpath('//img'):
+ if img.get('style', '') != 'display:none':
+ return False
+ return True
- def split_on_page_breaks(self, orig_tree):
- ordered_ids = []
- for elem in orig_tree.xpath('//*[@id]'):
- id = elem.get('id')
- if id in self.page_break_ids:
- ordered_ids.append(self.page_breaks[self.page_break_ids.index(id)])
-
- self.trees = []
- tree = orig_tree
- for pattern, before in ordered_ids:
- self.log_info('\t\tSplitting on page-break')
- elem = pattern(tree)
- if elem:
- before, after = self.do_split(tree, elem[0], before)
- self.trees.append(before)
- tree = after
- self.trees.append(tree)
- self.trees = [t for t in self.trees if not self.is_page_empty(t.getroot())]
+ def split_text(self, text, root, size):
+ self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
+ rest = text.replace('\r', '')
+ parts = re.split('\n\n', rest)
+ self.log.debug('\t\t\t\tFound %d parts'%len(parts))
+ if max(map(len, parts)) > size:
+ raise SplitError('Cannot split as file contains a tag '
+ 'with a very large paragraph', root)
+ ans = []
+ buf = ''
+ for part in parts:
+ if len(buf) + len(part) < size:
+ buf += '\n\n'+part
+ else:
+ ans.append(buf)
+ buf = part
+ return ans
+ def split_to_size(self, tree):
+ self.log.debug('\t\tSplitting...')
+ root = tree.getroot()
+ # Split large tags
+ for pre in list(root.xpath('//pre')):
+ text = u''.join(pre.xpath('descendant::text()'))
+ pre.text = text
+ for child in list(pre.iterchildren()):
+ pre.remove(child)
+ if len(pre.text) > self.max_flow_size*0.5:
+ frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
+ new_pres = []
+ for frag in frags:
+ pre2 = copy.copy(pre)
+ pre2.text = frag
+ pre2.tail = u''
+ new_pres.append(pre2)
+ new_pres[-1].tail = pre.tail
+ p = pre.getparent()
+ i = p.index(pre)
+ p[i:i+1] = new_pres
- def find_page_breaks(self, stylesheets, root):
- '''
- Find all elements that have either page-break-before or page-break-after set.
- Populates `self.page_breaks` with id based XPath selectors (for elements that don't
- have ids, an id is created).
- '''
- page_break_selectors = set([])
- for rule in rules(stylesheets):
- before = getattr(rule.style.getPropertyCSSValue('page-break-before'), 'cssText', '').strip().lower()
- after = getattr(rule.style.getPropertyCSSValue('page-break-after'), 'cssText', '').strip().lower()
- try:
- if before and before != 'avoid':
- page_break_selectors.add((CSSSelector(rule.selectorText), True))
- except:
- pass
- try:
- if after and after != 'avoid':
- page_break_selectors.add((CSSSelector(rule.selectorText), False))
- except:
- pass
-
- page_breaks = set([])
- for selector, before in page_break_selectors:
- for elem in selector(root):
- elem.pb_before = before
- page_breaks.add(elem)
-
- for i, elem in enumerate(root.iter()):
- elem.pb_order = i
-
- page_breaks = list(page_breaks)
- page_breaks.sort(cmp=lambda x,y : cmp(x.pb_order, y.pb_order))
- self.page_break_ids = []
- for i, x in enumerate(page_breaks):
- x.set('id', x.get('id', 'calibre_pb_%d'%i))
- id = x.get('id')
- self.page_breaks.append((XPath('//*[@id="%s"]'%id), x.pb_before))
- self.page_break_ids.append(id)
+ split_point, before = self.find_split_point(root)
+ if split_point is None:
+ raise SplitError(self.item.href, root)
+ for t in self.do_split(tree, split_point, before):
+ r = t.getroot()
+ if self.is_page_empty(r):
+ continue
+ size = len(tostring(r))
+ if size <= self.max_flow_size:
+ self.trees.append(t)
+ #print tostring(t.getroot(), pretty_print=True)
+ self.log.debug('\t\t\tCommitted sub-tree #%d (%d KB)',
+ len(self.trees), size/1024.)
+ self.split_size += size
+ else:
+ self.split_to_size(t)
def find_split_point(self, root):
'''
@@ -336,8 +365,7 @@ def pick_elem(elems):
'//br',
'//li',
):
- elems = root.xpath(path,
- namespaces={'re':'http://exslt.org/regular-expressions'})
+ elems = root.xpath(path, namespaces=NAMESPACES)
elem = pick_elem(elems)
if elem is not None:
try:
@@ -355,6 +383,8 @@ def commit(self):
all anchors in the original tree. Internal links are re-directed. The
original file is deleted and the split files are saved.
'''
+ if not self.was_split:
+ return
self.anchor_map = collections.defaultdict(lambda :self.base%0)
self.files = []
@@ -368,134 +398,46 @@ def commit(self):
elem.attrib.pop(SPLIT_ATTR, None)
elem.attrib.pop(SPLIT_POINT_ATTR, '0')
- for current, tree in zip(self.files, self.trees):
- for a in tree.getroot().xpath('//a[@href]'):
+ spine_pos = self.item.spine_pos
+ for current, tree in zip(*map(reversed, (self.files, self.trees))):
+ for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]
file = self.anchor_map[anchor]
if file != current:
a.set('href', file+href)
- open(content(current), 'wb').\
- write(tostring(tree.getroot(), pretty_print=self.opts.pretty_print))
- os.remove(content(self.path))
+ new_id = self.oeb.manifest.generate(id=self.item.id)[0]
+ new_item = self.oeb.manifest.add(new_id, current,
+ self.item.media_type, data=tree.getroot())
+ self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
+
+ if self.oeb.guide:
+ for ref in self.oeb.guide:
+ href, frag = urldefrag(ref.href)
+ if href == self.item.href:
+ nhref = self.anchor_map[frag if frag else None]
+ if frag:
+ nhref = '#'.join((nhref, frag))
+ ref.href = nhref
+
+ def fix_toc_entry(toc):
+ if toc.href:
+ href, frag = urldefrag(toc.href)
+ if href == self.item.href:
+ nhref = self.anchor_map[frag if frag else None]
+ if frag:
+ nhref = '#'.join((nhref, frag))
+ toc.href = nhref
+ for x in toc:
+ fix_toc_entry(x)
- def fix_opf(self, opf):
- '''
- Fix references to the split file in the OPF.
- '''
- items = [item for item in opf.itermanifest() if item.get('href') == 'content/'+self.path]
- new_items = [('content/'+f, None) for f in self.files]
- id_map = {}
- for item in items:
- id_map[item.get('id')] = opf.replace_manifest_item(item, new_items)
+ if self.oeb.toc:
+ fix_toc_entry(self.oeb.toc)
- for id in id_map.keys():
- opf.replace_spine_items_by_idref(id, id_map[id])
-
- for ref in opf.iterguide():
- href = ref.get('href', '')
- if href.startswith('content/'+self.path):
- href = href.split('#')
- frag = None
- if len(href) > 1:
- frag = href[1]
- if frag not in self.anchor_map:
- self.log_warning('\t\tUnable to re-map OPF link', href)
- continue
- new_file = self.anchor_map[frag]
- ref.set('href', 'content/'+new_file+('' if frag is None else ('#'+frag)))
+ self.oeb.manifest.remove(self.item)
-def fix_content_links(html_files, changes, opts):
- split_files = [f.path for f in changes]
- anchor_maps = [f.anchor_map for f in changes]
- files = list(html_files)
- for j, f in enumerate(split_files):
- try:
- i = files.index(f)
- files[i:i+1] = changes[j].files
- except ValueError:
- continue
-
- for htmlfile in files:
- changed = False
- root = html.fromstring(open(content(htmlfile), 'rb').read())
- for a in root.xpath('//a[@href]'):
- href = a.get('href')
- if not href.startswith('#'):
- href = href.split('#')
- anchor = href[1] if len(href) > 1 else None
- href = href[0]
- if href in split_files:
- try:
- newf = anchor_maps[split_files.index(href)][anchor]
- except:
- print '\t\tUnable to remap HTML link:', href, anchor
- continue
- frag = ('#'+anchor) if anchor else ''
- a.set('href', newf+frag)
- changed = True
-
- if changed:
- open(content(htmlfile), 'wb').write(tostring(root, pretty_print=opts.pretty_print))
-
-def fix_ncx(path, changes):
- split_files = [f.path for f in changes]
- anchor_maps = [f.anchor_map for f in changes]
- tree = etree.parse(path)
- changed = False
- for content in tree.getroot().xpath('//x:content[@src]',
- namespaces={'x':"http://www.daisy.org/z3986/2005/ncx/"}):
- href = content.get('src')
- if not href.startswith('#'):
- href = href.split('#')
- anchor = href[1] if len(href) > 1 else None
- href = href[0].split('/')[-1]
- if href in split_files:
- try:
- newf = anchor_maps[split_files.index(href)][anchor]
- except:
- print 'Unable to remap NCX link:', href, anchor
- frag = ('#'+anchor) if anchor else ''
- content.set('src', 'content/'+newf+frag)
- changed = True
- if changed:
- open(path, 'wb').write(etree.tostring(tree.getroot(), encoding='UTF-8', xml_declaration=True))
-
-def find_html_files(opf):
- '''
- Find all HTML files referenced by `opf`.
- '''
- html_files = []
- for item in opf.itermanifest():
- if 'html' in item.get('media-type', '').lower():
- f = item.get('href').split('/')[-1]
- f2 = f.replace('&', '%26')
- if not os.path.exists(content(f)) and os.path.exists(content(f2)):
- f = f2
- item.set('href', item.get('href').replace('&', '%26'))
- if os.path.exists(content(f)):
- html_files.append(f)
- return html_files
-
-
-def split(pathtoopf, opts, stylesheet_map):
- pathtoopf = os.path.abspath(pathtoopf)
- opf = OPF(open(pathtoopf, 'rb'), os.path.dirname(pathtoopf))
-
- with CurrentDir(os.path.dirname(pathtoopf)):
- html_files = find_html_files(opf)
- changes = [Splitter(f, opts, stylesheet_map, opf) for f in html_files]
- changes = [c for c in changes if c.was_split]
-
- fix_content_links(html_files, changes, opts)
- for item in opf.itermanifest():
- if item.get('media-type', '') == 'application/x-dtbncx+xml':
- fix_ncx(item.get('href'), changes)
- break
-
- open(pathtoopf, 'wb').write(opf.render())