def split_text(self, text, root, size):
    '''
    Break ``text`` into fragments of at most ``size`` characters, cutting
    only at blank lines (paragraph boundaries).

    Carriage returns are removed first, then the text is divided into
    paragraphs at every ``\\n\\n``. Consecutive paragraphs are packed
    greedily into a fragment, re-joined with ``\\n\\n``, for as long as the
    fragment stays within ``size``. The separator between two different
    fragments is dropped.

    :param text: The text to split.
    :param root: Passed through to :class:`SplitError` when splitting is
                 impossible.
    :param size: Maximum length of a single fragment.
    :return: A non-empty list of fragments, in document order.
    :raises SplitError: If any single paragraph is longer than ``size``.
    '''
    self.log_debug('\t\t\tSplitting text of length: %d'%len(text))
    sep = '\n\n'
    rest = text.replace('\r', '')
    # str.split always yields at least one element, so max() below is safe
    parts = rest.split(sep)
    self.log_debug('\t\t\t\tFound %d parts'%len(parts))
    if max(map(len, parts)) > size:
        raise SplitError('Cannot split as file contains a <pre> tag with a very large paragraph', root)
    ans = []
    buf = None  # None means "no fragment started yet"; '' is a valid fragment
    for part in parts:
        if buf is None:
            buf = part
        elif len(buf) + len(sep) + len(part) <= size:
            # Count the separator too, so a fragment never exceeds ``size``
            buf += sep + part
        else:
            ans.append(buf)
            buf = part
    # Flush the trailing fragment; without this the end of the text is lost
    if buf is not None:
        ans.append(buf)
    return ans
+
+
def split(self, tree):
'''
Split ``tree`` into a *before* and *after* tree, preserving tag structure,
@@ -81,6 +99,25 @@ def split(self, tree):
'''
self.log_debug('\t\tSplitting...')
root = tree.getroot()
+ # Split large tags
+ for pre in list(root.xpath('//pre')):
+ text = u''.join(pre.xpath('./text()'))
+ pre.text = text
+ for child in list(pre.iterdescendants()):
+ pre.remove(child)
+ if len(pre.text) > self.opts.profile.flow_size*0.5:
+ frags = self.split_text(pre.text, root, int(0.2*self.opts.profile.flow_size))
+ new_pres = []
+ for frag in frags:
+ pre2 = copy.copy(pre)
+ pre2.text = frag
+ pre2.tail = u''
+ new_pres.append(pre2)
+ new_pres[-1].tail = pre.tail
+ p = pre.getparent()
+ i = p.index(pre)
+ p[i:i+1] = new_pres
+
split_point, before = self.find_split_point(root)
if split_point is None or self.split_size > 6*self.orig_size:
if not self.always_remove:
@@ -219,10 +256,21 @@ def pick_elem(elems):
- for path in ('//*[re:match(name(), "h[1-6]", "i")]', '/html/body/div', '//p'):
+ for path in (
+ '//*[re:match(name(), "h[1-6]", "i")]',
+ '/html/body/div',
+ '//pre',
+ '//hr',
+ '//p',
+ '//br',
+ ):
elems = root.xpath(path)
elem = pick_elem(elems)
if elem is not None:
+ try:
+ XPath(elem.getroottree().getpath(elem))
+ except:
+ continue
return elem, True
return None, True
diff --git a/src/calibre/ebooks/html.py b/src/calibre/ebooks/html.py
index f950b946f7..7525c4a07a 100644
--- a/src/calibre/ebooks/html.py
+++ b/src/calibre/ebooks/html.py
@@ -378,7 +378,14 @@ def parse_html(self):
self.log_exception('lxml based parsing failed')
self.root = soupparser.fromstring(src)
head = self.root.xpath('./head')
- self.head = head[0] if head else etree.SubElement(self.root, 'head')
+ if head:
+ head = head[0]
+ else:
+ head = etree.SubElement(self.root, 'head')
+ self.root.remove(head)
+ self.root.insert(0, head)
+
+ self.head = head
self.body = self.root.body
for a in self.root.xpath('//a[@name]'):
a.set('id', a.get('name'))