diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index 72c067747d..8c8ce8c686 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -155,7 +155,7 @@ def get_images(self): ''' raise NotImplementedError() - def preprocess_html(self, html): + def preprocess_html(self, opts, html): ''' This method is called by the conversion pipeline on all HTML before it is parsed. It is meant to be used to do any required preprocessing on diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index e72e15c3d9..3e5de26766 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -399,7 +399,7 @@ def dump(raw, where): html = unidecoder.decode(html) if self.plugin_preprocess: - html = self.input_plugin_preprocess(html) + html = self.input_plugin_preprocess(self.extra_opts, html) if getattr(self.extra_opts, 'smarten_punctuation', False): html = self.smarten_punctuation(html) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index 3fe6ce0ed4..8588ff65ad 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -11,7 +11,7 @@ class PreProcessor(object): - def __init__(self, log=None, extra_opts=None): + def __init__(self, extra_opts=None, log=None): self.log = default_log if log is None else log self.html_preprocess_sections = 0 self.found_indents = 0 @@ -77,6 +77,32 @@ def no_markup(self, raw, percent): def __call__(self, html): self.log("********* Preprocessing HTML *********") + ###### Check Markup ###### + # + # some lit files don't have any
<p> tags or equivalent (generally just plain text between + #
<pre> tags), check and mark up line endings if required before proceeding
+ if self.no_markup(html, 0.1):
+ self.log("not enough paragraph markers, adding now")
+ # check if content is in pre tags, use txt processor to mark up if so
+ pre = re.compile(r'', re.IGNORECASE)
+ if len(pre.findall(html)) == 1:
+ self.log("Running Text Processing")
+ from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+ separate_paragraphs_single_line
+ outerhtml = re.compile(r'.*?(?<=)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub('\g', html)
+ html = separate_paragraphs_single_line(html)
+ html = preserve_spaces(html)
+ html = convert_basic(html, epub_split_size_kb=0)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there is more than one <pre> tag or
+ # other types of unmarked html, and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('\n', html)
+
+ ###### Mark Indents/Cleanup ######
+ #
# Replace series of non-breaking spaces with text-indent
txtindent = re.compile(ur'
[^>]*)>\s*(?P(]*>\s*)+)?\s*(\u00a0){2,}', re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html)
@@ -86,31 +112,27 @@ def __call__(self, html):
html = re.sub(ur'\u00a0', ' ', html)
# Get rid of empty tags to simplify other processing
html = re.sub(ur'\s*\s* ', ' ', html)
- # Get rid of empty span tags
- html = re.sub(r"\s*]*>\s*", " ", html)
+ # Get rid of empty span, bold, & italics tags
+ html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
+ html = re.sub(r"\s*<[ibu]>\s*(<[ibu]>\s*[ibu]>\s*){0,2}\s*[ibu]>", " ", html)
+ html = re.sub(r"\s*]*>\s*(]>\s*){0,2}\s*\s*", " ", html)
# If more than 40% of the lines are empty paragraphs then delete them to clean up spacing
linereg = re.compile('(?<=)', re.IGNORECASE|re.DOTALL)
- blankreg = re.compile(r'\s*
]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
', re.IGNORECASE)
+ blankreg = re.compile(r'\s*(?P]*>)\s*(?P
)', re.IGNORECASE)
#multi_blank = re.compile(r'(\s*]*>\s*(<(b|i|u)>)?\s*((b|i|u)>)?\s*
){2,}', re.IGNORECASE)
blanklines = blankreg.findall(html)
lines = linereg.findall(html)
if len(lines) > 1:
self.log("There are " + str(len(blanklines)) + " blank lines. " + str(float(len(blanklines)) / float(len(lines))) + " percent blank")
- if float(len(blanklines)) / float(len(lines)) > 0.40:
+ if float(len(blanklines)) / float(len(lines)) > 0.40 and getattr(self.extra_opts,
+ 'remove_paragraph_spacing', False):
self.log("deleting blank lines")
html = blankreg.sub('', html)
# Arrange line feeds and
tags so the line_length and no_markup functions work correctly
html = re.sub(r"\s*", "\n", html)
html = re.sub(r"\s*\s*", "\n
", html)
-
- # some lit files don't have any
<p> tags or equivalent (generally just plain text between
- #
<pre> tags), check and mark up line endings if required before proceeding
- if self.no_markup(html, 0.1):
- self.log("not enough paragraph markers, adding now")
- add_markup = re.compile('(?)(\n)')
- html = add_markup.sub('\n', html)
-
+ #self.log("\n\n\n\n\n\n\n\n\n\n\n"+html+"\n\n\n\n\n\n\n\n\n\n\n\n\n")
# detect chapters/sections to match xpath or splitting logic
heading = re.compile(']*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
@@ -118,7 +140,7 @@ def __call__(self, html):
#
# Start with most typical chapter headings, get more aggressive until one works
if self.html_preprocess_sections < 10:
- chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}s*(]*>)?\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*()?s*( )?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE)
+ chapdetect = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(Introduction|Synopsis|Acknowledgements|Chapter|Epilogue|Volume|Prologue|Book\s|Part\s|Dedication)\s*([\d\w-]+\:?\s*){0,8}\s*([ibu]>){0,2})\s*( )?s*([ibu]>){0,2}\s*()?\s*((p|/?br)>)\s*\s*(\s*]*>\s*
){0,2}\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.IGNORECASE|re.VERBOSE)
html = chapdetect.sub(self.chapter_head, html)
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying numeric chapters")
@@ -127,10 +149,10 @@ def __call__(self, html):
if self.html_preprocess_sections < 10:
self.log("not enough chapters, only " + str(self.html_preprocess_sections) + ", trying with uppercase words")
- chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?(([A-Z#-]+\s*){1,9})\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
+ chapdetect2 = re.compile(r'(?=?(br|p))(<(/?br|p)[^>]*>)\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}\s*.?([A-Z#\-\s]+)\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((p|/?br)>)\s*(<(/?br|p)[^>]*>\s*(<[ibu]>){0,2}\s*(]*>)?\s*(?P(<[ibu]>){0,2}(\s*[\w\'\"-]+){1,5}\s*([ibu]>){0,2})\s*( )?\s*([ibu]>){0,2}\s*((br|p)>))?', re.UNICODE)
html = chapdetect2.sub(self.chapter_head, html)
- # Unwrap lines
+ ###### Unwrap lines ######
#
self.log("Unwrapping Lines")
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
@@ -149,9 +171,9 @@ def __call__(self, html):
format = 'html'
# Calculate Length
- length = line_length('pdf', html, getattr(self.extra_opts,
+ length = line_length(format, html, getattr(self.extra_opts,
'html_unwrap_factor', 0.4))
- self.log("*** Median line length is " + str(length) + ",calculated with " + format + " format ***")
+ self.log("*** Median line length is " + str(length) + ", calculated with " + format + " format ***")
#
# Unwrap and/or delete soft-hyphens, hyphens
html = re.sub(u'\s*(\s*([iubp]>\s*<[iubp][^>]*>\s*)?]*>|[iubp]>\s*<[iubp][^>]*>)?\s*', '', html)
@@ -164,13 +186,15 @@ def __call__(self, html):
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < 10:
self.log("Looking for more split points based on punctuation, currently have " + str(self.html_preprocess_sections))
- #self.log(html)
chapdetect3 = re.compile(r'<(?P(p|div)[^>]*)>\s*(?P(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*(<[ibu]>){0,2}\s*(]*>)?\s*.?([a-z#-*]+\s*){1,5}\s*\s*()?([ibu]>){0,2}\s*()?\s*([ibu]>){0,2}\s*()?\s*(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+? \s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?) ', re.IGNORECASE)
- html = doubleheading.sub('\g'+''+'
', html)
+ html = doubleheading.sub('\g'+'\n'+'
', html)
+
+ # put back non-breaking spaces in empty paragraphs to preserve original formatting
+ html = blankreg.sub('\n'+'\g'+' '+'\g', html)
return html
diff --git a/src/calibre/ebooks/html/input.py b/src/calibre/ebooks/html/input.py
index 084d48e54b..603adadb53 100644
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@@ -490,7 +490,8 @@ def css_import_handler(self, base, href):
return (None, None)
return (None, raw)
- def preprocess_html(self, html):
- preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ def preprocess_html(self, options, html):
+ self.options = options
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
diff --git a/src/calibre/ebooks/lit/input.py b/src/calibre/ebooks/lit/input.py
index 65f5c607a2..46a5e75977 100644
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@@ -53,7 +53,8 @@ def postprocess_book(self, oeb, opts, log):
pre.append(ne)
- def preprocess_html(self, html):
- preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ def preprocess_html(self, options, html):
+ self.options = options
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
diff --git a/src/calibre/ebooks/lrf/input.py b/src/calibre/ebooks/lrf/input.py
index c54f3b071f..70529c0a04 100644
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@@ -420,8 +420,9 @@ def convert(self, stream, options, file_ext, log,
styles.write()
return os.path.abspath('content.opf')
- def preprocess_html(self, html):
- preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ def preprocess_html(self, options, html):
+ self.options = options
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)
diff --git a/src/calibre/ebooks/mobi/input.py b/src/calibre/ebooks/mobi/input.py
index b8dc7a9560..9ab7996a74 100644
--- a/src/calibre/ebooks/mobi/input.py
+++ b/src/calibre/ebooks/mobi/input.py
@@ -39,11 +39,11 @@ def convert(self, stream, options, file_ext, log,
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path
- def preprocess_html(self, html):
+ def preprocess_html(self, options, html):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(r'(?P]*>.+? \s*(<(?!h\d)[^>]*>\s*)*)[^>]*>.+?) ', re.IGNORECASE)
- html = doubleheading.sub('\g'+''+'
', html)
+ html = doubleheading.sub('\g'+'\n'+'
', html)
return html
diff --git a/src/calibre/ebooks/rtf/input.py b/src/calibre/ebooks/rtf/input.py
index 000c603c1c..078b30627f 100644
--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@@ -229,7 +229,7 @@ def convert(self, stream, options, file_ext, log,
res = transform.tostring(result)
res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
if self.options.preprocess_html:
- preprocessor = PreProcessor(log=getattr(self, 'log', None))
+ preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
res = preprocessor(res)
f.write(res)
self.write_inline_css(inline_class)