diff --git a/src/calibre/customize/conversion.py b/src/calibre/customize/conversion.py index ec83600a49..a9e573ffa0 100644 --- a/src/calibre/customize/conversion.py +++ b/src/calibre/customize/conversion.py @@ -160,7 +160,7 @@ def get_images(self): ''' raise NotImplementedError() - def preprocess_html(self, opts, html): + def heuristics(self, opts, html): ''' This method is called by the conversion pipeline on all HTML before it is parsed. It is meant to be used to do any required preprocessing on diff --git a/src/calibre/ebooks/conversion/plumber.py b/src/calibre/ebooks/conversion/plumber.py index 50d0646c7d..a40c17a743 100644 --- a/src/calibre/ebooks/conversion/plumber.py +++ b/src/calibre/ebooks/conversion/plumber.py @@ -1012,8 +1012,8 @@ def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None, Create an OEBBook. ''' from calibre.ebooks.oeb.base import OEBBook - html_preprocessor = HTMLPreProcessor(input_plugin.preprocess_html, - opts.preprocess_html, opts) + html_preprocessor = HTMLPreProcessor(input_plugin.heuristics, + opts.enable_heuristics, opts) if not encoding: encoding = None oeb = OEBBook(log, html_preprocessor, diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index dac93fa2e2..44d4235b6c 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -113,6 +113,11 @@ def get_word_count(self, html): return wordcount.words def markup_chapters(self, html, wordcount, blanks_between_paragraphs): + ''' + Searches for common chapter headings throughout the document + attempts multiple patterns based on likelihood of a match + with minimum false positives. 
Exits after finding a successful pattern + ''' # Typical chapters are between 2000 and 7000 words, use the larger number to decide the # minimum of chapters to search for self.min_chapters = 1 @@ -185,6 +190,10 @@ def markup_chapters(self, html, wordcount, blanks_between_paragraphs): return html def punctuation_unwrap(self, length, content, format): + ''' + Unwraps lines based on line length and punctuation + supports range of potential html markup and text files + ''' # define the pieces of the regex lookahead = "(?<=.{"+str(length)+"}([a-zäëïöüàèìòùáćéíóńśúâêîôûçąężıãõñæøþðß,:)\IA\u00DF]|(?\s*((p|span|div)>)?" @@ -201,53 +210,38 @@ def punctuation_unwrap(self, length, content, format): return content - def __call__(self, html): - self.log("********* Preprocessing HTML *********") + def text_process_pre(self, html): + pre = re.compile(r'
', re.IGNORECASE)
+ if len(pre.findall(html)) == 1:
+ self.log("Running Text Processing")
+ from calibre.ebooks.txt.processor import convert_basic, preserve_spaces, \
+ separate_paragraphs_single_line
+ outerhtml = re.compile(r'.*?(?<=)(?P.*)(?=
).*', re.IGNORECASE|re.DOTALL)
+ html = outerhtml.sub('\g', html)
+ html = separate_paragraphs_single_line(html)
+ html = preserve_spaces(html)
+ html = convert_basic(html, epub_split_size_kb=0)
+ else:
+ # Add markup naively
+ # TODO - find out if there are cases where there is more than one <pre> tag or
+ # other types of unmarked html, and handle them in some better fashion
+ add_markup = re.compile('(?)(\n)')
+ html = add_markup.sub('\n', html)
+ return html
- # Count the words in the document to estimate how many chapters to look for and whether
- # other types of processing are attempted
- totalwords = 0
- totalwords = self.get_word_count(html)
-
- if totalwords < 50:
- self.log("not enough text, not preprocessing")
- return html
-
- # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
+ def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*(?Pp|div)>", ""+"\g"+">\n", html)
html = re.sub(r"\s*<(?Pp|div)(?P