Enable heuristic processing over the entire conversion pipeline when the option is enabled.

2026-05-09 03:53:51 +02:00 · 2011-01-15 12:35:02 -05:00 · 2011-01-15 12:35:02 -05:00 · 64796696ae
commit 64796696ae
parent d6256ef452
9 changed files with 11 additions and 50 deletions
--- a/src/calibre/customize/conversion.py
+++ b/src/calibre/customize/conversion.py
@ -160,18 +160,6 @@ def get_images(self):
        '''
        raise NotImplementedError()

-    def heuristics(self, opts, html):
-        '''
-        This method is called by the conversion pipeline on all HTML before it
-        is parsed. It is meant to be used to do any required preprocessing on
-        the HTML, like removing hard line breaks, etc.
-
-        :param html: A unicode string
-        :return: A unicode string
-        '''
-        return html
-
-
    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        This method must be implemented in sub-classes. It must return
--- a/src/calibre/ebooks/conversion/plumber.py
+++ b/src/calibre/ebooks/conversion/plumber.py
@ -899,7 +899,6 @@ def run(self):
                self.opts_to_mi(self.user_metadata)
            if not hasattr(self.oeb, 'manifest'):
                self.oeb = create_oebbook(self.log, self.oeb, self.opts,
-                        self.input_plugin,
                        encoding=self.input_plugin.output_encoding)
            self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
            self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -1009,14 +1008,13 @@ def run(self):
        self.log(self.output_fmt.upper(), 'output written to', self.output)
        self.flush()

-def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
+def create_oebbook(log, path_or_stream, opts, reader=None,
        encoding='utf-8', populate=True):
    '''
    Create an OEBBook.
    '''
    from calibre.ebooks.oeb.base import OEBBook
-    html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
-            opts.enable_heuristics, opts)
+    html_preprocessor = HTMLPreProcessor(log, opts)
    if not encoding:
        encoding = None
    oeb = OEBBook(log, html_preprocessor,
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
                     (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
                      lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
                     ]
-    def __init__(self, input_plugin_preprocess, plugin_preprocess,
-            extra_opts=None):
-        self.input_plugin_preprocess = input_plugin_preprocess
-        self.plugin_preprocess = plugin_preprocess
+    def __init__(self, log=None, extra_opts=None):
+        self.log = log
        self.extra_opts = extra_opts

    def is_baen(self, src):
@ -542,8 +540,10 @@ def dump(raw, where):
            unidecoder = Unidecoder()
            html = unidecoder.decode(html)

-        if self.plugin_preprocess:
-            html = self.input_plugin_preprocess(self.extra_opts, html)
+        if getattr(self.extra_opts, 'enable_heuristics', False):
+            from calibre.ebooks.conversion.utils import HeuristicProcessor
+            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
+            html = preprocessor(html)

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = self.smarten_punctuation(html)
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -11,7 +11,7 @@
 from calibre.utils.logging import default_log
 from calibre.utils.wordcount import get_wordcount_obj

-class PreProcessor(object):
+class HeuristicProcessor(object):

    def __init__(self, extra_opts=None, log=None):
        self.log = default_log if log is None else log
@ -366,7 +366,7 @@ def cleanup_required(self):


    def __call__(self, html):
-        self.log("*********  Preprocessing HTML  *********")
+        self.log("*********  Heuristic processing HTML  *********")

        # Count the words in the document to estimate how many chapters to look for and whether
        # other types of processing are attempted
--- a/src/calibre/ebooks/html/input.py
+++ b/src/calibre/ebooks/html/input.py
@ -24,7 +24,6 @@
 from calibre import unicode_path
 from calibre.utils.localization import get_lang
 from calibre.utils.filenames import ascii_filename
-from calibre.ebooks.conversion.utils import PreProcessor

 class Link(object):
    '''
@ -485,9 +484,3 @@ def css_import_handler(self, base, href):
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
--- a/src/calibre/ebooks/lit/input.py
+++ b/src/calibre/ebooks/lit/input.py
@ -7,7 +7,7 @@
 __docformat__ = 'restructuredtext en'

 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
+from calibre.ebooks.conversion.utils import HeuristicProcessor


 class LITInput(InputFormatPlugin):
@ -51,10 +51,3 @@ def postprocess_book(self, oeb, opts, log):
                    for elem in body:
                        ne = copy.deepcopy(elem)
                        pre.append(ne)
-
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
--- a/src/calibre/ebooks/lrf/input.py
+++ b/src/calibre/ebooks/lrf/input.py
@ -12,7 +12,6 @@
 from lxml import etree

 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor
 from calibre import guess_type

 class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ def convert(self, stream, options, file_ext, log,
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')
-
-    def heuristics(self, options, html):
-        self.options = options
-        preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
-        return preprocessor(html)
-
-
-
--- a/src/calibre/ebooks/pdb/input.py
+++ b/src/calibre/ebooks/pdb/input.py
@ -9,7 +9,6 @@
 from calibre.customize.conversion import InputFormatPlugin
 from calibre.ebooks.pdb.header import PdbHeaderReader
 from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
-from calibre.ebooks.conversion.utils import PreProcessor

 class PDBInput(InputFormatPlugin):

--- a/src/calibre/ebooks/rtf/input.py
+++ b/src/calibre/ebooks/rtf/input.py
@ -7,7 +7,6 @@
 from lxml import etree

 from calibre.customize.conversion import InputFormatPlugin
-from calibre.ebooks.conversion.utils import PreProcessor

 border_style_map = {
        'single' : 'solid',