Enable heuristic processing over the entire conversion pipeline when the option is enabled.

This commit is contained in:
John Schember 2011-01-15 12:35:02 -05:00
parent d6256ef452
commit 64796696ae
9 changed files with 11 additions and 50 deletions

View file

@ -160,18 +160,6 @@ def get_images(self):
'''
raise NotImplementedError()
def heuristics(self, opts, html):
'''
This method is called by the conversion pipeline on all HTML before it
is parsed. It is meant to be used to do any required preprocessing on
the HTML, like removing hard line breaks, etc.
:param html: A unicode string
:return: A unicode string
'''
return html
def convert(self, stream, options, file_ext, log, accelerators):
'''
This method must be implemented in sub-classes. It must return

View file

@ -899,7 +899,6 @@ def run(self):
self.opts_to_mi(self.user_metadata)
if not hasattr(self.oeb, 'manifest'):
self.oeb = create_oebbook(self.log, self.oeb, self.opts,
self.input_plugin,
encoding=self.input_plugin.output_encoding)
self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
self.opts.is_image_collection = self.input_plugin.is_image_collection
@ -1009,14 +1008,13 @@ def run(self):
self.log(self.output_fmt.upper(), 'output written to', self.output)
self.flush()
def create_oebbook(log, path_or_stream, opts, input_plugin, reader=None,
def create_oebbook(log, path_or_stream, opts, reader=None,
encoding='utf-8', populate=True):
'''
Create an OEBBook.
'''
from calibre.ebooks.oeb.base import OEBBook
html_preprocessor = HTMLPreProcessor(input_plugin.heuristics,
opts.enable_heuristics, opts)
html_preprocessor = HTMLPreProcessor(log, opts)
if not encoding:
encoding = None
oeb = OEBBook(log, html_preprocessor,

View file

@ -397,10 +397,8 @@ class HTMLPreProcessor(object):
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
def __init__(self, input_plugin_preprocess, plugin_preprocess,
extra_opts=None):
self.input_plugin_preprocess = input_plugin_preprocess
self.plugin_preprocess = plugin_preprocess
def __init__(self, log=None, extra_opts=None):
self.log = log
self.extra_opts = extra_opts
def is_baen(self, src):
@ -542,8 +540,10 @@ def dump(raw, where):
unidecoder = Unidecoder()
html = unidecoder.decode(html)
if self.plugin_preprocess:
html = self.input_plugin_preprocess(self.extra_opts, html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = self.smarten_punctuation(html)

View file

@ -11,7 +11,7 @@
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
class PreProcessor(object):
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
@ -366,7 +366,7 @@ def cleanup_required(self):
def __call__(self, html):
self.log("********* Preprocessing HTML *********")
self.log("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted

View file

@ -24,7 +24,6 @@
from calibre import unicode_path
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.ebooks.conversion.utils import PreProcessor
class Link(object):
'''
@ -485,9 +484,3 @@ def css_import_handler(self, base, href):
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View file

@ -7,7 +7,7 @@
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre.ebooks.conversion.utils import HeuristicProcessor
class LITInput(InputFormatPlugin):
@ -51,10 +51,3 @@ def postprocess_book(self, oeb, opts, log):
for elem in body:
ne = copy.deepcopy(elem)
pre.append(ne)
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View file

@ -12,7 +12,6 @@
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
from calibre import guess_type
class Canvas(etree.XSLTExtension):
@ -419,11 +418,3 @@ def convert(self, stream, options, file_ext, log,
f.write(result)
styles.write()
return os.path.abspath('content.opf')
def heuristics(self, options, html):
self.options = options
preprocessor = PreProcessor(self.options, log=getattr(self, 'log', None))
return preprocessor(html)

View file

@ -9,7 +9,6 @@
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
from calibre.ebooks.conversion.utils import PreProcessor
class PDBInput(InputFormatPlugin):

View file

@ -7,7 +7,6 @@
from lxml import etree
from calibre.customize.conversion import InputFormatPlugin
from calibre.ebooks.conversion.utils import PreProcessor
border_style_map = {
'single' : 'solid',