Migrate html2lrf, web2lrf to the Python logging framework

This commit is contained in:
Kovid Goyal 2007-08-12 21:59:36 +00:00
parent 1f24807b87
commit 0b68623f86
4 changed files with 98 additions and 99 deletions

View file

@ -28,6 +28,9 @@
except:
pass
class CommandLineError(Exception):
    """Raised to signal invalid command-line usage to the CLI entry points."""
    pass
def setup_cli_handlers(logger, level):
logger.setLevel(level)
if level == logging.WARNING:

View file

@ -20,7 +20,7 @@
I am indebted to esperanc for the initial CSS->Xylog Style conversion code
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy, glob
import os, re, sys, shutil, copy, glob, logging
from htmlentitydefs import name2codepoint
from urllib import unquote
from urlparse import urlparse
@ -43,7 +43,7 @@
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks import ConversionError
from libprs500.ebooks.lrf.html.table import Table
from libprs500 import extract, filename_to_utf8
from libprs500 import extract, filename_to_utf8, setup_cli_handlers
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
@ -84,7 +84,7 @@ def unit_convert(val, dpi, ref=80):
return result
@staticmethod
def translate_attrs(d, dpi, fonts, font_delta=0, memory=None):
def translate_attrs(d, dpi, fonts, logger, font_delta=0, memory=None):
"""
Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary
@ -211,20 +211,20 @@ def font_size(val):
else:
memory.append(key)
if report:
print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
logger.info('Unhandled/malformed CSS key: %s: %s', key, d[key])
t['fontfacename'] = (family, font_key(family, style, weight))
if t.has_key('fontsize') and int(t['fontsize']) > 120:
t['wordspace'] = 50
return t
def __init__(self, ns, css, memory, dpi, fonts, font_delta=0, normal_font_size=100):
def __init__(self, ns, css, memory, dpi, fonts, logger, font_delta=0, normal_font_size=100):
src = ns.string if hasattr(ns, 'string') else ns
src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
for pat, repl in Span.rules:
src = pat.sub(repl, src)
if not src:
raise ConversionError('No point in adding an empty string to a Span')
attrs = Span.translate_attrs(css, dpi, fonts, font_delta=font_delta, memory=memory)
attrs = Span.translate_attrs(css, dpi, fonts, logger, font_delta=font_delta, memory=memory)
if 'fontsize' in attrs.keys():
normal_font_size = int(attrs['fontsize'])
variant = attrs.pop('fontvariant', None)
@ -323,7 +323,7 @@ def __setattr__(self, attr, val):
else:
object.__setattr__(self, attr, val)
def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
def __init__(self, book, fonts, path, options, logger, link_level=0, is_root=True):
'''
Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and
@ -356,7 +356,8 @@ def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
th = {'font-size' : 'large', 'font-weight':'bold'},
big = {'font-size' : 'large', 'font-weight':'bold'},
)
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
self.css['.libprs500_dropcaps'] = {'font-size': 'xx-large'}
self.logger = logger
self.fonts = fonts #: dict specifting font families to use
self.scaled_images = {} #: Temporary files with scaled version of images
self.rotated_images = {} #: Temporary files with rotated version of images
@ -385,8 +386,7 @@ def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path)
print "Processing", self.file_name
print '\tParsing HTML...',
self.logger.info('Processing %s\n\tParsing HTML...', self.file_name)
sys.stdout.flush()
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
@ -400,7 +400,7 @@ def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
self.soup = BeautifulSoup(raw,
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...',
logger.info('\tConverting to BBeB...')
sys.stdout.flush()
self.current_page = None
self.current_para = None
@ -411,7 +411,6 @@ def __init__(self, book, fonts, path, options, link_level=0, is_root=True):
self.page_break_found = True
self.parse_file()
HTMLConverter.processed_files[path] = self
print 'done'
def parse_css(self, style):
"""
@ -554,8 +553,8 @@ def add_toc_entry(text, target):
if target.parent != None and \
hasattr(target.parent, 'objId'):
self.book.addTocEntry(ascii_text, tb)
elif self.verbose:
print "Cannot add link", ascii_text, "to TOC"
else:
self.logger.debug("Cannot add link %s to TOC", ascii_text)
def get_target_block(fragment, targets):
@ -624,21 +623,21 @@ def get_target_block(fragment, targets):
if not os.access(path.encode('utf8', 'replace'), os.R_OK):
continue
except Exception:
if self.verbose:
print "Skipping", link
self.logger.exception('Skipping %s', link)
continue
path = os.path.abspath(path)
if not path in HTMLConverter.processed_files.keys():
try:
self.files[path] = HTMLConverter(
self.book, self.fonts, path, self.options,
self.logger,
link_level = self.link_level+1,
is_root = False,)
HTMLConverter.processed_files[path] = self.files[path]
except Exception:
print >>sys.stderr, 'Unable to process', path
self.logger.warning('Unable to process %s', path)
if self.verbose:
traceback.print_exc()
self.logger.exception('')
continue
finally:
os.chdir(cwd)
@ -759,12 +758,12 @@ def add_text(self, tag, css):
else:
self.process_alignment(css)
try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
self.profile.dpi, self.fonts, font_delta=self.font_delta))
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,
self.profile.dpi, self.fonts, self.logger,
font_delta=self.font_delta))
self.current_para.normalize_spaces()
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err
except ConversionError:
self.logger.exception('Bad text')
def sanctify_css(self, css):
""" Return a copy of C{css} that is safe for use in a SPAM Xylog tag """
@ -809,7 +808,7 @@ def process_image(self, path, tag_css, width=None, height=None, dropcaps=False):
try:
im = PILImage.open(path)
except IOError, err:
print >>sys.stderr, 'Unable to process:', path, err
self.logger.warning('Unable to process image: %s\n%s', path, err)
return
@ -826,7 +825,7 @@ def scale_image(width, height):
self.scaled_images[path] = pt
return pt.name
except IOError: # PIL chokes on interlaced PNG images
print >>sys.stderr, 'Unable to process interlaced PNG', path
self.logger.warning('Unable to process interlaced PNG %s', path)
return None
pheight = int(self.current_page.pageStyle.attrs['textheight'])
@ -863,10 +862,8 @@ def scale_image(width, height):
path = pt.name
self.rotated_images[path] = pt
width, height = im.size
except IOError, err: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
if self.verbose:
print >>sys.stderr, 'Unable to autorotate interlaced PNG', path
print >>sys.stderr, err
except IOError: # PIL chokes on interlaced PNG files and since auto-rotation is not critical we ignore the error
self.logger.debug('Unable to process interlaced PNG %s', path)
finally:
pt.close()
@ -945,8 +942,7 @@ def process_page_breaks(self, tag, tagname, tag_css):
if not self.page_break_found and self.page_break.match(tagname):
if len(self.current_page.contents) > 3:
self.end_page()
if self.verbose:
print 'Forcing page break at', tagname
self.logger.debug('Forcing page break at %s', tagname)
return end_page
def parse_tag(self, tag, parent_css):
@ -1048,8 +1044,7 @@ def parse_tag(self, tag, parent_css):
dropcaps = tag.has_key('class') and tag['class'] == 'libprs500_dropcaps'
self.process_image(path, tag_css, width, height, dropcaps=dropcaps)
else:
if self.verbose:
print >>sys.stderr, "Failed to process:", tag
self.logger.debug("Failed to process: %s", str(tag))
elif tagname in ['style', 'link']:
def update_css(ncss):
for key in ncss.keys():
@ -1083,7 +1078,8 @@ def update_css(ncss):
c.replaceWith(self.get_text(c))
self.end_current_para()
self.current_block.append_to(self.current_page)
attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts, self.font_delta, self.memory)
attrs = Span.translate_attrs(tag_css, self.profile.dpi, self.fonts,
self.logger, self.font_delta, self.memory)
attrs['fontfacename'] = self.fonts['mono']['normal'][1]
ts = self.book.create_text_style(**self.unindented_style.attrs)
ts.attrs.update(attrs)
@ -1185,8 +1181,7 @@ def update_css(ncss):
src = self.get_text(tag, limit=1000)
if self.chapter_detection and tagname.startswith('h'):
if self.chapter_regex.search(src):
if self.verbose:
print 'Detected chapter', src
self.logger.debug('Detected chapter %s', src)
self.end_page()
self.page_break_found = True
self.end_current_para()
@ -1241,9 +1236,8 @@ def update_css(ncss):
try:
self.process_table(tag, tag_css)
except Exception, err:
print 'WARNING: An error occurred while processing a table:', err
print 'Ignoring table markup for table:'
print str(tag)[:300]
self.logger.warning('An error occurred while processing a table: %s', str(err))
self.logger.warning('Ignoring table markup for table:\n%s', str(tag)[:300])
self.in_table = False
self.process_children(tag, tag_css)
else:
@ -1275,16 +1269,20 @@ def cleanup(self):
for _file in self.scaled_images.values() + self.rotated_images.values():
_file.__del__()
def process_file(path, options):
def process_file(path, options, logger=None):
if re.match('http://|https://', path):
raise ConversionError, 'You have to save the website %s as an html file first and then run html2lrf on it.'%(path,)
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('html2lrf')
setup_cli_handlers(logger, level)
cwd = os.getcwd()
dirpath = None
default_title = filename_to_utf8(os.path.splitext(os.path.basename(path))[0])
try:
dirpath, path = get_path(path)
cpath, tpath = '', ''
try_opf(path, options)
try_opf(path, options, logger)
if options.cover:
options.cover = os.path.abspath(os.path.expanduser(options.cover))
cpath = options.cover
@ -1347,7 +1345,7 @@ def process_file(path, options):
fpba = ['$', '', '$']
options.force_page_break_attr = [re.compile(fpba[0], re.IGNORECASE), fpba[1],
re.compile(fpba[2], re.IGNORECASE)]
conv = HTMLConverter(book, fonts, path, options)
conv = HTMLConverter(book, fonts, path, options, logger)
conv.process_links()
oname = options.output
if not oname:
@ -1356,7 +1354,7 @@ def process_file(path, options):
oname = os.path.join(cwd,name)
oname = os.path.abspath(os.path.expanduser(oname))
conv.writeto(oname, lrs=options.lrs)
print 'Output written to', oname
logger.info('Output written to %s', oname)
conv.cleanup()
return oname
finally:
@ -1364,7 +1362,7 @@ def process_file(path, options):
if dirpath:
shutil.rmtree(dirpath, True)
def try_opf(path, options):
def try_opf(path, options, logger):
try:
opf = glob.glob(os.path.join(os.path.dirname(path),'*.opf'))[0]
except IndexError:
@ -1419,12 +1417,9 @@ def try_opf(path, options):
if not os.access(options.cover, os.R_OK):
options.cover = None
except:
if options.verbose:
traceback.print_exc()
except Exception, err:
if options.verbose:
print >>sys.stderr, 'Failed to process opf file', err
pass
logger.exception('Could not load cover')
except Exception:
logger.exception('Failed to process opf file')
def option_parser():
return lrf_option_parser('''Usage: %prog [options] mybook.[html|rar|zip]\n\n'''

View file

@ -14,14 +14,13 @@
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''Convert known websites into LRF files.'''
import sys, time, tempfile, shutil, os
import sys, time, tempfile, shutil, os, logging
from urlparse import urlsplit
from libprs500 import __appname__
from libprs500 import __appname__, setup_cli_handlers, CommandLineError
from libprs500.ebooks.lrf import option_parser as lrf_option_parser
from libprs500.ebooks.lrf.html.convert_from import process_file
from libprs500.ebooks.lrf.web.profiles import profiles
from libprs500.web.fetch.simple import setup_logger as web2disk_setup_logger
from libprs500.web.fetch.simple import create_fetcher
available_profiles = profiles.keys()
@ -57,14 +56,14 @@ def option_parser():
help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
return parser
def fetch_website(options, logger):
    """Download the site at ``options.url`` into a fresh temporary directory.

    Returns a tuple ``(index_path, tdir)`` where ``index_path`` is the path to
    the fetched entry page and ``tdir`` the temporary directory holding the
    fetched files. The caller is responsible for removing ``tdir`` (the
    surrounding code does so with ``shutil.rmtree``).
    """
    # NOTE: the visible span contained both the pre- and post-change lines of
    # this function from the diff; this is the consolidated post-change form.
    # Unique per-run directory so concurrent invocations do not collide.
    tdir = tempfile.mkdtemp(prefix=__appname__+'_')
    options.dir = tdir
    fetcher = create_fetcher(options, logger)
    # Profile-supplied regexps applied to each fetched page before saving
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir
def create_lrf(htmlfile, options):
def create_lrf(htmlfile, options, logger):
if not options.author:
options.author = __appname__
options.header = True
@ -73,20 +72,16 @@ def create_lrf(htmlfile, options):
else:
options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
process_file(htmlfile, options)
process_file(htmlfile, options, logger)
def main(args=sys.argv):
parser = option_parser()
options, args = parser.parse_args(args)
web2disk_setup_logger(options)
if len(args) > 2:
parser.print_help()
return 1
def process_profile(args, options, logger=None):
if logger is None:
level = logging.DEBUG if options.verbose else logging.INFO
logger = logging.getLogger('web2lrf')
setup_cli_handlers(logger, level)
if len(args) == 2:
if not profiles.has_key(args[1]):
print >>sys.stderr, 'Unknown profile', args[1]
print >>sys.stderr, 'Valid profiles:', profiles.keys()
return 1
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], profiles.keys()))
profile = profiles[args[1]] if len(args) == 2 else profiles['default']
if profile.has_key('initialize'):
@ -98,11 +93,7 @@ def main(args=sys.argv):
setattr(options, opt, profile[opt])
if not options.url:
parser.print_help()
print >>sys.stderr
print >>sys.stderr, 'You must specify the --url option or a profile from one of:',
print >>sys.stderr, available_profiles
return 1
raise CommandLineError('You must specify the --url option or a profile from one of: %s', available_profiles)
if not options.title:
title = profile['title']
@ -114,12 +105,24 @@ def main(args=sys.argv):
options.preprocess_regexps = profile['preprocess_regexps']
options.filter_regexps += profile['filter_regexps']
htmlfile, tdir = fetch_website(options)
create_lrf(htmlfile, options)
htmlfile, tdir = fetch_website(options, logger)
create_lrf(htmlfile, options, logger)
if profile.has_key('finalize'):
profile['finalize'](profile)
shutil.rmtree(tdir)
# Command-line entry point for web2lrf: parse arguments, then delegate the
# actual work to process_profile, translating usage errors into messages.
def main(args=sys.argv):
parser = option_parser()
options, args = parser.parse_args(args)
# At most one positional argument (a profile name) is accepted
if len(args) > 2:
parser.print_help()
return 1
try:
process_profile(args, options)
except CommandLineError, err:
# Usage errors are reported to the user rather than shown as a traceback
print >>sys.stderr, err
# NOTE(review): indentation was lost in this extraction, so it is unclear
# whether this `return 0` belongs to the except branch (in which case a
# nonzero exit status would be expected) or is the function-level success
# return after the try/except -- confirm against the repository.
return 0
if __name__ == '__main__':

View file

@ -23,8 +23,6 @@
from libprs500 import __version__, __appname__, __author__, setup_cli_handlers
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
logger = logging.getLogger('libprs500.web.fetch.simple')
class FetchError(Exception):
pass
@ -52,7 +50,8 @@ class RecursiveFetcher(object):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
def __init__(self, options):
def __init__(self, options, logger):
self.logger = logger
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
os.makedirs(self.base_dir)
@ -80,7 +79,7 @@ def get_soup(self, src):
def fetch_url(self, url):
f = None
logger.info('Fetching %s', url)
self.logger.debug('Fetching %s', url)
delta = time.time() - self.last_fetch_at
if delta < self.delay:
time.sleep(delta)
@ -138,8 +137,8 @@ def process_stylesheets(self, soup, baseurl):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch stylesheet %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@ -160,8 +159,8 @@ def process_stylesheets(self, soup, baseurl):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch stylesheet %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
@ -179,7 +178,7 @@ def process_images(self, soup, baseurl):
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl, ext = tag['src'], os.path.splitext(tag['src'])[1]
if not ext:
logger.info('Skipping extensionless image %s', iurl)
self.logger.debug('Skipping extensionless image %s', iurl)
continue
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
@ -189,8 +188,8 @@ def process_images(self, soup, baseurl):
try:
f = self.fetch_url(iurl)
except Exception, err:
logger.warning('Could not fetch image %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch image %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
imgpath = os.path.join(diskpath, 'img'+str(c)+ext)
@ -206,7 +205,7 @@ def absurl(self, baseurl, tag, key):
if not parts.scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if not self.is_link_ok(iurl):
logger.info('Skipping invalid link: %s', iurl)
self.logger.debug('Skipping invalid link: %s', iurl)
return None
return iurl
@ -258,7 +257,7 @@ def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
self.current_dir = linkdiskpath
f = self.fetch_url(iurl)
soup = self.get_soup(f.read())
logger.info('Processing images...')
self.logger.debug('Processing images...')
self.process_images(soup, f.geturl())
if self.download_stylesheets:
self.process_stylesheets(soup, f.geturl())
@ -266,17 +265,17 @@ def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
res = os.path.join(linkdiskpath, basename(iurl))
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
logger.info('Processing links...')
self.logger.debug('Processing links...')
self.process_links(soup, iurl, recursion_level+1)
else:
self.process_return_links(soup, iurl)
logger.info('Recursion limit reached. Skipping %s', iurl)
self.logger.debug('Recursion limit reached. Skipping %s', iurl)
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
logger.warning('Could not fetch link %s', iurl)
logger.debug('Error: %s', str(err), exc_info=True)
self.logger.warning('Could not fetch link %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
finally:
self.current_dir = diskpath
self.files += 1
@ -313,12 +312,12 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
return parser
def create_fetcher(options, logger=None):
    """Return a :class:`RecursiveFetcher` configured from ``options``.

    If ``logger`` is None, a 'web2disk' logger is created and wired to the
    CLI handlers, at DEBUG level when ``options.verbose`` is set and INFO
    otherwise. Passing an explicit logger lets callers (e.g. web2lrf) share
    their own logging configuration.
    """
    # NOTE: the visible span contained the removed setup_logger()/old
    # create_fetcher plus the new definition; this is the consolidated
    # post-change form.
    if logger is None:
        level = logging.DEBUG if options.verbose else logging.INFO
        logger = logging.getLogger('web2disk')
        setup_cli_handlers(logger, level)
    return RecursiveFetcher(options, logger)
def main(args=sys.argv):
parser = option_parser()
@ -327,7 +326,6 @@ def main(args=sys.argv):
parser.print_help()
return 1
setup_logger(options)
fetcher = create_fetcher(options)
fetcher.start_fetch(args[1])