calibre/src/libprs500/lrf/html/convert_from.py
2007-05-11 20:05:38 +00:00

1130 lines
No EOL
50 KiB
Python

## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
## This work is based on htmlbbeb created by esperanc.
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Code to convert HTML ebooks into LRF ebooks.
I am indebted to esperanc for the CSS->Xylog Style conversion routines
and to Falstaff for pylrs.
"""
import os, re, sys, shutil, traceback, copy
from htmlentitydefs import name2codepoint
from urllib import urlopen, unquote
from urlparse import urlparse
from tempfile import mkdtemp
from operator import itemgetter
from math import ceil, floor
try:
from PIL import Image as PILImage
except ImportError:
import Image as PILImage
from libprs500.lrf.html.BeautifulSoup import BeautifulSoup, Comment, Tag, \
NavigableString, Declaration, ProcessingInstruction
from libprs500.lrf.pylrs.pylrs import Paragraph, CR, Italic, ImageStream, TextBlock, \
ImageBlock, JumpButton, CharButton, \
Bold, Space, Plot, Image, BlockSpace,\
RuledLine, BookSetting
from libprs500.lrf.pylrs.pylrs import Span as _Span
from libprs500.lrf import ConversionError, option_parser, Book
from libprs500 import extract
from libprs500.ptempfile import PersistentTemporaryFile
class Span(_Span):
# A pylrs Span whose text has been cleaned up and whose CSS properties have
# been translated into Span attributes (see __init__).
# Names of the HTML entities that __init__ replaces with their character
replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo', 'nbsp' ]
# One compiled '&name;' pattern per entry of replaced_entities, in the same order
patterns = [ re.compile('&'+i+';') for i in replaced_entities ]
# The unicode character that each entry of replaced_entities stands for
targets = [ unichr(name2codepoint[i]) for i in replaced_entities ]
# (pattern, replacement) pairs, applied one after the other by __init__.
# NOTE(review): 'amp' comes first, so "&amp;lt;" is turned into "&lt;" and
# then into "<" -- confirm that this double unescaping is intended.
rules = zip(patterns, targets)
@staticmethod
def unit_convert(val, ref=80):
"""
Tries to convert html units stored in C{val} to pixels. C{ref} contains
the reference value for relative units. Returns the number of pixels
(an int) if successful. Otherwise, returns None.
Assumes: 1 pixel is 1/4 mm. One em is 10pts
"""
result = None
m = re.match("\s*(-*[0-9]*\.?[0-9]*)\s*(%|em|px|mm|cm|in|pt|pc)", val)
if m is not None:
unit = float(m.group(1))
if m.group(2) == '%':
result = int(unit/100.0*ref)
elif m.group(2) == 'px':
result = int(unit)
elif m.group(2) == 'in':
result = int(unit * 25.4 * 4)
elif m.group(2) == 'pt':
result = int(unit * 25.4 * 4 / 72)
elif m.group(2)== 'em':
result = int(unit * 25.4 * 4 / 72 * 10)
elif m.group(2)== 'pc':
result = int(unit * 25.4 * 4 / 72 * 12)
elif m.group(2)== 'mm':
result = int(unit * 4)
elif m.group(2)== 'cm':
result = int(unit * 10 * 4)
return result
@staticmethod
def translate_attrs(d, font_delta=0, memory=None):
"""
Receives a dictionary of html attributes and styles and returns
approximate Xylog equivalents in a new dictionary
"""
def font_weight(val):
ans = None
m = re.search("([0-9]+)", val)
if m:
ans = str(int(m.group(1)))
elif val.find("bold") >= 0 or val.find("strong") >= 0:
ans = "1000"
return ans
def font_family(val):
ans = None
if max(val.find("courier"), val.find("mono"), val.find("fixed"), val.find("typewriter"))>=0:
ans = "Courier10 BT Roman"
elif max(val.find("arial"), val.find("helvetica"), val.find("verdana"),
val.find("trebuchet"), val.find("sans")) >= 0:
ans = "Swis721 BT Roman"
return ans
def font_size(val):
ans = None
unit = Span.unit_convert(val, 14)
if unit:
# Assume a 10 pt font (14 pixels) has fontsize 100
ans = int (unit / 14.0 * 100)
else:
if "xx-small" in val:
ans = 40
elif "x-small" in val >= 0:
ans = 60
elif "small" in val:
ans = 80
elif "xx-large" in val:
ans = 180
elif "x-large" in val >= 0:
ans = 140
elif "large" in val >= 0:
ans = 120
if ans is not None:
ans += font_delta * 20
ans = str(ans)
return ans
t = dict()
for key in d.keys():
val = d[key].lower()
if key == 'font':
val = val.split()
val.reverse()
for sval in val:
ans = font_family(sval)
if ans:
t['fontfacename'] = ans
else:
ans = font_size(sval)
if ans:
t['fontsize'] = ans
else:
ans = font_weight(sval)
if ans:
t['fontweight'] = ans
elif key in ['font-family', 'font-name']:
ans = font_family(val)
if ans:
t['fontfacename'] = ans
elif key == "font-size":
ans = font_size(val)
if ans:
t['fontsize'] = ans
elif key == 'font-weight':
ans = font_weight(val)
if ans:
t['fontweight'] = ans
if int(ans) > 1400:
t['wordspace'] = '50'
elif key.startswith("margin"):
if key == "margin":
u = []
for x in val.split(" "):
u.append(Span.unit_convert (x,200)*2)
if len(u)==1:
u = [u[0], u[0], u[0], u[0]]
elif len(u)==2:
u = [u[0], u[1], u[0], u[1]]
elif len(u)==3:
u = [u[0], u[1], u[2], u[1]]
elif key == "margin-top":
u = [Span.unit_convert(val, 200)*2, None, None, None]
elif key == "margin-right":
u = [None, Span.unit_convert(val, 200)*2, None, None]
elif key == "margin-bottom":
u = [None, None, Span.unit_convert(val, 200)*2, None]
else:
u = [None, None, None, Span.unit_convert(val, 200)*2]
if u[2] is not None:
t["parskip"] = str(u[2])
t["footskip"] = str(u[2])
if u[0] is not None:
t["topskip"] = str(u[0])
if u[1] is not None:
t["sidemargin"] = str(u[1])
else:
report = True
if memory != None:
if key in memory:
report = False
else:
memory.append(key)
if report:
print >>sys.stderr, 'Unhandled/malformed CSS key:', key, d[key]
return t
def __init__(self, ns, css, memory, font_delta=0):
    """
    Create a Span from a piece of text and the CSS that applies to it.

    @param ns: The text, or an object whose C{string} attribute is the text
    @param css: Dictionary of CSS properties. It is not modified.
    @param memory: Passed to L{translate_attrs} so that unhandled
    properties are reported only once
    @param font_delta: Passed to L{translate_attrs}
    @raise ConversionError: If no text is left after cleaning it up
    """
    src = ns.string if hasattr(ns, 'string') else ns
    src = re.sub(r'\s{2,}', ' ', src) # Remove multiple spaces
    for pat, repl in Span.rules:
        src = pat.sub(repl, src)
    if not src:
        raise ConversionError('No point in adding an empty string to a Span')
    # Work on a copy: callers reuse the same dictionary for several Spans
    # (one per line of a <pre> element), so removing font-style from their
    # dictionary would make every Span after the first lose its italics.
    css = dict(css)
    fs = css.pop('font-style', None)
    if fs is not None and fs.lower() == 'italic':
        src = Italic(src)
    attrs = Span.translate_attrs(css, font_delta=font_delta, memory=memory)
    _Span.__init__(self, text=src, **attrs)
class HTMLConverter(object):
# Converts one HTML file into pages of a BBeB book; files it links to are
# converted by further instances created in process_links.
# Matches "selector(s) { properties }"; group 1 holds the selector list and
# group 2 the text between the braces (used by parse_css)
SELECTOR_PAT = re.compile(r"([A-Za-z0-9\-\_\:\.]+[A-Za-z0-9\-\_\:\.\s\,]*)\s*\{([^\}]*)\}")
# Node types that produce no output
IGNORED_TAGS = (Comment, Declaration, ProcessingInstruction)
# Fix <a /> elements
MARKUP_MASSAGE = [(re.compile("(<\s*[aA]\s+.*\/)\s*>"),
lambda match: match.group(1)+"></a>")]
# Fix Baen markup
# The first rule deletes empty <a id="pN" name="pN"></a> anchors, the second
# removes page-break-before declarations while keeping the delimiter that follows
BAEN_SANCTIFY = [(re.compile(r'<\s*[Aa]\s+id="p[0-9]+"\s+name="p[0-9]+"\s*>\s*<\/[Aa]>'),
lambda match: ''),
(re.compile(r'page-break-before:\s*\w+([\s;\}])'),
lambda match: match.group(1)) ]
class Link(object):
# Pairs an <a href> tag with the object that currently stands in for it in
# the output, so that process_links can replace it later
def __init__(self, para, tag):
self.para = para
self.tag = tag
# Shared by all instances: maps the absolute path of a converted file to
# the converter that processed it
processed_files = {} #: Files that have been processed
def __init__(self, book, path, dpi=166, width=575, height=747,
font_delta=0, verbose=False, cover=None,
max_link_levels=sys.maxint, link_level=0,
is_root=True, baen=False, chapter_detection=True,
chapter_regex=re.compile('chapter|book|appendix', re.IGNORECASE),
link_exclude=re.compile('')):
'''
Convert HTML file at C{path} and add it to C{book}. After creating
the object, you must call L{self.process_links} on it to create the links and
then L{self.writeto} to output the LRF/S file.
@param book: The LRF book
@type book: L{libprs500.lrf.pylrs.Book}
@param path: path to the HTML file to process
@type path: C{str}
@param width: Width of the device on which the LRF file is to be read
@type width: C{int}
@param height: Height of the device on which the LRF file is to be read
@type height: C{int}
@param font_delta: The amount in pts by which all fonts should be changed
@type font_delta: C{int}
@param verbose: Whether processing should be verbose or not
@type verbose: C{bool}
@param cover: Path to an image to use as the cover of this book
@type cover: C{str}
@param max_link_levels: Number of link levels to process recursively
@type max_link_levels: C{int}
@param link_level: Current link level
@type link_level: C{int}
@param is_root: True iff this object is converting the root HTML file
@type is_root: C{bool}
@param chapter_detection: Insert page breaks before what looks like
the start of a chapter
@type chapter_detection: C{bool}
@param chapter_regex: The compiled regular expression used to search for chapter titles
@param link_exclude: Compiled regex. Matching hrefs are ignored.
'''
# Defaults for various formatting tags
self.css = dict(
h1 = {"font-size" :"xx-large", "font-weight":"bold", 'text-indent':'0pt'},
h2 = {"font-size" :"x-large", "font-weight":"bold", 'text-indent':'0pt'},
h3 = {"font-size" :"large", "font-weight":"bold", 'text-indent':'0pt'},
h4 = {"font-size" :"large", 'text-indent':'0pt'},
h5 = {"font-weight" :"bold", 'text-indent':'0pt'},
b = {"font-weight" :"bold"},
strong = {"font-weight" :"bold"},
i = {"font-style" :"italic"},
em = {"font-style" :"italic"},
small = {'font-size' :'small'},
pre = {'font-family' :'monospace' },
center = {'text-align' : 'center'}
)
self.page_width = width #: The width of the page
self.page_height = height #: The height of the page
self.dpi = dpi #: The DPI of the intended display device
self.chapter_detection = chapter_detection #: Flag to toggle chapter detection
self.chapter_regex = chapter_regex #: Regex used to search for chapter titles
self.link_exclude = link_exclude #: Ignore matching hrefs
self.scaled_images = {} #: Temporary files with scaled version of images
self.max_link_levels = max_link_levels #: Number of link levels to process recursively
self.link_level = link_level #: Current link level
self.blockquote_style = book.create_block_style(sidemargin=60,
topskip=20, footskip=20)
self.unindented_style = book.create_text_style(parindent=0)
self.text_styles = []#: Keep track of already used textstyles
self.block_styles = []#: Keep track of already used blockstyles
self.images = {} #: Images referenced in the HTML document
self.targets = {} #: <a name=...> elements
self.links = [] #: <a href=...> elements
self.files = {} #: links that point to other files
self.links_processed = False #: Whether links_processed has been called on this object
self.font_delta = font_delta
self.cover = cover
self.memory = [] #: Used to ensure that duplicate CSS unhandled erros are not reported
self.in_ol = False #: Flag indicating we're in an <ol> element
self.book = book #: The Book object representing a BBeB book
self.is_root = is_root #: Are we converting the root HTML file
self.lstrip_toggle = False #; If true the next add_text call will do an lstrip
path = os.path.abspath(path)
os.chdir(os.path.dirname(path))
self.file_name = os.path.basename(path)
print "Processing", self.file_name
print '\tParsing HTML...',
sys.stdout.flush()
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(HTMLConverter.MARKUP_MASSAGE)
self.baen = baen
if baen:
nmassage.extend(HTMLConverter.BAEN_SANCTIFY)
self.soup = BeautifulSoup(open(self.file_name, 'r').read(),
convertEntities=BeautifulSoup.HTML_ENTITIES,
markupMassage=nmassage)
print 'done\n\tConverting to BBeB...',
sys.stdout.flush()
self.verbose = verbose
self.current_page = None
self.current_para = None
self.current_style = {}
self.parse_file()
HTMLConverter.processed_files[path] = self
print 'done'
def parse_css(self, style):
"""
Parse the contents of a <style> tag or .css file.
@param style: C{str(style)} should be the CSS to parse.
@return: A dictionary with one entry per selector where the key is the
selector name and the value is a dictionary of properties
"""
sdict = dict()
style = re.sub('/\*.*?\*/', '', style) # Remove /*...*/ comments
for sel in re.findall(HTMLConverter.SELECTOR_PAT, style):
for key in sel[0].split(','):
key = key.strip().lower()
val = self.parse_style_properties(sel[1])
if key in sdict:
sdict[key].update(val)
else:
sdict[key] = val
return sdict
def parse_style_properties(self, props):
    """
    Split a list of C{name: value} declarations separated by semicolons,
    as found inside a CSS selector block or in the style attribute of an
    HTML element.
    @return: A dictionary mapping each lowercased property name to its
    value. Declarations without a colon are ignored.
    """
    parsed = {}
    for declaration in props.split(';'):
        name, colon, value = declaration.partition(':')
        if not colon:
            continue
        parsed[str(name.strip()).lower()] = value.strip()
    return parsed
def tag_css(self, tag, parent_css={}):
"""
Return a dictionary of style properties applicable to Tag tag.
"""
def merge_parent_css(prop, pcss):
temp = {}
for key in pcss.keys():
chk = key.lower()
if chk.startswith('font') or chk == 'text-align':
temp[key] = pcss[key]
prop.update(temp)
prop = dict()
if tag.has_key("align"):
prop["text-align"] = tag["align"]
if self.css.has_key(tag.name):
prop.update(self.css[tag.name])
if tag.has_key("class"):
cls = tag["class"].lower()
for classname in ["."+cls, tag.name+"."+cls]:
if self.css.has_key(classname):
prop.update(self.css[classname])
if parent_css:
merge_parent_css(prop, parent_css)
if tag.has_key("style"):
prop.update(self.parse_style_properties(tag["style"]))
return prop
def parse_file(self):
# Convert the document in self.soup into content of self.book and store in
# self.top the block that stands for the beginning of this file.
# NOTE(review): the indentation of this file was lost; the nesting implied
# by the comments below is inferred from the statement order -- confirm.
# Last page of the book before anything from this file is added
previous = self.book.last_page()
self.current_page = self.book.create_page()
self.current_block = self.book.create_text_block()
self.current_para = Paragraph()
if self.cover:
self.add_image_page(self.cover)
self.top = self.current_block
self.process_children(self.soup, {})
# Flush whatever paragraph, block and page are still open
if self.current_para and self.current_block:
self.current_para.append_to(self.current_block)
if self.current_block and self.current_page:
self.current_block.append_to(self.current_page)
if self.current_page and self.current_page.has_text():
self.book.append(self.current_page)
# The block remembered in self.top has no parent: fall back to the first
# item of the first page (or of the first page that follows `previous`)
if not self.top.parent:
if not previous:
self.top = self.book.pages()[0].contents[0]
else:
found = False
for page in self.book.pages():
if page == previous:
found = True
continue
if found:
self.top = page.contents[0]
break
if not self.top.parent:
raise ConversionError, 'Could not parse ' + self.file_name
def get_text(self, tag):
    """
    Return the text contained in C{tag} and, recursively, in its child
    tags. The result is the empty string if the CSS of C{tag} sets
    C{display} to C{none}.
    """
    css = self.tag_css(tag)
    if css.has_key('display') and css['display'].lower() == 'none':
        return ''
    text = ''
    for c in tag.contents:
        # Comments, declarations and processing instructions contribute no
        # text. Skip them instead of returning, which would throw away the
        # text collected so far and everything that follows.
        if isinstance(c, HTMLConverter.IGNORED_TAGS):
            continue
        if isinstance(c, NavigableString):
            text += str(c)
        elif isinstance(c, Tag):
            text += self.get_text(c)
    return text
def process_links(self):
# Replace the placeholder of every recorded <a href> by a CharButton that
# jumps to its target, converting linked local files on the way, and then
# do the same for the converters of those files.
# NOTE(review): the indentation of this file was lost; the nesting implied
# by the comments below is inferred from the statement order -- confirm.
def get_target_block(fragment, targets):
'''Return the correct block for the <a name> element'''
bs = targets[fragment]
if not isinstance(bs, BlockSpace):
return bs
# The target is a BlockSpace placeholder: look for the first text or image
# block that follows it on its page
ans, found, page = None, False, bs.parent
for item in page.contents:
if found:
if isinstance(item, (TextBlock, ImageBlock)):
ans = item
break
if item == bs:
found = True
continue
# Nothing follows: use the last text or image block of the page
if not ans:
for i in range(len(page.contents)-1, -1, -1):
if isinstance(page.contents[i], (TextBlock, ImageBlock)):
ans = page.contents[i]
break
# The page has no such block at all: create one holding a single space
if not ans:
ntb = self.book.create_text_block()
ntb.Paragraph(' ')
page.append(ntb)
ans = ntb
# Remember the replacement and drop the placeholder from the page
if found:
targets[fragment] = ans
page.contents.remove(bs)
return ans
# Converting a linked file changes the working directory; remember ours
cwd = os.getcwd()
for link in self.links:
purl = urlparse(link.tag['href'])
if purl[1]: # Not a link to a file on the local filesystem
continue
path, fragment = unquote(purl[2]), purl[5]
para, tag = link.para, link.tag
# Link into this very file
if not path or os.path.basename(path) == self.file_name:
if fragment in self.targets.keys():
tb = get_target_block(fragment, self.targets)
if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb)
sys.stdout.flush()
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag))
para.contents = []
para.append(cb)
# Link to another file, followed only while the link level allows it
elif self.link_level < self.max_link_levels:
if not os.access(path, os.R_OK):
if self.verbose:
print "Skipping", link
continue
path = os.path.abspath(path)
# Convert the file unless some converter has already processed it
if not path in HTMLConverter.processed_files.keys():
try:
self.files[path] = HTMLConverter(self.book, path,
width=self.page_width, height=self.page_height,
dpi=self.dpi,
font_delta=self.font_delta, verbose=self.verbose,
link_level=self.link_level+1,
max_link_levels=self.max_link_levels,
is_root = False, baen=self.baen,
chapter_detection=self.chapter_detection,
chapter_regex=self.chapter_regex,
link_exclude=self.link_exclude)
HTMLConverter.processed_files[path] = self.files[path]
except Exception:
print >>sys.stderr, 'Unable to process', path
if self.verbose:
traceback.print_exc()
continue
finally:
os.chdir(cwd)
else:
self.files[path] = HTMLConverter.processed_files[path]
conv = self.files[path]
# Jump to the named target if the file has it, else to the file's top block
if fragment in conv.targets.keys():
tb = get_target_block(fragment, conv.targets)
else:
tb = conv.top
if self.is_root:
self.book.addTocEntry(self.get_text(tag), tb)
jb = JumpButton(tb)
self.book.append(jb)
cb = CharButton(jb, text=self.get_text(tag))
para.contents = []
para.append(cb)
self.links_processed = True
# Process the links of the linked files, each from within its own directory
for path in self.files.keys():
if self.files[path].links_processed:
continue
try:
os.chdir(os.path.dirname(path))
self.files[path].process_links()
finally:
os.chdir(cwd)
def end_page(self):
"""
End the current page, ensuring that any further content is displayed
on a new page.
"""
# Close the open paragraph and block, replacing each by a fresh one
self.current_para.append_to(self.current_block)
self.current_para = Paragraph()
self.current_block.append_to(self.current_page)
self.current_block = self.book.create_text_block()
# Only a page that has text is added to the book.
# NOTE(review): the file's indentation was lost, so it cannot be told from
# here whether the new page is created only in that case -- confirm.
if self.current_page.has_text():
self.book.append(self.current_page)
self.current_page = self.book.create_page()
def add_image_page(self, path):
    """
    End the current page and append to the book a page without margins
    that shows only the image at C{path}. Nothing happens if C{path} is
    not readable.
    """
    if not os.access(path, os.R_OK):
        return
    self.end_page()
    image_page = self.book.create_page(evensidemargin=0, oddsidemargin=0,
                                       topmargin=0,
                                       textwidth=self.page_width,
                                       textheight=self.page_height)
    # Every image file is wrapped in an ImageStream only once
    if path not in self.images:
        self.images[path] = ImageStream(path)
    stream = self.images[path]
    image_page.append(ImageBlock(stream))
    self.book.append(image_page)
def process_children(self, ptag, pcss):
    """
    Convert every child of C{ptag}: tags are handed to L{parse_tag} and
    strings to L{add_text}, both together with C{pcss}. Children of one of
    the L{IGNORED_TAGS} types are skipped.
    """
    for child in ptag.contents:
        if isinstance(child, HTMLConverter.IGNORED_TAGS):
            continue
        if isinstance(child, Tag):
            self.parse_tag(child, pcss)
            continue
        if isinstance(child, NavigableString):
            self.add_text(child, pcss)
def process_alignment(self, css):
'''
Create a new TextBlock only if necessary as indicated by css
@type css: dict
'''
align = 'head'
if css.has_key('text-align'):
val = css['text-align']
if val in ["right", "foot"]:
align = "foot"
elif val == "center":
align = "center"
if align != self.current_block.textStyle.attrs['align']:
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
ts.attrs['align'] = align
try:
index = self.text_styles.index(ts)
ts = self.text_styles[index]
except ValueError:
self.text_styles.append(ts)
self.current_block = self.book.create_text_block(
blockStyle=self.current_block.blockStyle,
textStyle=ts)
self.current_para = Paragraph()
def add_text(self, tag, css):
'''
Add text to the current paragraph taking CSS into account.
@param tag: Either a BeautifulSoup tag or a string
@param css:
@type css:
'''
src = tag.string if hasattr(tag, 'string') else tag
if self.lstrip_toggle:
src = src.lstrip()
self.lstrip_toggle = False
if not src.strip():
self.current_para.append(' ')
else:
self.process_alignment(css)
try:
self.current_para.append(Span(src, self.sanctify_css(css), self.memory,\
font_delta=self.font_delta))
except ConversionError, err:
if self.verbose:
print >>sys.stderr, err
def sanctify_css(self, css):
    """
    Return a copy of C{css} from which all properties that must not be
    turned into Span attributes have been removed. C{css} itself is left
    untouched.
    """
    prefixes = ('margin', 'text', 'mso', 'background')
    fragments = ('padding', 'border', 'page-break')
    exact = ('color', 'display', 'letter-spacing', 'font-variant')

    def unsafe(name):
        name = name.lower()
        if name.startswith(prefixes):
            return True
        for fragment in fragments:
            if fragment in name:
                return True
        return name in exact

    cleaned = copy.copy(css)
    for key in [k for k in cleaned.keys() if unsafe(k)]:
        del cleaned[key]
    return cleaned
def end_current_para(self):
'''
End current paragraph with a paragraph break after it. If the current
paragraph has no non whitespace text in it do nothing.
'''
if not self.current_para.has_text():
return
# Move the paragraph into the block and continue with an empty one
if self.current_para.contents:
self.current_block.append(self.current_para)
self.current_para = Paragraph()
# Add the break, unless the last item of the block already is a CR
if self.current_block.contents and \
not isinstance(self.current_block.contents[-1], CR):
self.current_block.append(CR())
def end_current_block(self):
    """
    Close the current paragraph and block and continue with an empty
    paragraph in a new block that has the same text and block styles.
    """
    finished = self.current_block
    self.current_para.append_to(finished)
    finished.append_to(self.current_page)
    self.current_para = Paragraph()
    self.current_block = self.book.create_text_block(
                                textStyle=finished.textStyle,
                                blockStyle=finished.blockStyle)
def parse_tag(self, tag, parent_css):
# Convert one node of the parsed document, dispatching on its lowercased
# tag name. parent_css holds the CSS properties of the enclosing element.
# NOTE(review): the indentation of this file was lost; the nesting implied
# by the comments below is inferred from the statement order -- confirm.
try:
tagname = tag.name.lower()
except AttributeError:
# No usable name attribute: treat the node as text unless it is ignored
if not isinstance(tag, HTMLConverter.IGNORED_TAGS):
self.add_text(tag, parent_css)
return
tag_css = self.tag_css(tag, parent_css=parent_css)
try: # Skip element if its display attribute is set to none
if tag_css['display'].lower() == 'none':
return
except KeyError:
pass
# The page-break properties are acted upon here and removed from tag_css
if 'page-break-before' in tag_css.keys():
if tag_css['page-break-before'].lower() != 'avoid':
self.end_page()
tag_css.pop('page-break-before')
end_page = False
if 'page-break-after' in tag_css.keys() and \
tag_css['page-break-after'].lower() != 'avoid':
end_page = True
tag_css.pop('page-break-after')
# These elements and their contents produce no output
if tagname in ["title", "script", "meta", 'del', 'frameset']:
pass
elif tagname == 'a' and self.max_link_levels >= 0:
# <a name=...>: convert the children and record in self.targets the block
# that links to this name should jump to
if tag.has_key('name'):
previous = self.current_block
self.process_children(tag, tag_css)
target = None
if self.current_block == previous:
self.current_para.append_to(self.current_block)
self.current_para = Paragraph()
if self.current_block.has_text():
target = self.current_block
else:
# BlockSpace placeholder; process_links exchanges it for a real block
target = BlockSpace()
self.current_page.append(target)
else:
# The children started a new block: look for the page item after `previous`
found = False
for item in self.current_page.contents:
if item == previous:
found = True
continue
if found:
target = item
break
if target and not isinstance(target, (TextBlock, ImageBlock)):
if isinstance(target, RuledLine):
target = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
target.Paragraph(' ')
self.current_page.append(target)
else:
target = BlockSpace()
self.current_page.append(target)
if target == None:
if self.current_block.has_text():
target = self.current_block
else:
target = BlockSpace()
self.current_page.append(target)
self.targets[tag['name']] = target
# <a href=...> whose href is not matched by link_exclude
elif tag.has_key('href') and not self.link_exclude.match(tag['href']):
purl = urlparse(tag['href'])
path = purl[2]
# Links to image files become a page showing the image
if path and os.path.splitext(path)[1][1:].lower() in \
['png', 'jpg', 'bmp', 'jpeg']:
self.add_image_page(path)
else:
# Add placeholder text and record the link for process_links
self.add_text('Link: '+tag['href'], tag_css)
self.links.append(HTMLConverter.Link(self.current_para.contents[-1], tag))
elif tagname == 'img':
if tag.has_key('src') and os.access(unquote(tag['src']), os.R_OK):
path = os.path.abspath(unquote(tag['src']))
# Use the scaled copy if this image has been scaled before
if self.scaled_images.has_key(path):
path = self.scaled_images[path].name
im = PILImage.open(path)
width, height = im.size
# The width and height attributes, if both are integers, override the real size
try:
width = int(tag['width'])
height = int(tag['height'])
except:
pass
def scale_image(width, height):
# Save a resized copy as a temporary PNG file and return its name
pt = PersistentTemporaryFile(suffix='.png')
im.resize((int(width), int(height)), PILImage.ANTIALIAS).save(pt, 'PNG')
pt.close()
self.scaled_images[path] = pt
return pt.name
# Shrink images larger than the page, keeping the aspect ratio
if height > self.page_height:
corrf = self.page_height/(1.*height)
width, height = floor(corrf*width), self.page_height-1
if width > self.page_width:
corrf = (self.page_width)/(1.*width)
width, height = self.page_width-1, floor(corrf*height)
path = scale_image(width, height)
if width > self.page_width:
corrf = self.page_width/(1.*width)
width, height = self.page_width-1, floor(corrf*height)
if height > self.page_height:
corrf = (self.page_height)/(1.*height)
width, height = floor(corrf*width), self.page_height-1
path = scale_image(width, height)
width, height = int(width), int(height)
if not self.images.has_key(path):
self.images[path] = ImageStream(path)
# Scale factor applied to the pixel sizes passed to Plot below
factor = 720./self.dpi
# Small image: plotted inside the current paragraph
if max(width, height) <= min(self.page_width, self.page_height)/5.:
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
xsize=width, ysize=height)
self.current_para.append(Plot(im, xsize=ceil(width*factor),
ysize=ceil(height*factor)))
# Medium image: plotted in a paragraph and block of its own
elif height <= self.page_height/1.5:
pb = self.current_block
self.end_current_para()
self.process_alignment(tag_css)
im = Image(self.images[path], x0=0, y0=0, x1=width, y1=height,\
xsize=width, ysize=height)
self.current_para.append(Plot(im, xsize=width*factor,
ysize=height*factor))
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
self.current_block = self.book.create_text_block(
textStyle=pb.textStyle,
blockStyle=pb.blockStyle)
self.current_para = Paragraph()
# Large image: added to the page as an ImageBlock
else:
self.current_block.append(self.current_para)
self.current_page.append(self.current_block)
self.current_para = Paragraph()
self.current_block = self.book.create_text_block(textStyle=self.current_block.textStyle,
blockStyle=self.current_block.blockStyle)
im = ImageBlock(self.images[path], x1=width, y1=height,
xsize=width, ysize=height)
self.current_page.append(im)
else:
print >>sys.stderr, "Failed to process:", tag
# Stylesheets: merge the parsed rules into self.css
elif tagname in ['style', 'link']:
def update_css(ncss):
for key in ncss.keys():
if self.css.has_key(key):
self.css[key].update(ncss[key])
else:
self.css[key] = ncss[key]
ncss = {}
if tagname == 'style':
for c in tag.contents:
if isinstance(c, NavigableString):
ncss.update(self.parse_css(str(c)))
elif tag.has_key('type') and tag['type'] == "text/css" \
and tag.has_key('href'):
url = tag['href']
# A stylesheet that cannot be read (IOError) is silently ignored
try:
if url.startswith('http://'):
f = urlopen(url)
else:
f = open(unquote(url))
ncss = self.parse_css(f.read())
f.close()
except IOError:
pass
if ncss:
update_css(ncss)
# Preformatted text: one Span followed by a line break per source line,
# in a block that uses the unindented text style
elif tagname == 'pre':
self.end_current_para()
self.current_block.append_to(self.current_page)
self.current_block = self.book.create_text_block(
blockStyle=self.current_block.blockStyle,
textStyle=self.unindented_style)
src = ''.join([str(i) for i in tag.contents])
lines = src.split('\n')
for line in lines:
try:
self.current_para.append(Span(line, tag_css, self.memory))
self.current_para.CR()
except ConversionError:
pass
self.end_current_block()
# Lists: self.in_ol is 0 for <ul>; for <ol> it holds the number of the next item
elif tagname in ['ul', 'ol']:
self.in_ol = 1 if tagname == 'ol' else 0
self.end_current_block()
self.current_block = self.book.create_text_block(
blockStyle=self.current_block.blockStyle,
textStyle=self.unindented_style)
self.process_children(tag, tag_css)
self.in_ol = 0
self.end_current_block()
elif tagname == 'li':
# Items are prefixed by their number or, outside <ol>, by a bullet
prepend = str(self.in_ol)+'. ' if self.in_ol else u'\u2022' + ' '
if self.current_para.has_text():
self.current_para.append(CR())
self.current_block.append(self.current_para)
self.current_para = Paragraph()
self.current_para.append(Space(xsize=100))
self.current_para.append(prepend)
self.process_children(tag, tag_css)
if self.in_ol:
self.in_ol += 1
# Block quotes get a block of their own with side margin and no paragraph
# indent; afterwards a block with the previous styles is started
elif tagname == 'blockquote':
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
pb = self.current_block
self.current_para = Paragraph()
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
ts.attrs['parindent'] = 0
try:
index = self.text_styles.index(ts)
ts = self.text_styles[index]
except ValueError:
self.text_styles.append(ts)
bs = self.book.create_block_style(**self.current_block.blockStyle.attrs)
bs.attrs['sidemargin'], bs.attrs['topskip'], bs.attrs['footskip'] = \
60, 20, 20
try:
index = self.block_styles.index(bs)
bs = self.block_styles[index]
except ValueError:
self.block_styles.append(bs)
self.current_block = self.book.create_text_block(
blockStyle=bs, textStyle=ts)
self.process_children(tag, tag_css)
self.current_para.append_to(self.current_block)
self.current_block.append_to(self.current_page)
self.current_para = Paragraph()
self.current_block = self.book.create_text_block(textStyle=pb.textStyle,
blockStyle=pb.blockStyle)
elif tagname in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
# Start a new page before a heading whose text matches chapter_regex
if self.chapter_detection and tagname.startswith('h'):
src = self.get_text(tag)
if self.chapter_regex.search(src):
if self.verbose:
print 'Detected chapter', src
self.end_page()
self.end_current_para()
# Leading whitespace of the first text of the element is dropped by add_text
self.lstrip_toggle = True
if tag_css.has_key('text-indent'):
indent = Span.unit_convert(tag_css['text-indent'])
if not indent:
indent=0
else:
indent = self.book.defaultTextStyle.attrs['parindent']
# A different paragraph indent requires a new block with its own text style
if indent != self.current_block.textStyle.attrs['parindent']:
self.current_block.append_to(self.current_page)
ts = self.book.create_text_style(**self.current_block.textStyle.attrs)
ts.attrs['parindent'] = indent
try:
index = self.text_styles.index(ts)
ts = self.text_styles[index]
except ValueError:
self.text_styles.append(ts)
self.current_block = self.book.create_text_block(blockStyle=self.current_block.blockStyle,
textStyle=ts)
self.process_children(tag, tag_css)
self.end_current_para()
if tagname.startswith('h'):
self.current_block.append(CR())
# Inline elements: their effect comes solely from the CSS in tag_css
elif tagname in ['b', 'strong', 'i', 'em', 'span']:
self.process_children(tag, tag_css)
elif tagname == 'font':
if tag.has_key('face'):
tag_css['font-family'] = tag['face']
self.process_children(tag, tag_css)
elif tagname in ['br', 'tr']:
self.current_para.append(CR())
self.process_children(tag, tag_css)
elif tagname == 'hr':
self.end_current_para()
self.current_block.append(CR())
self.end_current_block()
self.current_page.RuledLine(linelength=self.page_width)
# Any other element: just convert its children
else:
self.process_children(tag, tag_css)
# Honour a page-break-after that was found above
if end_page:
self.end_page()
def writeto(self, path, lrs=False):
self.book.renderLrs(path) if lrs else self.book.renderLrf(path)
def cleanup(self):
# Dispose of the temporary files that hold scaled images by calling the
# __del__ method of each of them directly.
# NOTE(review): presumably __del__ deletes the file from disk -- verify
# against PersistentTemporaryFile.
for _file in self.scaled_images.values():
_file.__del__()
def process_file(path, options):
    """
    Convert the HTML file or archive at C{path} and write the result to
    C{options.output} or, if that is not set, to a file in the current
    directory that is named after the converted HTML file.

    The working directory is restored and the directory an archive was
    extracted to is removed, whether or not the conversion succeeds.

    @param path: Path to an HTML file or to an archive containing one
    @param options: The parsed command line options, see L{main}
    """
    cwd = os.getcwd()
    dirpath = None
    try:
        dirpath, path = get_path(path)
        cpath, tpath = options.cover, ''
        if options.cover and os.access(options.cover, os.R_OK):
            try:
                from libprs500.prs500 import PRS500
                im = PILImage.open(os.path.join(cwd, cpath))
                # Cover scaled to the size of the screen
                cim = im.resize((600, 800), PILImage.BICUBIC)
                cf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                cf.close()
                cim.save(cf.name)
                cpath = cf.name
                th = PRS500.THUMBNAIL_HEIGHT
                tim = im.resize((int(0.75*th), th), PILImage.ANTIALIAS)
                tf = PersistentTemporaryFile(prefix="html2lrf_", suffix=".jpg")
                tf.close()
                tim.save(tf.name)
                tpath = tf.name
            except ImportError:
                # Both parts of the message belong to the same statement;
                # as two statements the second part was never printed
                sys.stderr.write("WARNING: You don't have PIL installed. "
                                 "Cover and thumbnails wont work\n")
        title = (options.title, options.title_sort)
        author = (options.author, options.author_sort)
        args = dict(font_delta=options.font_delta, title=title, \
                    author=author, sourceencoding='utf8',\
                    freetext=options.freetext, category=options.category,
                    booksetting=BookSetting(dpi=10*options.dpi,screenheight=800,
                                            screenwidth=600))
        if tpath:
            args['thumbnail'] = tpath
        header = None
        if options.header:
            header = Paragraph()
            header.append(Bold(options.title))
            header.append(' by ')
            header.append(Italic(options.author))
        book = Book(header=header, **args)
        conv = HTMLConverter(book, path, dpi=options.dpi,
                             font_delta=options.font_delta,
                             cover=cpath, max_link_levels=options.link_levels,
                             baen=options.baen,
                             chapter_detection=options.chapter_detection,
                             chapter_regex=re.compile(options.chapter_regex, re.IGNORECASE),
                             link_exclude=re.compile(options.link_exclude))
        conv.process_links()
        oname = options.output
        if not oname:
            suffix = '.lrs' if options.lrs else '.lrf'
            name = os.path.splitext(os.path.basename(path))[0] + suffix
            oname = os.path.join(cwd,name)
        oname = os.path.abspath(os.path.expanduser(oname))
        conv.writeto(oname, lrs=options.lrs)
        sys.stdout.write('Output written to %s\n' % oname)
        conv.cleanup()
    finally:
        # The converter changes the working directory
        os.chdir(cwd)
        if dirpath:
            shutil.rmtree(dirpath, True)
def main():
    """
    CLI for html -> lrf conversions

    Parses the command line, which must name exactly one input file, and
    converts that file with L{process_file}. The title defaults to the
    name of the input file without its extension.
    """
    parser = option_parser("""usage: %prog [options] mybook.[html|rar|zip]
%prog converts mybook.html to mybook.lrf""")
    parser.add_option('--cover', action='store', dest='cover', default=None, \
                      help='Path to file containing image to be used as cover')
    parser.add_option('--lrs', action='store_true', dest='lrs', \
                      help='Convert to LRS', default=False)
    parser.add_option('--font-delta', action='store', type='int', default=0, \
                      help="""Increase the font size by 2 * FONT_DELTA pts.
If FONT_DELTA is negative, the font size is decreased.""",
                      dest='font_delta')
    parser.add_option('--link-levels', action='store', type='int', default=sys.maxint, \
                      dest='link_levels',
                      help=r'''The maximum number of levels to recursively process '''
                           '''links. A value of 0 means that links are not followed. '''
                           '''A negative value means that <a> tags are ignored.''')
    parser.add_option('--baen', action='store_true', default=False, dest='baen',
                      help='''Preprocess Baen HTML files to improve generated LRF.''')
    parser.add_option('--dpi', action='store', type='int', default=166, dest='dpi',
                      help='''The DPI of the target device. Default is 166 for the
Sony PRS 500''')
    parser.add_option('--disable-chapter-detection', action='store_false',
                      default=True, dest='chapter_detection',
                      help='''Prevent html2lrf from automatically inserting page breaks '''
                           '''before what it thinks are chapters.''')
    parser.add_option('--chapter-regex', dest='chapter_regex',
                      default='chapter|book|appendix',
                      help='''The regular expression used to detect chapter titles. '''
                           '''It is searched for in heading tags. Default is chapter|book|appendix''')
    # The expression is applied with match(). An empty expression matches
    # every href and would make the converter ignore all links, so the
    # default only matches an empty href.
    parser.add_option('--link-exclude', dest='link_exclude', default='$',
                      help='''A regular expression. <a> tags whose href '''
                           '''matches will be ignored''')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.print_help()
        sys.exit(1)
    src = args[0]
    if options.title is None:
        options.title = os.path.splitext(os.path.basename(src))[0]
    process_file(src, options)
def console_query(dirpath, candidate, docs):
    """
    Ask the user on the console which of C{docs} should be converted.

    @param dirpath: Not used
    @param candidate: Index of the suggested entry; it is chosen when the
    user enters an empty line
    @param docs: Sequence of tuples whose first item is the file name
    @return: The index of the chosen entry. The user is not asked and 0 is
    returned if there is only one entry.

    The program exits if the input ends or the user interrupts it.
    """
    if len(docs) == 1:
        return 0
    try:
        # Imported only for its side effect on raw_input (line editing)
        import readline
    except ImportError:
        pass
    count = len(docs)
    for i, doc in enumerate(docs):
        prefix = '>' if i == candidate else ''
        sys.stdout.write('%s%d.\t%s\n' % (prefix, i, doc[0]))
    sys.stdout.write('\n')
    while True:
        try:
            choice = raw_input('Choose file to convert (0-'+str(count-1) + \
                               '). Current choice is ['+ str(candidate) + ']:')
        except (EOFError, KeyboardInterrupt):
            # Without the parentheses only EOFError would be caught
            sys.exit()
        if not choice:
            return candidate
        try:
            choice = int(choice)
        except ValueError:
            # Not a number: ask again
            continue
        if 0 <= choice < count:
            return choice
def get_path(path, query=console_query):
    """
    Find the HTML file to convert.

    If C{path} names an HTML file it is used as it is. Otherwise C{path}
    is extracted as an archive into a new temporary directory and C{query}
    is called to choose among the HTML files found in it. The suggestion
    passed to C{query} is a file with C{toc} in its name or, if there is
    none, the largest file.

    @param path: Path to an HTML file or to an archive
    @param query: Called as C{query(dirpath, candidate, docs)}, where
    C{docs} is a list of C{(name, directory, size)} tuples sorted by size
    and C{candidate} the index of the suggestion; returns the index of the
    entry to use
    @return: The tuple C{(dirpath, path)}. C{dirpath} is the temporary
    directory, which the caller must remove, or None if C{path} was not an
    archive.
    @raise ConversionError: If the archive contains no HTML file. The
    temporary directory has been removed in this case.
    """
    html_extensions = ('htm', 'html', 'xhtml', 'xhtm')
    path = os.path.abspath(os.path.expanduser(path))
    ext = os.path.splitext(path)[1][1:].lower()
    if ext in html_extensions:
        return None, path
    dirpath = mkdtemp('','html2lrf')
    try:
        extract(path, dirpath)
        candidate, docs = None, []
        for root, dirs, files in os.walk(dirpath):
            for name in files:
                ext = os.path.splitext(name)[1][1:].lower()
                if ext not in html_extensions:
                    continue
                docs.append((name, root, os.stat(os.path.join(root, name)).st_size))
                if 'toc' in name.lower():
                    candidate = name
        if not docs:
            raise ConversionError('No suitable files found in archive')
        docs.sort(key=itemgetter(2))
        # The largest file is suggested unless a table of contents was found
        suggestion = len(docs) - 1
        if candidate:
            for i, doc in enumerate(docs):
                if doc[0] == candidate:
                    suggestion = i
                    break
        chosen = query(dirpath, suggestion, docs)
        return dirpath, os.path.join(docs[chosen][1], docs[chosen][0])
    except:
        # The caller never gets to know dirpath if anything goes wrong, so
        # it has to be removed here
        shutil.rmtree(dirpath, True)
        raise
# Run the command line interface when this file is executed as a script
if __name__ == '__main__':
main()