Refactoring.

John Schember 2009-05-21 16:22:24 -04:00
parent 4be2cbb770
commit 24ca1a1134
14 changed files with 230 additions and 185 deletions

View file

@ -89,7 +89,7 @@ def _ep_to_function(ep):
include_dirs=['src/calibre/utils/msdes']),
Extension('calibre.plugins.cPalmdoc',
sources=['src/calibre/ebooks/mobi/palmdoc.c']),
sources=['src/calibre/ebooks/compression/palmdoc.c']),
PyQtExtension('calibre.plugins.pictureflow',
['src/calibre/gui2/pictureflow/pictureflow.cpp',

View file

@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

View file

@ -25,15 +25,9 @@
'div' : 'p',
}
STYLE_MAP = {
'bold' : 'strong',
'bolder' : 'strong',
'italic' : 'emphasis',
}
STYLES = [
'font-weight',
'font-style',
('font-weight', {'bold' : 'strong', 'bolder' : 'strong'}),
('font-style', {'italic' : 'emphasis'}),
]
class FB2MLizer(object):
@ -107,8 +101,9 @@ def dump_text(self, elem, stylizer, tag_stack=[]):
fb2_text += '<%s>' % fb2_tag
tag_stack.append(fb2_tag)
# Processes style information
for s in STYLES:
style_tag = STYLE_MAP.get(style[s], None)
style_tag = s[1].get(style[s[0]], None)
if style_tag:
tag_count += 1
fb2_text += '<%s>' % style_tag
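
The hunk above folds the separate STYLE_MAP dict and STYLES list into a single list of (CSS property, tag map) tuples, so the loop can look the FB2 tag up directly from each pair. A minimal sketch of the new lookup, not from the commit itself; the plain dict stands in for calibre's computed-style object:

    STYLES = [
        ('font-weight', {'bold': 'strong', 'bolder': 'strong'}),
        ('font-style', {'italic': 'emphasis'}),
    ]

    def open_style_tags(style):
        # style: mapping of CSS property -> computed value (stand-in for the Stylizer style)
        text, opened = '', []
        for prop, tag_map in STYLES:
            fb2_tag = tag_map.get(style.get(prop), None)
            if fb2_tag:
                text += '<%s>' % fb2_tag
                opened.append(fb2_tag)
        return text, opened

    # open_style_tags({'font-weight': 'bold', 'font-style': 'italic'})
    # -> ('<strong><emphasis>', ['strong', 'emphasis'])
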

View file

@ -1,11 +1,17 @@
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Read data from .mobi files
'''
import struct, os, cStringIO, re, functools, datetime, textwrap
import datetime
import functools
import os
import re
import struct
import textwrap
import cStringIO
try:
from PIL import Image as PILImage
@ -21,8 +27,8 @@
from calibre.ebooks.chardet import ENCODING_PATS
from calibre.ebooks.mobi import MobiError
from calibre.ebooks.mobi.huffcdic import HuffReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.mobi.langcodes import main_language, sub_language
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator, OPF
from calibre.ebooks.metadata.toc import TOC
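
Most of the import churn in this commit comes from moving the PalmDOC codec from calibre.ebooks.mobi.palmdoc to calibre.ebooks.compression.palmdoc. A round-trip sketch against the relocated module, assuming (as the reader and writer code in this commit relies on) that compress_doc and decompress_doc are lossless inverses on byte strings up to one record in size:

    from calibre.ebooks.compression.palmdoc import compress_doc, decompress_doc

    raw = b'Hello, PalmDOC compression. ' * 10   # well under the 4096-byte record size
    packed = compress_doc(raw)                   # PalmDoc (LZ77-style) compressed record
    assert decompress_doc(packed) == raw
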
@ -40,8 +46,8 @@ def __init__(self, raw, codec, title):
while left > 0:
left -= 1
id, size = struct.unpack('>LL', raw[pos:pos+8])
content = raw[pos+8:pos+size]
id, size = struct.unpack('>LL', raw[pos:pos + 8])
content = raw[pos + 8:pos + size]
pos += size
if id >= 100 and id < 200:
self.process_metadata(id, content, codec)
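
The loop above walks EXTH records as (id, size, payload) triples: each record starts with two big-endian 32-bit integers, and size counts the 8-byte prefix plus the payload. A self-contained sketch of the same walk; the buffer is hand-built for illustration (100 and 106 are real EXTH ids, author and publish date, the latter handled by process_metadata above), and the reader drives its loop with a record count rather than the buffer length:

    import struct

    def exth_record(rec_id, payload):
        # the size field includes the 8-byte id/size prefix
        return struct.pack('>LL', rec_id, len(payload) + 8) + payload

    raw = exth_record(100, b'Some Author') + exth_record(106, b'2009-05-21')

    records, pos = {}, 0
    while pos < len(raw):
        rec_id, size = struct.unpack('>LL', raw[pos:pos + 8])
        records[rec_id] = raw[pos + 8:pos + size]
        pos += size
    # records -> {100: 'Some Author', 106: '2009-05-21'}
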
@ -87,7 +93,7 @@ def process_metadata(self, id, content, codec):
elif id == 106:
try:
self.mi.publish_date = datetime.datetime.strptime(
content, '%Y-%m-%d',).date()
content, '%Y-%m-%d', ).date()
except:
pass
elif id == 108:
@ -123,13 +129,13 @@ def __init__(self, raw, ident, user_encoding, log):
try:
self.codec = {
1252 : 'cp1252',
65001 : 'utf-8',
}[self.codepage]
1252: 'cp1252',
65001: 'utf-8',
}[self.codepage]
except (IndexError, KeyError):
self.codec = 'cp1252' if user_encoding is None else user_encoding
log.warn('Unknown codepage %d. Assuming %s'%(self.codepage,
self.codec))
log.warn('Unknown codepage %d. Assuming %s' % (self.codepage,
self.codec))
if ident == 'TEXTREAD' or self.length < 0xE4 or 0xE8 < self.length:
self.extra_flags = 0
else:
@ -147,14 +153,14 @@ def __init__(self, raw, ident, user_encoding, log):
self.language = main_language.get(langid, 'ENGLISH')
self.sublanguage = sub_language.get(sublangid, 'NEUTRAL')
self.mobi_version = struct.unpack('>I', raw[0x68:0x6c])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c+4])[0]
self.first_image_index = struct.unpack('>L', raw[0x6c:0x6c + 4])[0]
self.exth_flag, = struct.unpack('>L', raw[0x80:0x84])
self.exth = None
if not isinstance(self.title, unicode):
self.title = self.title.decode(self.codec, 'replace')
if self.exth_flag & 0x40:
self.exth = EXTHHeader(raw[16+self.length:], self.codec, self.title)
self.exth = EXTHHeader(raw[16 + self.length:], self.codec, self.title)
self.exth.mi.uid = self.unique_id
self.exth.mi.language = self.language
@ -182,7 +188,7 @@ def section_count(self):
return struct.unpack('>H', self.stream.read(2))[0]
def section_offset(self, number):
self.stream.seek(78+number*8)
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def header(self):
@ -242,15 +248,15 @@ def __init__(self, filename_or_stream, log, user_encoding=None, debug=None):
self.name = self.header[:32].replace('\x00', '')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C+8].upper()
self.ident = self.header[0x3C:0x3C + 8].upper()
if self.ident not in ['BOOKMOBI', 'TEXTREAD']:
raise MobiError('Unknown book type: %s'%self.ident)
raise MobiError('Unknown book type: %s' % self.ident)
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78+i*8:78+i*8+8])
flags, val = a1, a2<<16 | a3<<8 | a4
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val))
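
The reformatted loop above reads the PalmDB section table that starts at byte 78: one 8-byte entry per section, holding a 32-bit offset, a flags byte, and a 24-bit value assembled from the remaining three bytes. A sketch of the same parse over a hand-built two-entry table rather than a real .mobi file:

    import struct

    def parse_section_table(raw, num_sections):
        headers = []
        for i in range(num_sections):
            entry = raw[78 + i * 8:78 + i * 8 + 8]
            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', entry)
            flags, val = a1, a2 << 16 | a3 << 8 | a4
            headers.append((offset, flags, val))
        return headers

    fake = b'\x00' * 78                               # padded PDB header
    fake += struct.pack('>LBBBB', 1024, 0, 0, 0, 0)   # section 0 starts at byte 1024
    fake += struct.pack('>LBBBB', 2048, 0, 0, 0, 1)   # section 1 starts at byte 2048
    # parse_section_table(fake, 2) -> [(1024, 0, 0), (2048, 0, 1)]
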
def section(section_number):
@ -266,7 +272,7 @@ def section(section_number):
self.book_header = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log)
user_encoding, self.log)
self.name = self.name.decode(self.book_header.codec, 'replace')
def extract_content(self, output_dir, parse_cache):
@ -279,13 +285,13 @@ def extract_content(self, output_dir, parse_cache):
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore')
'ignore')
for pat in ENCODING_PATS:
self.processed_html = pat.sub('', self.processed_html)
e2u = functools.partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
self.processed_html = re.sub(r'&(\S+?);', e2u,
self.processed_html)
self.processed_html)
self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
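
The wrapped call above feeds re.sub a functools.partial of calibre's entity_to_unicode so the five markup-significant entities are left alone. The sketch below reproduces the pattern with a small stand-in resolver; only entity_to_unicode's role is assumed (it takes a regex match and returns replacement text), the stub and its entity table are invented:

    import functools
    import re

    def entity_to_unicode_stub(match, exceptions=()):
        # Stand-in for calibre's entity_to_unicode: resolve &name; unless excepted.
        name = match.group(1)
        if name in exceptions:
            return '&%s;' % name
        return {'mdash': u'\u2014', 'nbsp': u'\xa0'}.get(name, match.group(0))

    e2u = functools.partial(entity_to_unicode_stub,
            exceptions=['lt', 'gt', 'amp', 'apos', 'quot'])
    html = u'a &amp; b&mdash;c'
    html = re.sub(r'&(\S+?);', e2u, html)
    # -> u'a &amp; b\u2014c' (markup entities kept, the rest resolved)
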
@ -295,7 +301,7 @@ def extract_content(self, output_dir, parse_cache):
if root.xpath('descendant::p/descendant::p'):
from lxml.html import soupparser
self.log.warning('Markup contains unclosed <p> tags, parsing using',
'BeatifulSoup')
'BeatifulSoup')
root = soupparser.fromstring(self.processed_html)
if root.tag != 'html':
self.log.warn('File does not have opening <html> tag')
@ -346,45 +352,45 @@ def scmp(x, y): return cmp(sz(x), sz(y))
fname = self.name.encode('ascii', 'replace')
fname = re.sub(r'[\x08\x15\0]+', '', fname)
htmlfile = os.path.join(output_dir,
sanitize_file_name(fname)+'.html')
sanitize_file_name(fname) + '.html')
try:
for ref in guide.xpath('descendant::reference'):
if ref.attrib.has_key('href'):
ref.attrib['href'] = os.path.basename(htmlfile)+ref.attrib['href']
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
except AttributeError:
pass
parse_cache[htmlfile] = root
self.htmlfile = htmlfile
ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0]+'.opf'
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry)
ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
open(ncx_path, 'wb').write(ncx)
with open('styles.css', 'wb') as s:
s.write(self.base_css_rules+'\n\n')
s.write(self.base_css_rules + '\n\n')
for cls, rule in self.tag_css_rules.items():
if isinstance(rule, unicode):
rule = rule.encode('utf-8')
s.write('.%s { %s }\n\n'%(cls, rule))
s.write('.%s { %s }\n\n' % (cls, rule))
if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...')
ncx = cStringIO.StringIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0]+'.opf', 'wb'), ncx,
ncx_manifest_entry )
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
open(os.path.splitext(htmlfile)[0]+'.ncx', 'wb').write(ncx)
open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb').write(ncx)
def read_embedded_metadata(self, root, elem, guide):
raw = '<package>'+html.tostring(elem, encoding='utf-8')+'</package>'
raw = '<package>' + html.tostring(elem, encoding='utf-8') + '</package>'
stream = cStringIO.StringIO(raw)
opf = OPF(stream)
self.embedded_mi = MetaInformation(opf)
@ -394,7 +400,7 @@ def read_embedded_metadata(self, root, elem, guide):
href = ref.get('href', '')
if href.startswith('#'):
href = href[1:]
anchors = root.xpath('//*[@id="%s"]'%href)
anchors = root.xpath('//*[@id="%s"]' % href)
if anchors:
cpos = anchors[0]
reached = False
@ -412,27 +418,27 @@ def cleanup_html(self):
self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>'+self.processed_html.replace('\n\n', '<p>')+'</html>'
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
def upshift_markup(self, root):
self.log.debug('Converting style information to CSS...')
size_map = {
'xx-small' : '0.5',
'x-small' : '1',
'small' : '2',
'medium' : '3',
'large' : '4',
'x-large' : '5',
'xx-large' : '6',
}
'xx-small': '0.5',
'x-small': '1',
'small': '2',
'medium': '3',
'large': '4',
'x-large': '5',
'xx-large': '6',
}
mobi_version = self.book_header.mobi_version
for i, tag in enumerate(root.iter(etree.Element)):
tag.attrib.pop('xmlns', '')
if tag.tag in ('country-region', 'place', 'placetype', 'placename',
'state', 'city', 'street', 'address', 'content'):
tag.tag = 'div' if tag.tag == 'content' else 'span'
'state', 'city', 'street', 'address', 'content'):
tag.tag = 'div' if tag.tag == 'content' else 'span'
for key in tag.attrib.keys():
tag.attrib.pop(key)
continue
@ -450,7 +456,7 @@ def upshift_markup(self, root):
if width:
styles.append('text-indent: %s' % width)
if width.startswith('-'):
styles.append('margin-left: %s'%(width[1:]))
styles.append('margin-left: %s' % (width[1:]))
if attrib.has_key('align'):
align = attrib.pop('align').strip()
if align:
@ -502,7 +508,7 @@ def upshift_markup(self, root):
cls = sel
break
if cls is None:
ncls = 'calibre_%d'%i
ncls = 'calibre_%d' % i
self.tag_css_rules[ncls] = rule
cls = attrib.get('class', '')
cls = cls + (' ' if cls else '') + ncls
@ -514,17 +520,17 @@ def create_opf(self, htmlfile, guide=None, root=None):
mi = MetaInformation(self.book_header.title, [_('Unknown')])
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg'%(self.book_header.exth.cover_offset+1)
opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
elif mi.cover is not None:
opf.cover = mi.cover
else:
opf.cover = 'images/%05d.jpg'%1
opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
*opf.cover.split('/'))):
opf.cover = None
* opf.cover.split('/'))):
opf.cover = None
manifest = [(htmlfile, 'text/x-oeb1-document'),
(os.path.abspath('styles.css'), 'text/css')]
(os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile)
for i in getattr(self, 'image_names', []):
manifest.append((os.path.join(bp, 'images/', i), 'image/jpeg'))
@ -541,7 +547,7 @@ def create_opf(self, htmlfile, guide=None, root=None):
ncx_manifest_entry = None
if toc:
ncx_manifest_entry = 'toc.ncx'
elems = root.xpath('//*[@id="%s"]'%toc.partition('#')[-1])
elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1])
tocobj = None
ent_pat = re.compile(r'&(\S+?);')
if elems:
@ -556,12 +562,12 @@ def create_opf(self, htmlfile, guide=None, root=None):
if href and re.match('\w+://', href) is None:
try:
text = u' '.join([t.strip() for t in \
x.xpath('descendant::text()')])
x.xpath('descendant::text()')])
except:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
tocobj.add_item(toc.partition('#')[0], href[1:],
text)
text)
if reached and x.get('class', None) == 'mbp_pagebreak':
break
if tocobj is not None:
@ -599,17 +605,17 @@ def text_section(self, index):
def extract_text(self):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(1, self.book_header.records+1)]
processed_records = list(range(0, self.book_header.records+1))
text_sections = [self.text_section(i) for i in range(1, self.book_header.records + 1)]
processed_records = list(range(0, self.book_header.records + 1))
self.mobi_html = ''
if self.book_header.compression_type == 'DH':
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number)]
range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset+self.book_header.huff_number))
self.book_header.huff_offset + self.book_header.huff_number))
huff = HuffReader(huffs)
self.mobi_html = huff.decompress(text_sections)
@ -620,7 +626,7 @@ def extract_text(self):
elif self.book_header.compression_type == '\x00\x01':
self.mobi_html = ''.join(text_sections)
else:
raise MobiError('Unknown compression algorithm: %s'%repr(self.book_header.compression_type))
raise MobiError('Unknown compression algorithm: %s' % repr(self.book_header.compression_type))
if self.book_header.ancient and '<html' not in self.mobi_html[:300].lower():
self.mobi_html = self.mobi_html.replace('\r ', '\n\n ')
self.mobi_html = self.mobi_html.replace('\0', '')
@ -636,7 +642,7 @@ def add_anchors(self):
self.log.debug('Adding anchors...')
positions = set([])
link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE)
re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1)))
pos = 0
@ -652,10 +658,10 @@ def add_anchors(self):
if r > -1 and (r < l or l == end or l == -1):
p = self.mobi_html.rfind('<', 0, end + 1)
if pos < end and p > -1 and \
not end_tag_re.match(self.mobi_html[p:r]) and \
not self.mobi_html[p:r+1].endswith('/>'):
anchor = ' filepos-id="filepos%d"'
end = r
not end_tag_re.match(self.mobi_html[p:r]) and \
not self.mobi_html[p:r + 1].endswith('/>'):
anchor = ' filepos-id="filepos%d"'
end = r
else:
end = r + 1
self.processed_html += self.mobi_html[pos:end] + (anchor % oend)
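
The hunk above belongs to add_anchors, which first collects every filepos offset referenced from a link so matching filepos-id anchors can be injected at those byte positions. A reduced sketch of the collection step, with an invented HTML snippet:

    import re

    link_pattern = re.compile(r'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
            re.IGNORECASE)

    mobi_html = '<a filepos=0000012345>Ch. 1</a> <a filepos="0000067890">Ch. 2</a>'
    positions = set()
    for match in link_pattern.finditer(mobi_html):
        positions.add(int(match.group(1)))
    # positions -> {12345, 67890}; the reader then walks the text and inserts
    # filepos-id="filepos<offset>" attributes at (or just before) each offset.
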
@ -673,7 +679,7 @@ def extract_images(self, processed_records, output_dir):
start = getattr(self.book_header, 'first_image_index', -1)
if start > self.num_sections or start < 0:
# BAEN PRC files have bad headers
start=0
start = 0
for i in range(start, self.num_sections):
if i in processed_records:
continue
@ -687,7 +693,7 @@ def extract_images(self, processed_records, output_dir):
except IOError:
continue
path = os.path.join(output_dir, '%05d.jpg'%image_index)
path = os.path.join(output_dir, '%05d.jpg' % image_index)
self.image_names.append(os.path.basename(path))
im.save(open(path, 'wb'), format='JPEG')

View file

@ -1,27 +1,32 @@
'''
Write content to Mobipocket books.
'''
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.cam>'
from collections import defaultdict
from itertools import count
from itertools import izip
import random
import re
from struct import pack
import time
import random
from cStringIO import StringIO
import re
from itertools import izip, count
from collections import defaultdict
from urlparse import urldefrag
from PIL import Image
from calibre.ebooks.oeb.base import XML_NS, XHTML, XHTML_NS, OEB_DOCS, \
OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import namespace, prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.mobi.palmdoc import compress_doc
from cStringIO import StringIO
from calibre.ebooks.mobi.langcodes import iana2mobi
from calibre.ebooks.mobi.mobiml import MBP_NS
from calibre.ebooks.oeb.base import OEB_DOCS
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
from calibre.ebooks.oeb.base import XHTML
from calibre.ebooks.oeb.base import XHTML_NS
from calibre.ebooks.oeb.base import XML_NS
from calibre.ebooks.oeb.base import namespace
from calibre.ebooks.oeb.base import prefixname
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.compression.palmdoc import compress_doc
# TODO:
# - Allow override CSS (?)
@ -174,7 +179,7 @@ def serialize_href(self, href, base=None):
item = hrefs[path] if path else None
if item and item.spine_position is None:
return False
path = item.href if item else base.href
path = item.href if item else base.href
href = '#'.join((path, frag)) if frag else path
buffer.write('filepos=')
self.href_offsets[href].append(buffer.tell())
@ -211,8 +216,8 @@ def serialize_item(self, item):
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
buffer = self.buffer
if not isinstance(elem.tag, basestring) \
or namespace(elem.tag) not in nsrmap:
return
or namespace(elem.tag) not in nsrmap:
return
tag = prefixname(elem.tag, nsrmap)
# Previous layers take care of @name
id = elem.attrib.pop('id', None)
@ -221,9 +226,9 @@ def serialize_elem(self, elem, item, nsrmap=NSRMAP):
offset = self.anchor_offset or buffer.tell()
self.id_offsets[href] = offset
if self.anchor_offset is not None and \
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
tag == 'a' and not elem.attrib and \
not len(elem) and not elem.text:
return
self.anchor_offset = buffer.tell()
buffer.write('<')
buffer.write(tag)
@ -286,8 +291,8 @@ class MobiWriter(object):
COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+')
def __init__(self, compression=PALMDOC, imagemax=None,
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
prefer_author_sort=False):
self._compression = compression or UNCOMPRESSED
self._imagemax = imagemax or OTHER_MAX_IMAGE_SIZE
self._prefer_author_sort = prefer_author_sort
@ -297,7 +302,7 @@ def generate(cls, opts):
imagemax = PALM_MAX_IMAGE_SIZE if opts.rescale_images else None
prefer_author_sort = opts.prefer_author_sort
return cls(compression=PALMDOC, imagemax=imagemax,
prefer_author_sort=prefer_author_sort)
prefer_author_sort=prefer_author_sort)
def __call__(self, oeb, path):
if hasattr(path, 'write'):
@ -305,7 +310,7 @@ def __call__(self, oeb, path):
with open(path, 'w+b') as stream:
return self._dump_stream(oeb, stream)
def _write(self, *data):
def _write(self, * data):
for datum in data:
self._stream.write(datum)

View file

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
@ -7,17 +6,17 @@
class PDBError(Exception):
pass
from calibre.ebooks.pdb.ereader.reader import Reader as ereader_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
from calibre.ebooks.pdb.palmdoc.reader import Reader as palmdoc_reader
from calibre.ebooks.pdb.ztxt.reader import Reader as ztxt_reader
FORMAT_READERS = {
'PNPdPPrs' : ereader_reader,
'PNRdPPrs' : ereader_reader,
'zTXTGPlm' : ztxt_reader,
'TEXtREAd' : palmdoc_reader,
'PNPdPPrs': ereader_reader,
'PNRdPPrs': ereader_reader,
'zTXTGPlm': ztxt_reader,
'TEXtREAd': palmdoc_reader,
}
from calibre.ebooks.pdb.palmdoc.writer import Writer as palmdoc_writer
@ -25,41 +24,41 @@ class PDBError(Exception):
from calibre.ebooks.pdb.ereader.writer import Writer as ereader_writer
FORMAT_WRITERS = {
'doc' : palmdoc_writer,
'ztxt' : ztxt_writer,
'ereader' : ereader_writer,
'doc': palmdoc_writer,
'ztxt': ztxt_writer,
'ereader': ereader_writer,
}
IDENTITY_TO_NAME = {
'PNPdPPrs' : 'eReader',
'PNRdPPrs' : 'eReader',
'zTXTGPlm' : 'zTXT',
'TEXtREAd' : 'PalmDOC',
'.pdfADBE' : 'Adobe Reader',
'BVokBDIC' : 'BDicty',
'DB99DBOS' : 'DB (Database program)',
'vIMGView' : 'FireViewer (ImageViewer)',
'PmDBPmDB' : 'HanDBase',
'InfoINDB' : 'InfoView',
'ToGoToGo' : 'iSilo',
'SDocSilX' : 'iSilo 3',
'JbDbJBas' : 'JFile',
'JfDbJFil' : 'JFile Pro',
'DATALSdb' : 'LIST',
'Mdb1Mdb1' : 'MobileDB',
'BOOKMOBI' : 'MobiPocket',
'DataPlkr' : 'Plucker',
'DataSprd' : 'QuickSheet',
'SM01SMem' : 'SuperMemo',
'TEXtTlDc' : 'TealDoc',
'InfoTlIf' : 'TealInfo',
'DataTlMl' : 'TealMeal',
'DataTlPt' : 'TealPaint',
'dataTDBP' : 'ThinkDB',
'TdatTide' : 'Tides',
'ToRaTRPW' : 'TomeRaider',
'BDOCWrdS' : 'WordSmith',
'PNPdPPrs': 'eReader',
'PNRdPPrs': 'eReader',
'zTXTGPlm': 'zTXT',
'TEXtREAd': 'PalmDOC',
'.pdfADBE': 'Adobe Reader',
'BVokBDIC': 'BDicty',
'DB99DBOS': 'DB (Database program)',
'vIMGView': 'FireViewer (ImageViewer)',
'PmDBPmDB': 'HanDBase',
'InfoINDB': 'InfoView',
'ToGoToGo': 'iSilo',
'SDocSilX': 'iSilo 3',
'JbDbJBas': 'JFile',
'JfDbJFil': 'JFile Pro',
'DATALSdb': 'LIST',
'Mdb1Mdb1': 'MobileDB',
'BOOKMOBI': 'MobiPocket',
'DataPlkr': 'Plucker',
'DataSprd': 'QuickSheet',
'SM01SMem': 'SuperMemo',
'TEXtTlDc': 'TealDoc',
'InfoTlIf': 'TealInfo',
'DataTlMl': 'TealMeal',
'DataTlPt': 'TealPaint',
'dataTDBP': 'ThinkDB',
'TdatTide': 'Tides',
'ToRaTRPW': 'TomeRaider',
'BDOCWrdS': 'WordSmith',
}
def get_reader(identity):
@ -67,10 +66,10 @@ def get_reader(identity):
Returns None if no reader is found for the identity.
'''
return FORMAT_READERS.get(identity, None)
def get_writer(extension):
'''
Returns None if no writer is found for extension.
'''
return FORMAT_WRITERS.get(extension, None)
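
FORMAT_READERS and FORMAT_WRITERS key reader and writer classes off the 8-byte PDB identity (type plus creator) and the output extension, with get_reader/get_writer returning None for unknown formats. A dispatch sketch, not from the commit: it assumes this module is importable as calibre.ebooks.pdb (which the surrounding imports suggest) and that the identity sits at bytes 60-68 of the file, which comes from the Palm database header layout rather than this diff:

    from calibre.ebooks.pdb import IDENTITY_TO_NAME, get_reader

    def sniff_pdb(path):
        with open(path, 'rb') as f:
            header = f.read(78)
        identity = header[60:68]                  # PDB "type" + "creator" fields
        name = IDENTITY_TO_NAME.get(identity, 'Unknown')
        reader_class = get_reader(identity)       # None if calibre cannot read it
        return identity, name, reader_class
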

View file

@ -8,16 +8,19 @@
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, re, struct, zlib
import os
import re
import struct
import zlib
from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pml.pmlconverter import pml_to_html, \
footnote_sidebar_to_html
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
from calibre.ebooks.pml.pmlconverter import pml_to_html
class HeaderRecord(object):
'''
@ -32,7 +35,7 @@ def __init__(self, raw):
self.non_text_offset, = struct.unpack('>H', raw[12:14])
self.has_metadata, = struct.unpack('>H', raw[24:26])
self.footnote_rec, = struct.unpack('>H', raw[28:30])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.sidebar_rec, = struct.unpack('>H', raw[30:32])
self.bookmark_offset, = struct.unpack('>H', raw[32:34])
self.image_data_offset, = struct.unpack('>H', raw[40:42])
self.metadata_offset, = struct.unpack('>H', raw[44:46])
@ -79,7 +82,7 @@ def get_image(self, number):
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
return 'empty', ''
data = self.section_data(number)
name = data[4:4+32].strip('\x00')
name = data[4:4 + 32].strip('\x00')
img = data[62:]
return name, img
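
get_image above treats each eReader image section as a short header followed by the picture: a NUL-padded name in the 32 bytes starting at offset 4, and the raw image bytes from offset 62 on. A sketch that splits such a record, using an invented record for illustration:

    def split_image_record(data):
        name = data[4:4 + 32].strip(b'\x00')   # NUL-padded image name
        img = data[62:]                        # image bytes as stored (PNG/JPEG)
        return name, img

    record = (b'\x00' * 4 + b'cover.png'.ljust(32, b'\x00') +
              b'\x00' * 26 + b'\x89PNG...')
    name, img = split_image_record(record)
    # name -> 'cover.png', img -> '\x89PNG...'
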

View file

@ -8,9 +8,11 @@
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import struct, zlib
import struct
import zlib
import Image, cStringIO
import Image
import cStringIO
from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.oeb.base import OEB_IMAGES

View file

@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import with_statement
'''
Read the header data from a pdb file.
'''
@ -8,7 +7,9 @@
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import re, struct, time
import re
import struct
import time
class PdbHeaderReader(object):
@ -35,16 +36,16 @@ def full_section_info(self, number):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8)
self.stream.seek(78 + number * 8)
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', self.stream.read(8))[0]
flags, val = a1, a2<<16 | a3<<8 | a4
flags, val = a1, a2 << 16 | a3 << 8 | a4
return (offset, flags, val)
def section_offset(self, number):
if number not in range(0, self.num_sections):
raise ValueError('Not a valid section number %i' % number)
self.stream.seek(78+number*8)
self.stream.seek(78 + number * 8)
return struct.unpack('>LBBBB', self.stream.read(8))[0]
def section_data(self, number):

View file

@ -8,11 +8,13 @@
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, struct, zlib
import os
import struct
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.mobi.palmdoc import decompress_doc
from calibre.ebooks.txt.processor import txt_to_markdown, opf_writer
from calibre.ebooks.txt.processor import opf_writer
from calibre.ebooks.txt.processor import txt_to_markdown
class HeaderRecord(object):
'''
@ -25,15 +27,15 @@ class HeaderRecord(object):
def __init__(self, raw):
self.compression, = struct.unpack('>H', raw[0:2])
self.num_records, = struct.unpack('>H', raw[8:10])
class Reader(FormatReader):
def __init__(self, header, stream, log, encoding=None):
self.stream = stream
self.log = log
self.encoding = encoding
self.sections = []
for i in range(header.num_sections):
self.sections.append(header.section_data(i))
@ -52,7 +54,7 @@ def decompress_text(self, number):
def extract_content(self, output_dir):
txt = ''
self.log.info('Decompressing text...')
for i in range(1, self.header_record.num_records + 1):
self.log.debug('\tDecompressing text section %i' % i)
@ -62,12 +64,12 @@ def extract_content(self, output_dir):
html = txt_to_markdown(txt)
with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
index.write(html.encode('utf-8'))
from calibre.ebooks.metadata.meta import get_metadata
mi = get_metadata(self.stream, 'pdb')
manifest = [('index.html', None)]
spine = ['index.html']
opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)
return os.path.join(output_dir, 'metadata.opf')

View file

@ -10,10 +10,11 @@
import struct
from calibre.ebooks.compression.palmdoc import compress_doc
from calibre.ebooks.pdb.formatwriter import FormatWriter
from calibre.ebooks.txt.writer import TxtWriter, TxtNewlines
from calibre.ebooks.mobi.palmdoc import compress_doc
from calibre.ebooks.pdb.header import PdbHeaderBuilder
from calibre.ebooks.txt.writer import TxtNewlines
from calibre.ebooks.txt.writer import TxtWriter
MAX_RECORD_SIZE = 4096
@ -22,48 +23,48 @@ class Writer(FormatWriter):
def __init__(self, opts, log):
self.opts = opts
self.log = log
def write_content(self, oeb_book, out_stream, metadata=None):
title = self.opts.title if self.opts.title else oeb_book.metadata.title[0].value if oeb_book.metadata.title != [] else _('Unknown')
txt_records, txt_length = self._generate_text(oeb_book.spine)
header_record = self._header_record(txt_length, len(txt_records))
section_lengths = [len(header_record)]
self.log.info('Compessing data...')
for i in range(0, len(txt_records)):
self.log.debug('\tCompressing record %i' % i)
txt_records[i] = compress_doc(txt_records[i].encode('utf-8'))
section_lengths.append(len(txt_records[i]))
out_stream.seek(0)
hb = PdbHeaderBuilder('TEXtREAd', title)
hb.build_header(section_lengths, out_stream)
for record in [header_record]+txt_records:
for record in [header_record] + txt_records:
out_stream.write(record)
def _generate_text(self, spine):
txt_writer = TxtWriter(TxtNewlines('system').newline, self.log)
txt = txt_writer.dump(spine)
txt_length = len(txt)
txt_records = []
for i in range(0, (len(txt) / MAX_RECORD_SIZE) + 1):
txt_records.append(txt[i * MAX_RECORD_SIZE : (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])
txt_records.append(txt[i * MAX_RECORD_SIZE: (i * MAX_RECORD_SIZE) + MAX_RECORD_SIZE])
return txt_records, txt_length
def _header_record(self, txt_length, record_count):
record = ''
record += struct.pack('>H', 2) # [0:2], PalmDoc compression. (1 = No compression).
record += struct.pack('>H', 0) # [2:4], Always 0.
record += struct.pack('>L', txt_length) # [4:8], Uncompressed length of the entire text of the book.
record += struct.pack('>H', record_count) # [8:10], Number of PDB records used for the text of the book.
record += struct.pack('>H', MAX_RECORD_SIZE) # [10-12], Maximum size of each record containing text, always 4096.
record += struct.pack('>L', 0) # [12-16], Current reading position, as an offset into the uncompressed text.
return record
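
_header_record above lays out the 16-byte PalmDOC header exactly as its comments say: compression (2 = PalmDoc), a zero, the uncompressed text length, the record count, the fixed 4096-byte record size, and a zero reading position. A sketch, outside the commit, that rebuilds the header together with the MAX_RECORD_SIZE chunking from _generate_text and checks the layout round-trips through struct.unpack:

    import struct

    MAX_RECORD_SIZE = 4096

    def header_record(txt_length, record_count):
        record = struct.pack('>H', 2)                 # compression: 2 = PalmDoc, 1 = none
        record += struct.pack('>H', 0)                # always 0
        record += struct.pack('>L', txt_length)       # uncompressed length of the text
        record += struct.pack('>H', record_count)     # number of PDB text records
        record += struct.pack('>H', MAX_RECORD_SIZE)  # maximum size of each text record
        record += struct.pack('>L', 0)                # current reading position
        return record

    txt = u'x' * 10000
    txt_records = [txt[i * MAX_RECORD_SIZE:(i + 1) * MAX_RECORD_SIZE]
                   for i in range((len(txt) // MAX_RECORD_SIZE) + 1)]
    hdr = header_record(len(txt), len(txt_records))
    assert len(hdr) == 16
    assert struct.unpack('>HHLHHL', hdr) == (2, 0, 10000, 3, 4096, 0)
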

View file

@ -8,7 +8,8 @@
Transform OEB content into PML markup
'''
import os, re
import os
import re
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace
from calibre.ebooks.oeb.stylizer import Stylizer
@ -40,6 +41,31 @@
('text-align', {'right' : 'r', 'center' : 'c'}),
]
BLOCK_TAGS = [
'p',
]
BLOCK_STYLES = [
'block',
]
LINK_TAGS = [
'a',
]
SEPARATE_TAGS = [
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'p',
'div',
'li',
'tr',
]
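
The new module-level constants pull the tag and style literals out of dump_text: BLOCK_TAGS and BLOCK_STYLES drive the paragraph-block test, LINK_TAGS the anchor handling, and SEPARATE_TAGS the double line break after block-level elements. A minimal sketch of the membership-driven checks, with plain strings in place of the real lxml element and Stylizer style:

    import os

    BLOCK_TAGS = ['p']
    BLOCK_STYLES = ['block']
    SEPARATE_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr']

    def classify(tag, display):
        opens_block = tag in BLOCK_TAGS or display in BLOCK_STYLES
        separator = (os.linesep * 2) if tag in SEPARATE_TAGS else ''
        return opens_block, separator

    # classify('h2', 'block') -> (True, '\n\n' on POSIX)
    # classify('span', 'inline') -> (False, '')
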
class PMLMLizer(object):
def __init__(self, ignore_tables=False):
self.ignore_tables = ignore_tables
@ -104,7 +130,7 @@ def dump_text(self, elem, stylizer, tag_stack=[]):
tag_count = 0
# Are we in a paragraph block?
if tag == 'p' or style['display'] in ('block'):
if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
if 'block' not in tag_stack:
tag_count += 1
tag_stack.append('block')
@ -136,7 +162,7 @@ def dump_text(self, elem, stylizer, tag_stack=[]):
# Special processing of tags that require an argument.
# Anchors links
if tag == 'a' and 'q' not in tag_stack:
if tag in LINK_TAGS and 'q' not in tag_stack:
href = elem.get('href')
if href and '://' not in href:
if '#' in href:
@ -168,7 +194,7 @@ def dump_text(self, elem, stylizer, tag_stack=[]):
for i in range(0, tag_count):
close_tag_list.insert(0, tag_stack.pop())
text += self.close_tags(close_tag_list)
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'tr'):
if tag in SEPARATE_TAGS:
text += os.linesep + os.linesep
if 'block' not in tag_stack: