DOCX Input: Start work on tables

This commit is contained in:
Kovid Goyal 2013-05-28 09:45:24 +05:30
parent 5887243d56
commit 6eb97d2626
5 changed files with 228 additions and 34 deletions

View file

@ -65,35 +65,41 @@ def simple_float(val, mult=1.0):
} # }}}
# Read from XML {{{
def read_border(parent, dest):
tvals = {'padding_%s':inherit, 'border_%s_width':inherit,
'border_%s_style':inherit, 'border_%s_color':inherit}
vals = {}
for edge in ('left', 'top', 'right', 'bottom'):
vals.update({k % edge:v for k, v in tvals.iteritems()})
for border in XPath('./w:pBdr')(parent):
for edge in ('left', 'top', 'right', 'bottom'):
for elem in XPath('./w:%s' % edge)(border):
color = get(elem, 'w:color')
if color is not None:
vals['border_%s_color' % edge] = simple_color(color)
style = get(elem, 'w:val')
if style is not None:
vals['border_%s_style' % edge] = LINE_STYLES.get(style, 'solid')
space = get(elem, 'w:space')
if space is not None:
try:
vals['padding_%s' % edge] = float(space)
except (ValueError, TypeError):
pass
sz = get(elem, 'w:sz')
if sz is not None:
# we dont care about art borders (they are only used for page borders)
try:
vals['border_%s_width' % edge] = min(96, max(2, float(sz))) / 8
except (ValueError, TypeError):
pass
# Attribute-name templates for the properties tracked per border edge. Each
# template is %-formatted with an edge name (e.g. 'left') to get the attribute
# that is set on the destination style object.
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
def read_single_border(parent, edge):
    """Read the border settings for one edge from the children of *parent*.

    Every ``w:<edge>`` child of *parent* is inspected in turn; when more than
    one is present, values found on a later element replace those found on an
    earlier one.

    Returns a dict keyed by the (unformatted) templates in ``border_props``.
    A property that was absent, or whose numeric value could not be parsed,
    is reported as ``None``.
    """
    padding = width = style = color = None
    for node in XPath('./w:%s' % edge)(parent):
        raw_color = get(node, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)

        raw_style = get(node, 'w:val')
        if raw_style is not None:
            # Unknown line styles fall back to a plain solid line
            style = LINE_STYLES.get(raw_style, 'solid')

        raw_space = get(node, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass  # not a number: keep whatever was read so far

        raw_size = get(node, 'w:sz')
        if raw_size is not None:
            # We don't care about art borders (they are only used for page borders)
            try:
                size = float(raw_size)
            except (ValueError, TypeError):
                pass  # not a number: keep whatever was read so far
            else:
                # Clamp to the range 2..96, then divide by 8
                # (presumably w:sz is in eighths of a point -- confirm)
                width = min(96, max(2, size)) / 8
    return dict(zip(border_props, (padding, width, style, color)))
def read_border(parent, dest, border_edges=('left', 'top', 'right', 'bottom'), name='pBdr'):
    """Read border properties from *parent* and store them on *dest*.

    :param parent: element whose ``w:<name>`` children hold the border data
    :param dest: object that receives one attribute per edge/property pair
    :param border_edges: edge names to read
    :param name: local name of the border container element

    Every attribute named by ``border_props`` for every edge is always set on
    *dest*; those for which no value was found are set to ``inherit``.
    """
    vals = {}
    for edge in border_edges:
        for template in border_props:
            vals[template % edge] = inherit

    for border in XPath('./w:' + name)(parent):
        for edge in border_edges:
            single = read_single_border(border, edge)
            for template in single:
                value = single[template]
                # None means "not specified", so the default is kept
                if value is not None:
                    vals[template % edge] = value

    for attr in vals:
        setattr(dest, attr, vals[attr])

View file

@ -17,7 +17,7 @@ def __init__(self, parent):
self.parent = parent
def __iter__(self):
for p in descendants(self.parent, 'w:p'):
for p in descendants(self.parent, 'w:p', 'w:tbl'):
yield p
class Footnotes(object):

View file

@ -11,6 +11,7 @@
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
from calibre.ebooks.docx.names import XPath, get
class PageProperties(object):
@ -66,10 +67,17 @@ def __init__(self, elem):
self.based_on = None
self.is_default = get(elem, 'w:default') in {'1', 'on', 'true'}
self.paragraph_style = self.character_style = None
self.paragraph_style = self.character_style = self.table_style = None
if self.style_type in {'paragraph', 'character'}:
if self.style_type == 'paragraph':
if self.style_type in {'paragraph', 'character', 'table'}:
if self.style_type == 'table':
for tblPr in XPath('./w:tblPr')(elem):
ts = TableStyle(tblPr)
if self.table_style is None:
self.table_style = ts
else:
self.table_style.update(ts)
if self.style_type in {'paragraph', 'table'}:
for pPr in XPath('./w:pPr')(elem):
ps = ParagraphStyle(pPr)
if self.paragraph_style is None:
@ -90,6 +98,10 @@ def __init__(self, elem):
self.numbering_style_link = get(x, 'w:val')
def resolve_based_on(self, parent):
if parent.table_style is not None:
if self.table_style is None:
self.table_style = TableStyle()
self.table_style.resolve_based_on(parent.table_style)
if parent.paragraph_style is not None:
if self.paragraph_style is None:
self.paragraph_style = ParagraphStyle()

View file

@ -0,0 +1,152 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd, read_border ,border_props # noqa
from calibre.ebooks.docx.names import XPath, get
def _read_width(elem):
    """Convert the ``w:w``/``w:type`` attributes of *elem* into a CSS length.

    The ``w:type`` attribute (default ``'auto'``) selects the result:

    * ``nil``  -> ``'0'``
    * ``auto`` -> ``'auto'``
    * ``dxa``  -> ``w:w`` divided by 20, formatted as ``pt``
    * ``pct``  -> ``w:w`` divided by 50, formatted as ``%``

    Any other type yields ``inherit``. A missing or non-integer ``w:w`` is
    treated as 0.
    """
    try:
        raw = int(get(elem, 'w:w'))
    except (TypeError, ValueError):
        raw = 0
    kind = get(elem, 'w:type', 'auto')
    if kind == 'nil':
        return '0'
    if kind == 'auto':
        return 'auto'
    # NOTE: the divisions below rely on true division, which this file
    # enables via its ``from __future__ import division`` header
    if kind == 'dxa':
        return '%.3gpt' % (raw/20)
    if kind == 'pct':
        return '%.3g%%' % (raw/50)
    return inherit
def read_width(parent, dest):
    """Set ``dest.width`` from the ``w:tblW`` child of *parent*.

    The width is ``inherit`` when there is no such child; if there are
    several, the last one wins.
    """
    width = inherit
    for node in XPath('./w:tblW')(parent):
        width = _read_width(node)
    dest.width = width
def read_padding(parent, dest):
    """Read the cell margins from *parent* into ``dest.cell_padding_<edge>``.

    The margins are looked up in ``w:tblCellMar`` when *parent* is a
    ``tblPr`` element and in ``w:tcMar`` otherwise. All four
    ``cell_padding_*`` attributes are always set on *dest*; an edge with no
    value in the document is set to ``inherit``.
    """
    name = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    edges = ('left', 'top', 'right', 'bottom')
    # Collect the results in an explicit dict. Assigning through locals()
    # does not rebind function-level variables, so values stored that way
    # were silently lost and every edge ended up as inherit.
    padding = {x: inherit for x in edges}
    for mar in XPath('./w:%s' % name)(parent):
        for x in edges:
            for edge in XPath('./w:%s' % x)(mar):
                padding[x] = _read_width(edge)
    for x in edges:
        setattr(dest, 'cell_padding_%s' % x, padding[x])
def read_justification(parent, dest):
    """Translate ``w:jc`` children of *parent* into horizontal margins.

    Sets ``dest.margin_left`` and ``dest.margin_right``. Both start out as
    ``inherit``; each ``w:jc`` element with a recognised ``w:val`` switches
    one or both of them to ``'auto'``. Effects of several elements accumulate.
    """
    # Alignment value -> the margins that become 'auto' for it
    auto_sides = {
        'left': ('right',),
        'right': ('left',),
        'center': ('left', 'right'),
    }
    margins = {'left': inherit, 'right': inherit}
    for jc in XPath('./w:jc[@w:val]')(parent):
        # Empty and unrecognised values map to no sides and change nothing
        for side in auto_sides.get(get(jc, 'w:val'), ()):
            margins[side] = 'auto'
    dest.margin_left = margins['left']
    dest.margin_right = margins['right']
def read_spacing(parent, dest):
    """Set ``dest.spacing`` from the ``w:tblCellSpacing`` child of *parent*.

    The value is ``inherit`` when there is no such child; if there are
    several, the last one wins.
    """
    spacing = inherit
    for node in XPath('./w:tblCellSpacing')(parent):
        spacing = _read_width(node)
    dest.spacing = spacing
def read_indent(parent, dest):
    """Set ``dest.indent`` from the ``w:tblInd`` child of *parent*.

    The value is ``inherit`` when there is no such child; if there are
    several, the last one wins.
    """
    indent = inherit
    for node in XPath('./w:tblInd')(parent):
        indent = _read_width(node)
    dest.indent = indent
# Edge names read for table/cell borders: the four outer edges plus
# 'insideH' and 'insideV' (presumably the interior horizontal/vertical
# rules -- confirm against the OOXML spec).
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest):
    """Read the border properties for every edge in ``border_edges``.

    The borders are taken from ``w:tblBorders`` when *parent* is a ``tblPr``
    element and from ``w:tcBorders`` otherwise; the actual reading is
    delegated to ``read_border``.
    """
    if parent.tag.endswith('}tblPr'):
        container = 'tblBorders'
    else:
        container = 'tcBorders'
    read_border(parent, dest, border_edges, container)
class TableStyle(object):
    """The table-level formatting properties read from a ``tblPr`` element.

    Each name in ``all_properties`` is an attribute of the instance. An
    attribute holds ``inherit`` when no explicit value is known for it.
    """

    # Fixed properties followed by one entry per border edge/property pair
    all_properties = (
        'width', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
        'spacing', 'indent',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, tblPr=None):
        """Build the style from *tblPr*, or with every property set to
        ``inherit`` when *tblPr* is None."""
        if tblPr is not None:
            module_ns = globals()
            # The module-level read_<suffix>() functions fill in the attributes
            for suffix in ('width', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders'):
                reader = module_ns['read_%s' % suffix]
                reader(tblPr, self)
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)

        self._css = None

    def update(self, other):
        """Copy every property that *other* sets explicitly onto this style."""
        for name in self.all_properties:
            candidate = getattr(other, name)
            if candidate is inherit:
                continue
            setattr(self, name, candidate)

    def resolve_based_on(self, parent):
        """Fill every property still set to ``inherit`` from *parent*."""
        for name in self.all_properties:
            if getattr(self, name) is inherit:
                setattr(self, name, getattr(parent, name))

    @property
    def css(self):
        # Nothing in this class assigns _css a value other than None yet
        return self._css
class Tables(object):
    """Collects the blocks that belong to each table and, once conversion is
    done, wraps their HTML counterparts in table markup."""

    def __init__(self):
        # table element -> list of the blocks added while it was current,
        # in registration order
        self.tables = OrderedDict()

    def register(self, tbl):
        """Start collecting blocks for *tbl*, making it the current table."""
        blocks = []
        self.tables[tbl] = blocks
        self.current_table = blocks

    def add(self, p):
        """Append block *p* to the most recently registered table."""
        self.current_table.append(p)

    def apply_markup(self, object_map):
        """Create the table/row/cell elements for every registered table.

        *object_map* is inverted here so that each collected block can be
        looked up to find the object that stands for it in the output tree.
        Each table is inserted in front of the object for its first block,
        and the objects for the paragraphs found in its cells are moved into
        the matching cell. Tables with no collected blocks are skipped.
        """
        rmap = dict((src, html) for html, src in object_map.iteritems())
        for tbl, blocks in self.tables.iteritems():
            if not blocks:
                continue
            anchor = rmap[blocks[0]]
            container = anchor.getparent()
            table = TABLE('\n\t\t')
            container.insert(container.index(anchor), table)
            for row in XPath('./w:tr')(tbl):
                tr = TR('\n\t\t\t')
                tr.tail = '\n\t\t'
                table.append(tr)
                for tc in XPath('./w:tc')(row):
                    td = TD()
                    td.tail = '\n\t\t\t'
                    tr.append(td)
                    for p in XPath('./w:p')(tc):
                        td.append(rmap[p])
                # The last cell is followed by the closing row tag, which
                # sits one indentation level shallower
                if len(tr):
                    tr[-1].tail = '\n\t\t'
            # Likewise for the last row and the closing table tag
            if len(table):
                table[-1].tail = '\n\t'

View file

@ -21,6 +21,7 @@
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
@ -47,6 +48,7 @@ def __init__(self, path_or_stream, dest_dir=None, log=None, notes_text=None):
self.body = BODY()
self.styles = Styles()
self.images = Images()
self.tables = Tables()
self.object_map = OrderedDict()
self.html = HTML(
HEAD(
@ -98,15 +100,26 @@ def __call__(self):
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text), id=anchor))
dl[-1][0].tail = ']'
dl.append(DD())
in_table = False
for wp in note:
if wp.tag.endswith('}tbl'):
self.tables.register(wp)
in_table = True
continue
if in_table:
if ancestor(wp, 'w:tbl') is not None:
self.tables.add(wp)
else:
in_table = False
p = self.convert_p(wp)
dl[-1].append(p)
self.resolve_links(relationships_by_id)
# TODO: tables <w:tbl> child of <w:body> (nested tables?)
self.styles.cascade(self.layers)
self.tables.apply_markup(self.object_map)
numbered = []
for html_obj, obj in self.object_map.iteritems():
raw = obj.get('calibre_num_id', None)
@ -154,7 +167,13 @@ def read_page_properties(self, doc):
current = []
self.page_map = OrderedDict()
for p in descendants(doc, 'w:p'):
in_table = False
for p in descendants(doc, 'w:p', 'w:tbl'):
if p.tag.endswith('}tbl'):
in_table = True
self.tables.register(p)
continue
sect = tuple(descendants(p, 'w:sectPr'))
if sect:
pr = PageProperties(sect)
@ -163,6 +182,11 @@ def read_page_properties(self, doc):
current = []
else:
current.append(p)
if in_table:
if ancestor(p, 'w:tbl') is not None:
self.tables.add(p)
else:
in_table = False
if current:
last = XPath('./w:body/w:sectPr')(doc)
pr = PageProperties(last)