mirror of
git://github.com/kovidgoyal/calibre.git
synced 2025-12-15 07:04:28 +01:00
266 lines
No EOL
8.2 KiB
Python
266 lines
No EOL
8.2 KiB
Python
## Copyright (C) 2006 Kovid Goyal kovid@kovidgoyal.net
|
|
## This program is free software; you can redistribute it and/or modify
|
|
## it under the terms of the GNU General Public License as published by
|
|
## the Free Software Foundation; either version 2 of the License, or
|
|
## (at your option) any later version.
|
|
##
|
|
## This program is distributed in the hope that it will be useful,
|
|
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
## GNU General Public License for more details.
|
|
##
|
|
## You should have received a copy of the GNU General Public License along
|
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
"""
|
|
Thin ctypes based wrapper around libtidy. Example usage:
|
|
>>> from libtidy import parseString
|
|
>>> print parseString('<h1>fowehfow</h2>', \
|
|
output_xhtml=1, add_xml_decl=1, indent=1, tidy_mark=0)
|
|
<?xml version="1.0" encoding="us-ascii"?>
|
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
|
<head>
|
|
<title></title>
|
|
</head>
|
|
<body>
|
|
<h1>
|
|
fowehfow
|
|
</h1>
|
|
</body>
|
|
</html>
|
|
"""
|
|
|
|
import ctypes
|
|
from cStringIO import StringIO
|
|
import weakref
|
|
|
|
class TidyLibError(Exception):
    """Base class for the errors raised by this libtidy wrapper.

    The single constructor argument is kept on ``self.arg`` so that
    subclasses can build their message from it.
    """

    def __init__(self, arg):
        # Initialise the Exception base too.  On Python 2 the base class only
        # records ``args`` in its own __init__, so skipping it leaves ``args``
        # empty and makes str() of this base class an empty string.
        Exception.__init__(self, arg)
        self.arg = arg
|
|
|
|
class InvalidOptionError(TidyLibError):
    """Error reported for an option that is not a valid Tidy option."""

    def __str__(self):
        template = "%s was not a valid Tidy option."
        return template % (self.arg)

    # repr() shows exactly the same text as str().
    __repr__ = __str__
|
|
|
|
class OptionArgError(TidyLibError):
    """Error for a Tidy option whose argument was missing or malformed.

    The stored argument is used verbatim as the string form of the error.
    """

    def __init__(self, arg):
        self.arg = arg

    def __str__(self):
        text = self.arg
        return text
|
|
|
|
# Probe the loader's search path for libtidy under each name it is known by.
thelib = None
for libname in ('cygtidy-0-99-0', 'libtidy', 'libtidy.so', 'tidylib'):
    try:
        thelib = getattr(ctypes.cdll, libname)
    except OSError:
        # Not loadable under this name; move on to the next candidate.
        continue
    break
else:
    # Every candidate name failed to load.
    raise OSError("Couldn't find libtidy, please make sure it is installed.")
|
|
|
|
class Loader:
    """
    Attribute proxy for the loaded library: ``loader.Foo`` resolves to the
    library's ``tidyFoo``, so callers can leave the ``tidy`` prefix off.
    When no prefixed attribute exists the name is looked up unchanged.
    """
    def __init__(self):
        self.lib = thelib

    def __getattr__(self, name):
        prefixed = "tidy%s" % name
        try:
            symbol = getattr(self.lib, prefixed)
        # current ctypes uses ValueError, future will use AttributeError
        except (ValueError, AttributeError):
            symbol = getattr(self.lib, name)
        return symbol
|
|
|
|
# Module-wide proxy through which the rest of this file calls into libtidy.
_tidy=Loader()
|
|
|
|
# define a callback to pass to Tidylib
def _putByte(handle, c):
    """Hand the byte *c* to the sink registered under *handle*.

    The sink is looked up in the module-level ``sinkfactory`` mapping.
    Always returns 0.
    """
    target = sinkfactory[handle]
    target.putByte(c)
    return 0
|
|
|
|
# C prototype of the byte callback: int (*)(int handle, char byte).
PUTBYTEFUNC = ctypes.CFUNCTYPE(
    ctypes.c_int,    # return value
    ctypes.c_int,    # handle identifying the sink
    ctypes.c_char    # the byte being written
)
# C-callable wrapper around _putByte; kept in a module-level name so the
# callback object stays referenced for as long as the module is loaded.
putByte = PUTBYTEFUNC(_putByte)
|
|
|
|
class _OutputSink(ctypes.Structure):
    """ctypes structure describing an output sink.

    ``sinkData`` is an integer handle and ``putByte`` is the callback
    that receives each byte.
    """
    _fields_ = [
        ("sinkData", ctypes.c_int),
        ("putByte", PUTBYTEFUNC),
    ]
|
|
|
|
class _Sink:
|
|
def __init__(self):
|
|
self._data = StringIO()
|
|
self.struct = _OutputSink()
|
|
self.struct.putByte = putByte
|
|
|
|
def putByte(self, c):
|
|
self._data.write(c)
|
|
|
|
def __str__(self):
|
|
return self._data.getvalue()
|
|
|
|
class ReportItem:
    """One parsed line of Tidy's diagnostic output.

    Attributes:
        err: the original, unparsed line.
        severity: first character of the severity word ('W', 'E', 'C', ...).
        message: the text that follows the severity word.
        line, col: position as ints, or None when the line carries none.

    Lines that do not match either recognised layout never raise; they keep
    ``line``/``col`` as None and ``str()`` falls back to the raw text when
    the severity letter is unknown.
    """

    def __init__(self, err):
        self.err = err
        self.line = None
        self.col = None
        # Only lines of the form "line N column M - Severity: text" carry a
        # position.  Anything else, including text that merely starts with
        # "line", is handled as a plain "Severity: text" report.
        if not (err.startswith('line') and self._parse_positioned(err)):
            self._parse_plain(err)
        # TODO - parse emacs mode

    def _parse_positioned(self, err):
        """Parse 'line N column M - Severity: text'; return True on success."""
        tokens = err.split(' ', 6)
        try:
            line = int(tokens[1])
            col = int(tokens[3])
            severity = tokens[5][0]  # W or E
            message = tokens[6]
        except (IndexError, ValueError):
            # Too few fields or non-numeric position: not this layout.
            return False
        self.line = line
        self.col = col
        self.severity = severity
        self.message = message
        return True

    def _parse_plain(self, err):
        """Parse 'Severity: text', tolerating a missing message part."""
        tokens = err.split(' ', 1)
        self.severity = tokens[0][:1]
        if len(tokens) > 1:
            self.message = tokens[1]
        else:
            self.message = ''

    def __str__(self):
        severities = dict(W='Warning', E='Error', C='Config')
        try:
            # Compare against None so a position of 0 is still reported.
            if self.line is not None:
                return "line %d col %d - %s: %s" % (self.line, self.col,
                                                    severities[self.severity],
                                                    self.message)
            return "%s: %s" % (severities[self.severity], self.message)
        except KeyError:
            # Unknown severity letter: show the line exactly as received.
            return self.err

    def __repr__(self):
        return "%s('%s')" % (self.__class__.__name__,
                             str(self).replace("'", "\\'"))
|
|
|
|
class FactoryDict(dict):
    """I am a dict with a create method and no __setitem__.  This allows
    me to control my own keys.

    Only item assignment (``d[key] = value``) is blocked; subclasses add
    entries through _setitem().
    """
    def create(self):
        """Subclasses should implement me to generate a new item"""

    def _setitem(self, name, value):
        """Store *value* under *name*, bypassing the assignment guard."""
        dict.__setitem__(self, name, value)

    def __setitem__(self, name, value):
        """Always raise TypeError: items are meant to come from create()."""
        # Call form of raise: the older ``raise TypeError, "..."`` statement
        # is a syntax error on Python 3, while this spelling works on both.
        raise TypeError("Use create() to get a new object")
|
|
|
|
|
|
class SinkFactory(FactoryDict):
    """Mapping for lookup of sinks by handle"""
    def __init__(self):
        FactoryDict.__init__(self)
        # Handle that the next created sink will receive.
        self.lastsink = 0

    def create(self):
        """Build a new sink, register it under a fresh handle and return it."""
        new_sink = _Sink()
        handle = self.lastsink
        new_sink.struct.sinkData = handle
        FactoryDict._setitem(self, handle, new_sink)
        self.lastsink = handle + 1
        # NOTE(review): nothing in this class ever removes an entry, so each
        # sink stays referenced by the mapping - confirm that is intended.
        return new_sink
|
|
|
|
# Module-wide sink registry; _putByte looks sinks up here by handle.
sinkfactory = SinkFactory()
|
|
|
|
class _Document(object):
|
|
def __init__(self):
|
|
self.cdoc = _tidy.Create()
|
|
self.errsink = sinkfactory.create()
|
|
_tidy.SetErrorSink(self.cdoc, ctypes.byref(self.errsink.struct))
|
|
|
|
def write(self, stream):
|
|
stream.write(str(self))
|
|
|
|
def get_errors(self):
|
|
ret = []
|
|
for line in str(self.errsink).split('\n'):
|
|
line = line.strip(' \n\r')
|
|
if line: ret.append(ReportItem(line))
|
|
return ret
|
|
|
|
errors=property(get_errors)
|
|
|
|
def __str__(self):
|
|
stlen = ctypes.c_int(8192)
|
|
st = ctypes.c_buffer(stlen.value)
|
|
rc = _tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
|
|
if rc==-12: # buffer too small
|
|
st = ctypes.c_buffer(stlen.value)
|
|
_tidy.SaveString(self.cdoc, st, ctypes.byref(stlen))
|
|
return st.value
|
|
|
|
# Message prefixes of option-related diagnostics, each mapped to the
# exception class raised for a report that starts with that prefix.
errors = {}
errors['missing or malformed argument for option: '] = OptionArgError
errors['unknown option: '] = InvalidOptionError
|
|
|
|
|
|
class DocumentFactory(FactoryDict):
    """Creates Tidy documents and releases their native handles.

    Each entry maps a weak reference to a document object onto that
    document's native handle, so the handle can be released once the Python
    object has been garbage-collected.
    """

    def _setOptions(self, doc, **options):
        """Apply the keyword *options* to *doc* one at a time.

        Underscores in option names become dashes and values are passed as
        strings (None is sent as the empty string).

        Raises the exception class registered in the module-level ``errors``
        table when the most recent diagnostic on *doc* starts with one of
        its prefixes.
        """
        for name, value in options.items():
            # this will flush out most argument type errors...
            if value is None:
                value = ''
            _tidy.OptParseValue(doc.cdoc,
                                name.replace('_', '-'),
                                str(value))
            # doc.errors re-parses the whole error sink on every access, so
            # read it once per option instead of three times.
            reports = doc.errors
            if not reports:
                continue
            latest = reports[-1].message
            # Explicit loop rather than indexing the result of filter(),
            # which is only a list on Python 2.
            for prefix in errors:
                if latest.startswith(prefix):
                    raise errors[prefix](latest)

    def load(self, doc, arg, loader):
        """Feed *arg* to *doc* through *loader*, then clean and repair it."""
        loader(doc.cdoc, arg)
        _tidy.CleanAndRepair(doc.cdoc)

    def loadFile(self, doc, filename):
        """Load *doc* from the file called *filename*."""
        self.load(doc, filename, _tidy.ParseFile)

    def loadString(self, doc, st):
        """Load *doc* from the string *st*."""
        self.load(doc, st, _tidy.ParseString)

    def _create(self, *args, **kwargs):
        """Return a new document with the keyword options applied."""
        doc = _Document()
        # Register the release callback before applying the options: if an
        # option is rejected, the exception must not leave the native
        # document with nothing set up to release it.
        ref = weakref.ref(doc, self.releaseDoc)
        FactoryDict._setitem(self, ref, doc.cdoc)
        self._setOptions(doc, **kwargs)
        return doc

    def parse(self, filename, *args, **kwargs):
        """
        Open and process filename as an HTML file, returning a
        processed document object.
        @param kwargs: named options to pass to TidyLib for processing
        the input file.
        @param filename: the name of a file to process
        @return: a document object
        """
        doc = self._create(**kwargs)
        self.loadFile(doc, filename)
        return doc

    def parseString(self, st, *args, **kwargs):
        """
        Use st as an HTML file, and process it, returning a
        document object.
        @param kwargs: named options to pass to TidyLib for processing
        the input file.
        @param st: the string to parse
        @return: a document object
        """
        doc = self._create(**kwargs)
        self.loadString(doc, st)
        return doc

    def releaseDoc(self, ref):
        """Weakref callback: release the native document behind *ref*."""
        # pop() instead of a plain lookup: otherwise the dead reference
        # stays in the mapping forever, one stale entry per document.
        _tidy.Release(self.pop(ref))
|
|
|
|
# Module-level factory; its bound methods are exported below as the plain
# functions parse() and parseString().
docfactory = DocumentFactory()
parse = docfactory.parse
# The stray trailing "|" that followed this statement has been dropped; it
# made the line a syntax error.
parseString = docfactory.parseString