calibre/src/cssutils/prodparser.py

637 lines
21 KiB
Python

# -*- coding: utf-8 -*-
"""Productions parser used by css and stylesheets classes to parse
test into a cssutils.util.Seq and at the same time retrieving
additional specific cssutils.util.Item objects for later use.
TODO:
- ProdsParser
- handle EOF or STOP?
- handle unknown @rules
- handle S: maybe save to Seq? parameterized?
- store['_raw']: always?
- Sequence:
- opt first(), naive impl for now
"""
__all__ = ['ProdParser', 'Sequence', 'Choice', 'Prod', 'PreDef']
__docformat__ = 'restructuredtext'
__version__ = '$Id: parse.py 1418 2008-08-09 19:27:50Z cthedot $'
import cssutils
import sys
class ParseError(Exception):
"""Base Exception class for ProdParser (used internally)."""
pass
class Done(ParseError):
"""Raised if Sequence or Choice is finished and no more Prods left."""
pass
class Exhausted(ParseError):
"""Raised if Sequence or Choice is finished but token is given."""
pass
class Missing(ParseError):
"""Raised if Sequence or Choice is not finished but no matching token given."""
pass
class NoMatch(ParseError):
"""Raised if nothing in Sequence or Choice does match."""
pass
class Choice(object):
"""A Choice of productions (Sequence or single Prod)."""
def __init__(self, *prods, **options):
"""
*prods
Prod or Sequence objects
options:
optional=False
"""
self._prods = prods
try:
self.optional = options['optional']
except KeyError, e:
for p in self._prods:
if p.optional:
self.optional = True
break
else:
self.optional = False
self.reset()
def reset(self):
"""Start Choice from zero"""
self._exhausted = False
def matches(self, token):
"""Check if token matches"""
for prod in self._prods:
if prod.matches(token):
return True
return False
def nextProd(self, token):
"""
Return:
- next matching Prod or Sequence
- ``None`` if any Prod or Sequence is optional and no token matched
- raise ParseError if nothing matches and all are mandatory
- raise Exhausted if choice already done
``token`` may be None but this occurs when no tokens left."""
if not self._exhausted:
optional = False
for x in self._prods:
if x.matches(token):
self._exhausted = True
x.reset()
return x
elif x.optional:
optional = True
else:
if not optional:
# None matched but also None is optional
raise ParseError(u'No match in %s' % self)
elif token:
raise Exhausted(u'Extra token')
def __str__(self):
return u'Choice(%s)' % u', '.join([str(x) for x in self._prods])
class Sequence(object):
"""A Sequence of productions (Choice or single Prod)."""
def __init__(self, *prods, **options):
"""
*prods
Prod or Sequence objects
**options:
minmax = lambda: (1, 1)
callback returning number of times this sequence may run
"""
self._prods = prods
try:
minmax = options['minmax']
except KeyError:
minmax = lambda: (1, 1)
self._min, self._max = minmax()
if self._max is None:
# unlimited
try:
# py2.6/3
self._max = sys.maxsize
except AttributeError:
# py<2.6
self._max = sys.maxint
self._prodcount = len(self._prods)
self.reset()
def matches(self, token):
"""Called by Choice to try to find if Sequence matches."""
for prod in self._prods:
if prod.matches(token):
return True
try:
if not prod.optional:
break
except AttributeError:
pass
return False
def reset(self):
"""Reset this Sequence if it is nested."""
self._roundstarted = False
self._i = 0
self._round = 0
def _currentName(self):
"""Return current element of Sequence, used by name"""
# TODO: current impl first only if 1st if an prod!
for prod in self._prods[self._i:]:
if not prod.optional:
return str(prod)
else:
return 'Sequence'
optional = property(lambda self: self._min == 0)
def nextProd(self, token):
"""Return
- next matching Prod or Choice
- raises ParseError if nothing matches
- raises Exhausted if sequence already done
"""
while self._round < self._max:
# for this round
i = self._i
round = self._round
p = self._prods[i]
if i == 0:
self._roundstarted = False
# for next round
self._i += 1
if self._i == self._prodcount:
self._round += 1
self._i = 0
if p.matches(token):
self._roundstarted = True
# reset nested Choice or Prod to use from start
p.reset()
return p
elif p.optional:
continue
elif round < self._min:
raise Missing(u'Missing token for production %s' % p)
elif not token:
if self._roundstarted:
raise Missing(u'Missing token for production %s' % p)
else:
raise Done()
else:
raise NoMatch(u'No matching production for token')
if token:
raise Exhausted(u'Extra token')
def __str__(self):
return u'Sequence(%s)' % u', '.join([str(x) for x in self._prods])
class Prod(object):
"""Single Prod in Sequence or Choice."""
def __init__(self, name, match, optional=False,
toSeq=None, toStore=None,
stop=False, nextSor=False, mayEnd=False):
"""
name
name used for error reporting
match callback
function called with parameters tokentype and tokenvalue
returning True, False or raising ParseError
toSeq callback (optional)
calling toSeq(token, tokens) returns (type_, val) == (token[0], token[1])
to be appended to seq else simply unaltered (type_, val)
toStore (optional)
key to save util.Item to store or callback(store, util.Item)
optional = False
wether Prod is optional or not
stop = False
if True stop parsing of tokens here
mayEnd = False
no token must follow even defined by Sequence.
Used for operator ',/ ' currently only
"""
self._name = name
self.match = match
self.optional = optional
self.stop = stop
self.nextSor = nextSor
self.mayEnd = mayEnd
def makeToStore(key):
"Return a function used by toStore."
def toStore(store, item):
"Set or append store item."
if key in store:
store[key].append(item)
else:
store[key] = item
return toStore
if toSeq:
# called: seq.append(toSeq(value))
self.toSeq = toSeq
else:
self.toSeq = lambda t, tokens: (t[0], t[1])
if hasattr(toStore, '__call__'):
self.toStore = toStore
elif toStore:
self.toStore = makeToStore(toStore)
else:
# always set!
self.toStore = None
def matches(self, token):
"""Return if token matches."""
if not token:
return False
type_, val, line, col = token
return self.match(type_, val)
def reset(self):
pass
def __str__(self):
return self._name
def __repr__(self):
return "<cssutils.prodsparser.%s object name=%r at 0x%x>" % (
self.__class__.__name__, self._name, id(self))
class ProdParser(object):
"""Productions parser."""
def __init__(self):
self.types = cssutils.cssproductions.CSSProductions
self._log = cssutils.log
self._tokenizer = cssutils.tokenize2.Tokenizer()
def _texttotokens(self, text):
"""Build a generator which is the only thing that is parsed!
old classes may use lists etc
"""
if isinstance(text, basestring):
# to tokenize strip space
tokens = self._tokenizer.tokenize(text.strip())
elif isinstance(text, tuple):
# (token, tokens) or a single token
if len(text) == 2:
# (token, tokens)
def gen(token, tokens):
"new generator appending token and tokens"
yield token
for t in tokens:
yield t
tokens = (t for t in gen(*text))
else:
# single token
tokens = (t for t in [text])
elif isinstance(text, list):
# generator from list
tokens = (t for t in text)
else:
# already tokenized, assume generator
tokens = text
return tokens
def _SorTokens(self, tokens, until=',/'):
"""New tokens generator which has S tokens removed,
if followed by anything in ``until``, normally a ``,``."""
def removedS(tokens):
for token in tokens:
if token[0] == self.types.S:
try:
next_ = tokens.next()
except StopIteration:
yield token
else:
if next_[1] in until:
# omit S as e.g. ``,`` has been found
yield next_
elif next_[0] == self.types.COMMENT:
# pass COMMENT
yield next_
else:
yield token
yield next_
elif token[0] == self.types.COMMENT:
# pass COMMENT
yield token
else:
yield token
break
# normal mode again
for token in tokens:
yield token
return (token for token in removedS(tokens))
def parse(self, text, name, productions, store=None):
"""
text (or token generator)
to parse, will be tokenized if not a generator yet
may be:
- a string to be tokenized
- a single token, a tuple
- a tuple of (token, tokensGenerator)
- already tokenized so a tokens generator
name
used for logging
productions
used to parse tokens
store UPDATED
If a Prod defines ``toStore`` the key defined there
is a key in store to be set or if store[key] is a list
the next Item is appended here.
TODO: NEEDED? :
Key ``raw`` is always added and holds all unprocessed
values found
returns
:wellformed: True or False
:seq: a filled cssutils.util.Seq object which is NOT readonly yet
:store: filled keys defined by Prod.toStore
:unusedtokens: token generator containing tokens not used yet
"""
tokens = self._texttotokens(text)
if not tokens:
self._log.error(u'No content to parse.')
# TODO: return???
seq = cssutils.util.Seq(readonly=False)
if not store: # store for specific values
store = {}
prods = [productions] # stack of productions
wellformed = True
# while no real token is found any S are ignored
started = False
prod = None
# flag if default S handling should be done
defaultS = True
while True:
try:
token = tokens.next()
except StopIteration:
break
type_, val, line, col = token
# default productions
if type_ == self.types.COMMENT:
# always append COMMENT
seq.append(cssutils.css.CSSComment(val),
cssutils.css.CSSComment, line, col)
elif defaultS and type_ == self.types.S:
# append S (but ignore starting ones)
if started:
seq.append(val, type_, line, col)
else:
continue
# elif type_ == self.types.ATKEYWORD:
# # @rule
# r = cssutils.css.CSSUnknownRule(cssText=val)
# seq.append(r, type(r), line, col)
elif type_ == self.types.INVALID:
# invalidate parse
wellformed = False
self._log.error(u'Invalid token: %r' % (token,))
break
elif type_ == 'EOF':
# do nothing? (self.types.EOF == True!)
pass
else:
started = True # check S now
nextSor = False # reset
try:
while True:
# find next matching production
try:
prod = prods[-1].nextProd(token)
except (Exhausted, NoMatch), e:
# try next
prod = None
if isinstance(prod, Prod):
# found actual Prod, not a Choice or Sequence
break
elif prod:
# nested Sequence, Choice
prods.append(prod)
else:
# nested exhausted, try in parent
if len(prods) > 1:
prods.pop()
else:
raise ParseError('No match')
except ParseError, e:
wellformed = False
self._log.error(u'%s: %s: %r' % (name, e, token))
break
else:
# process prod
if prod.toSeq:
type_, val = prod.toSeq(token, tokens)
if val is not None:
seq.append(val, type_, line, col)
if prod.toStore:
prod.toStore(store, seq[-1])
if prod.stop: # EOF?
# stop here and ignore following tokens
break
if prod.nextSor:
# following is S or other token (e.g. ",")?
# remove S if
tokens = self._SorTokens(tokens, ',/')
defaultS = False
else:
defaultS = True
lastprod = prod
while True:
# all productions exhausted?
try:
prod = prods[-1].nextProd(token=None)
except Done, e:
# ok
prod = None
except Missing, e:
prod = None
# last was a S operator which may End a Sequence, then ok
if hasattr(lastprod, 'mayEnd') and not lastprod.mayEnd:
wellformed = False
self._log.error(u'%s: %s' % (name, e))
except ParseError, e:
prod = None
wellformed = False
self._log.error(u'%s: %s' % (name, e))
else:
if prods[-1].optional:
prod = None
elif prod and prod.optional:
# ignore optional
continue
if prod and not prod.optional:
wellformed = False
self._log.error(u'%s: Missing token for production %r'
% (name, str(prod)))
break
elif len(prods) > 1:
# nested exhausted, next in parent
prods.pop()
else:
break
# trim S from end
seq.rstrip()
return wellformed, seq, store, tokens
class PreDef(object):
"""Predefined Prod definition for use in productions definition
for ProdParser instances.
"""
types = cssutils.cssproductions.CSSProductions
@staticmethod
def CHAR(name='char', char=u',', toSeq=None, toStore=None, stop=False,
nextSor=False):
"any CHAR"
return Prod(name=name, match=lambda t, v: v == char,
toSeq=toSeq,
toStore=toStore,
stop=stop,
nextSor=nextSor)
@staticmethod
def comma(toStore=None):
return PreDef.CHAR(u'comma', u',', toStore=toStore)
@staticmethod
def dimension(toStore=None, nextSor=False):
return Prod(name=u'dimension',
match=lambda t, v: t == PreDef.types.DIMENSION,
toStore=toStore,
toSeq=lambda t, tokens: (t[0], cssutils.helper.normalize(t[1])),
nextSor=nextSor)
@staticmethod
def function(toSeq=None, toStore=None, nextSor=False):
return Prod(name=u'function',
match=lambda t, v: t == PreDef.types.FUNCTION,
toSeq=toSeq,
toStore=toStore,
nextSor=nextSor)
@staticmethod
def funcEnd(toStore=None, stop=False, nextSor=False):
")"
return PreDef.CHAR(u'end FUNC ")"', u')',
toStore=toStore,
stop=stop,
nextSor=nextSor)
@staticmethod
def ident(toStore=None, nextSor=False):
return Prod(name=u'ident',
match=lambda t, v: t == PreDef.types.IDENT,
toStore=toStore,
nextSor=nextSor)
@staticmethod
def number(toStore=None, nextSor=False):
return Prod(name=u'number',
match=lambda t, v: t == PreDef.types.NUMBER,
toStore=toStore,
nextSor=nextSor)
@staticmethod
def string(toStore=None, nextSor=False):
"string delimiters are removed by default"
return Prod(name=u'string',
match=lambda t, v: t == PreDef.types.STRING,
toStore=toStore,
toSeq=lambda t, tokens: (t[0], cssutils.helper.stringvalue(t[1])),
nextSor=nextSor)
@staticmethod
def percentage(toStore=None, nextSor=False):
return Prod(name=u'percentage',
match=lambda t, v: t == PreDef.types.PERCENTAGE,
toStore=toStore,
nextSor=nextSor)
@staticmethod
def S(name=u'whitespace', optional=True, toSeq=None, toStore=None, nextSor=False,
mayEnd=False):
return Prod(name=name,
match=lambda t, v: t == PreDef.types.S,
optional=optional,
toSeq=toSeq,
toStore=toStore,
mayEnd=mayEnd)
@staticmethod
def unary(optional=True, toStore=None):
"+ or -"
return Prod(name=u'unary +-', match=lambda t, v: v in (u'+', u'-'),
optional=optional,
toStore=toStore)
@staticmethod
def uri(toStore=None, nextSor=False):
"'url(' and ')' are removed and URI is stripped"
return Prod(name=u'URI',
match=lambda t, v: t == PreDef.types.URI,
toStore=toStore,
toSeq=lambda t, tokens: (t[0], cssutils.helper.urivalue(t[1])),
nextSor=nextSor)
@staticmethod
def hexcolor(toStore=None, toSeq=None, nextSor=False):
return Prod(name='HEX color',
match=lambda t, v: t == PreDef.types.HASH and
len(v) == 4 or len(v) == 7,
toStore=toStore,
toSeq=toSeq,
nextSor=nextSor
)