mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-02-02 07:33:35 +01:00
Fix #5991 (Import of HTML With Embedded Stylesheet Broken In 0.7.5)
This commit is contained in:
parent
5e25f706d2
commit
ce736fa6ea
1 changed file with 9 additions and 9 deletions
|
|
@@ -7,7 +7,7 @@
|
|||
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, uuid, logging, functools
|
||||
import os, re, uuid, logging
|
||||
from mimetypes import types_map
|
||||
from collections import defaultdict
|
||||
from itertools import count
|
||||
|
|
@@ -808,17 +808,17 @@ def _parse_xhtml(self, data):
|
|||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
|
||||
fromstring = functools.partial(etree.fromstring, parser=RECOVER_PARSER)
|
||||
parser = etree.XMLParser(no_network=True, huge_tree=True)
|
||||
# Try with more & more drastic measures to parse
|
||||
def first_pass(data):
|
||||
try:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError, err:
|
||||
self.oeb.log.exception('Initial parse failed:')
|
||||
repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))
|
||||
data = ENTITY_RE.sub(repl, data)
|
||||
try:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError, err:
|
||||
self.oeb.logger.warn('Parsing file %r as HTML' % self.href)
|
||||
if err.args and err.args[0].startswith('Excessive depth'):
|
||||
|
|
@@ -832,9 +832,9 @@ def first_pass(data):
|
|||
elem.text = elem.text.strip('-')
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
try:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
data = first_pass(data)
|
||||
|
||||
|
|
@@ -866,12 +866,12 @@ def first_pass(data):
|
|||
data = etree.tostring(data, encoding=unicode)
|
||||
|
||||
try:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except:
|
||||
data = data.replace(':=', '=').replace(':>', '>')
|
||||
data = data.replace('<http:/>', '')
|
||||
try:
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data, parser=parser)
|
||||
except etree.XMLSyntaxError:
|
||||
self.oeb.logger.warn('Stripping comments and meta tags from %s'%
|
||||
self.href)
|
||||
|
|
@@ -882,7 +882,7 @@ def first_pass(data):
|
|||
"<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
|
||||
'')
|
||||
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
|
||||
data = fromstring(data)
|
||||
data = etree.fromstring(data)
|
||||
elif namespace(data.tag) != XHTML_NS:
|
||||
# OEB_DOC_NS, but possibly others
|
||||
ns = namespace(data.tag)
|
||||
|
|
|
|||
Loading…
Reference in a new issue