mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-05-05 15:03:36 +02:00
Make handling of malformed/fragmentary HTML more robust
This commit is contained in:
parent
2330c2a88e
commit
74875fff97
1 changed file with 16 additions and 3 deletions
|
|
@@ -815,15 +815,28 @@ def first_pass(data):
|
|||
data = etree.fromstring(data, parser=RECOVER_PARSER)
|
||||
return data
|
||||
data = first_pass(data)
|
||||
# Force into the XHTML namespace
|
||||
|
||||
# Handle weird (non-HTML/fragment) files
|
||||
if barename(data.tag) != 'html':
|
||||
self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)
|
||||
nroot = etree.fromstring('<html></html>')
|
||||
has_body = False
|
||||
for child in list(data):
|
||||
if barename(child.tag) == 'body':
|
||||
has_body = True
|
||||
break
|
||||
parent = nroot
|
||||
if not has_body:
|
||||
self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)
|
||||
nroot = etree.fromstring('<html><body/></html>')
|
||||
parent = nroot[0]
|
||||
for child in list(data):
|
||||
child.getparent().remove(child)
|
||||
nroot.append(child)
|
||||
parent.append(child)
|
||||
data = nroot
|
||||
elif not namespace(data.tag):
|
||||
|
||||
# Force into the XHTML namespace
|
||||
if not namespace(data.tag):
|
||||
data.attrib['xmlns'] = XHTML_NS
|
||||
data = etree.tostring(data, encoding=unicode)
|
||||
try:
|
||||
|
|
|
|||
Loading…
Reference in a new issue