mirror of
git://github.com/kovidgoyal/calibre.git
synced 2025-12-23 09:45:40 +01:00
Fix parsing of HTML that has a DOCTYPE declaring it as XHTML but no xmlns attribute. Also coerce downloaded article titles to unicode
This commit is contained in:
parent
14171d419c
commit
cc7b7ebff1
3 changed files with 11 additions and 2 deletions
|
|
@ -764,7 +764,14 @@ def _parse_xhtml(self, data):
|
|||
# Convert to Unicode and normalize line endings
|
||||
data = self.oeb.decode(data)
|
||||
data = self.oeb.html_preprocessor(data)
|
||||
orig_data = data
|
||||
|
||||
# Remove DOCTYPE declaration as it messes up parsing
|
||||
# In particular, it causes a tostring to insert xmlns
|
||||
# declarations, which messes up the coercing logic
|
||||
idx = data.find('<html')
|
||||
if idx > -1:
|
||||
data = data[idx:]
|
||||
|
||||
# Try with more & more drastic measures to parse
|
||||
def first_pass(data):
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@ def __init__(self, id, title, url, author, summary, published, content):
|
|||
entity_to_unicode, self.title)
|
||||
except:
|
||||
pass
|
||||
if not isinstance(self.title, unicode):
|
||||
self.title = self.title.decode('utf-8', 'replace')
|
||||
self.url = url
|
||||
self.author = author
|
||||
if author and not isinstance(author, unicode):
|
||||
|
|
|
|||
|
|
@ -980,7 +980,7 @@ def article_downloaded(self, request, result):
|
|||
|
||||
def error_in_article_download(self, request, traceback):
|
||||
self.jobs_done += 1
|
||||
self.log.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
||||
self.log.error(_(u'Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
|
||||
self.log.debug(traceback)
|
||||
self.log.debug('\n')
|
||||
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
|
||||
|
|
|
|||
Loading…
Reference in a new issue