mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-05-08 22:43:31 +02:00
News download: Handle HTML entities in article titles
This commit is contained in:
parent
8e248482aa
commit
af8f3b56ce
1 changed file with 11 additions and 2 deletions
|
|
@ -5,10 +5,11 @@
|
|||
'''
|
||||
Contains the logic for parsing feeds.
|
||||
'''
|
||||
import time, logging, traceback, copy
|
||||
import time, logging, traceback, copy, re
|
||||
from datetime import datetime
|
||||
|
||||
from calibre.web.feeds.feedparser import parse
|
||||
from calibre import entity_to_unicode
|
||||
from lxml import html
|
||||
|
||||
class Article(object):
|
||||
|
|
@ -19,6 +20,12 @@ def __init__(self, id, title, url, summary, published, content):
|
|||
self.downloaded = False
|
||||
self.id = id
|
||||
self.title = title.strip() if title else title
|
||||
try:
|
||||
self.title = re.sub(r'&(\S+);',
|
||||
entity_to_unicode, self.title)
|
||||
print 11111, repr(self.title)
|
||||
except:
|
||||
pass
|
||||
self.url = url
|
||||
self.summary = summary
|
||||
if summary and not isinstance(summary, unicode):
|
||||
|
|
@ -37,6 +44,7 @@ def __init__(self, id, title, url, summary, published, content):
|
|||
self.date = published
|
||||
self.utctime = datetime(*self.date[:6])
|
||||
self.localtime = self.utctime + self.time_offset
|
||||
|
||||
|
||||
def __repr__(self):
|
||||
return \
|
||||
|
|
@ -91,7 +99,8 @@ def populate_from_feed(self, feed, title=None, oldest_article=7,
|
|||
if len(self.articles) >= max_articles_per_feed:
|
||||
break
|
||||
self.parse_article(item)
|
||||
|
||||
|
||||
|
||||
def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
|
||||
max_articles_per_feed=100):
|
||||
self.title = title if title else _('Unknown feed')
|
||||
|
|
|
|||
Loading…
Reference in a new issue