diff --git a/Changelog.yaml b/Changelog.yaml index 35365d2c13..68149e6a21 100644 --- a/Changelog.yaml +++ b/Changelog.yaml @@ -19,6 +19,65 @@ # new recipes: # - title: +- version: 0.8.23 + date: 2011-10-21 + + new features: + - title: "Drivers for T-Mobile Move, new Pandigital Novel, New Onyx Boox and Freescale MX 515" + + - title: "SONY T1 driver: Support for periodicals and better timezone detection" + + - title: "Add a remove cover entry to the right click menu of the cover display in the right panel" + tickets: [874689] + + bug fixes: + - title: "Amazon metadata download: Fix for change in Amazon website that broke downloading metadata." + tickets: [878395] + + - title: "MOBI metadata: When reading titles from MOBI files only use the title in the PDB header if there is no long title in the EXTH header" + tickets: [ 875243 ] + + - title: "Fix regression that broke use of complex custom columns in save to disk templates." + tickets: [877366] + + - title: "Fix regression that broke reading metadata from CHM files" + + - title: "Fix a bug that broke conversion of some zipped up HTML files with non ascii filenames on certain windows installs." + tickets: [873288] + + - title: "RTF Input: Fix bug in handling of paragraph separators." + tickets: [863735] + + - title: "Fix a regression that broke downloading certain periodicals for the Kindle." + tickets: [875595] + + - title: "Fix regression that broke updating of covers inside ebook files when saving to disk" + + - title: "Fix regression breaking editing the 'show in tag browser' checkbox in custom column setup editing" + + - title: "Fix typo that broke stopping selected jobs in 0.8.22" + + improved recipes: + - Columbus Dispatch + - Ming Pao + - La Republica + - Korea Times + - USA Today + - CNN + - Liberation + - El Pais + - Helsingin Sanomat + + new recipes: + - title: Kyugyhang, Hankyoreh and Hankyoreh21 + author: Seongkyoun Yoo. 
+ + - title: English Kathimerini + author: Thomas Scholl + + - title: Various French news sources + author: Aurelien Chabot. + - version: 0.8.22 date: 2011-10-14 diff --git a/recipes/20minutes.recipe b/recipes/20minutes.recipe index 84dcd226e1..683f89fac9 100644 --- a/recipes/20minutes.recipe +++ b/recipes/20minutes.recipe @@ -4,7 +4,6 @@ ''' 20minutes.fr ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class Minutes(BasicNewsRecipe): diff --git a/recipes/columbusdispatch.recipe b/recipes/columbusdispatch.recipe index e021f55048..adc47be976 100644 --- a/recipes/columbusdispatch.recipe +++ b/recipes/columbusdispatch.recipe @@ -14,67 +14,43 @@ class ColumbusDispatchRecipe(BasicNewsRecipe): use_embedded_content = False remove_empty_feeds = True oldest_article = 1.2 - max_articles_per_feed = 100 + use_embedded_content = False no_stylesheets = True - remove_javascript = True - encoding = 'utf-8' - # Seems to work best, but YMMV - simultaneous_downloads = 2 - + auto_cleanup = True + #auto_cleanup_keep = '//div[@id="story-photos"]' # Feeds from http://www.dispatch.com/live/content/rss/index.html - feeds = [] - feeds.append((u'News: Local and state news', u'http://www.dispatch.com/live/static/crt/2_rss_localnews.xml')) - feeds.append((u'News: National news', u'http://www.dispatch.com/live/static/crt/2_rss_nationalnews.xml')) - feeds.append((u'News: Editorials', u'http://www.dispatch.com/live/static/crt/2_rss_editorials.xml')) - feeds.append((u'News: Columnists', u'http://www.dispatch.com/live/static/crt/2_rss_columnists.xml')) - feeds.append((u'News: Health news', u'http://www.dispatch.com/live/static/crt/2_rss_health.xml')) - feeds.append((u'News: Science news', u'http://www.dispatch.com/live/static/crt/2_rss_science.xml')) - feeds.append((u'Sports: OSU football', u'http://www.dispatch.com/live/static/crt/2_rss_osufootball.xml')) - feeds.append((u'Sports: OSU men\'s basketball', u'http://www.dispatch.com/live/static/crt/2_rss_osumensbball.xml')) - 
feeds.append((u'Sports: OSU women\'s basketball', u'http://www.dispatch.com/live/static/crt/2_rss_osuwomensbball.xml')) - feeds.append((u'Sports: OSU sports', u'http://www.dispatch.com/live/static/crt/2_rss_osusports.xml')) - feeds.append((u'Sports: Blue Jackets', u'http://www.dispatch.com/live/static/crt/2_rss_bluejackets.xml')) - feeds.append((u'Sports: Crew', u'http://www.dispatch.com/live/static/crt/2_rss_crew.xml')) - feeds.append((u'Sports: Clippers', u'http://www.dispatch.com/live/static/crt/2_rss_clippers.xml')) - feeds.append((u'Sports: Indians', u'http://www.dispatch.com/live/static/crt/2_rss_indians.xml')) - feeds.append((u'Sports: Reds', u'http://www.dispatch.com/live/static/crt/2_rss_reds.xml')) - feeds.append((u'Sports: Golf', u'http://www.dispatch.com/live/static/crt/2_rss_golf.xml')) - feeds.append((u'Sports: Outdoors', u'http://www.dispatch.com/live/static/crt/2_rss_outdoors.xml')) - feeds.append((u'Sports: Cavs/NBA', u'http://www.dispatch.com/live/static/crt/2_rss_cavaliers.xml')) - feeds.append((u'Sports: High Schools', u'http://www.dispatch.com/live/static/crt/2_rss_highschools.xml')) - feeds.append((u'Sports: Browns', u'http://www.dispatch.com/live/static/crt/2_rss_browns.xml')) - feeds.append((u'Sports: Bengals', u'http://www.dispatch.com/live/static/crt/2_rss_bengals.xml')) - feeds.append((u'Sports: Auto Racing', u'http://www.dispatch.com/live/static/crt/2_rss_autoracing.xml')) - feeds.append((u'Business News', u'http://www.dispatch.com/live/static/crt/2_rss_business.xml')) - feeds.append((u'Features: Weekender', u'http://www.dispatch.com/live/static/crt/2_rss_weekender.xml')) - feeds.append((u'Features: Life and Arts', u'http://www.dispatch.com/live/static/crt/2_rss_lifearts.xml')) - feeds.append((u'Features: Food', u'http://www.dispatch.com/live/static/crt/2_rss_food.xml')) - feeds.append((u'Features: NOW! 
for kids', u'http://www.dispatch.com/live/static/crt/2_rss_now.xml')) - feeds.append((u'Features: Travel', u'http://www.dispatch.com/live/static/crt/2_rss_travel.xml')) - feeds.append((u'Features: Home and Garden', u'http://www.dispatch.com/live/static/crt/2_rss_homegarden.xml')) - feeds.append((u'Features: Faith and Values', u'http://www.dispatch.com/live/static/crt/2_rss_faithvalues.xml')) - #feeds.append((u'', u'')) + feeds = [ +('Local', + 'http://www.dispatch.com/content/syndication/news_local-state.xml'), +('National', + 'http://www.dispatch.com/content/syndication/news_national.xml'), +('Business', + 'http://www.dispatch.com/content/syndication/news_business.xml'), +('Editorials', + 'http://www.dispatch.com/content/syndication/opinion_editorials.xml'), +('Columnists', + 'http://www.dispatch.com/content/syndication/opinion_columns.xml'), +('Life and Arts', + 'http://www.dispatch.com/content/syndication/lae_life-and-arts.xml'), + ('OSU Sports', + 'http://www.dispatch.com/content/syndication/sports_osu.xml'), + ('Auto Racing', + 'http://www.dispatch.com/content/syndication/sports_auto-racing.xml'), + ('Outdoors', + 'http://www.dispatch.com/content/syndication/sports_outdoors.xml'), + ('Bengals', + 'http://www.dispatch.com/content/syndication/sports_bengals.xml'), + ('Indians', + 'http://www.dispatch.com/content/syndication/sports_indians.xml'), + ('Clippers', + 'http://www.dispatch.com/content/syndication/sports_clippers.xml'), + ('Crew', + 'http://www.dispatch.com/content/syndication/sports_crew.xml'), + ('Reds', + 'http://www.dispatch.com/content/syndication/sports_reds.xml'), + ('Blue Jackets', + 'http://www.dispatch.com/content/syndication/sports_bluejackets.xml'), +] - keep_only_tags = [] - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'colhed'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'hed'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'subhed'})) - keep_only_tags.append(dict(name = 'div', attrs = 
{'class': 'date'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'byline'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'srcline'})) - keep_only_tags.append(dict(name = 'div', attrs = {'class': 'body'})) - - remove_tags = [] - remove_tags.append(dict(name = 'div', attrs = {'id': 'middle-story-ad-container'})) - - extra_css = ''' - body {font-family:verdana,arial,helvetica,geneva,sans-serif ;} - a {text-decoration: none; color: blue;} - div.colhed {font-weight: bold;} - div.hed {font-size: xx-large; font-weight: bold; margin-bottom: 0.2em;} - div.subhed {font-size: large;} - div.date {font-size: x-small; font-style: italic; color: #666666; margin-top: 0.4em; margin-bottom: 0.4em;} - div.byline, div.srcline {font-size: small; color: #696969;} - ''' diff --git a/recipes/frandroid.recipe b/recipes/frandroid.recipe index 0ad46dee4e..76c42f7360 100644 --- a/recipes/frandroid.recipe +++ b/recipes/frandroid.recipe @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- -class BasicUserRecipe1318572550(AutomaticNewsRecipe): + +from calibre.web.feeds.news import BasicNewsRecipe +class BasicUserRecipe1318572550(BasicNewsRecipe): title = u'FrAndroid' oldest_article = 2 max_articles_per_feed = 100 diff --git a/recipes/googlemobileblog.recipe b/recipes/googlemobileblog.recipe index c33c02db79..8d755e4a26 100644 --- a/recipes/googlemobileblog.recipe +++ b/recipes/googlemobileblog.recipe @@ -1,5 +1,8 @@ # -*- coding: utf-8 -*- -class BasicUserRecipe1318572445(AutomaticNewsRecipe): + +from calibre.web.feeds.news import BasicNewsRecipe + +class BasicUserRecipe1318572445(BasicNewsRecipe): title = u'Google Mobile Blog' oldest_article = 7 max_articles_per_feed = 100 diff --git a/recipes/hankyoreh.recipe b/recipes/hankyoreh.recipe index 3afd3dd5dd..8212765b66 100644 --- a/recipes/hankyoreh.recipe +++ b/recipes/hankyoreh.recipe @@ -3,34 +3,31 @@ ''' Profile to download The Hankyoreh ''' -import re from calibre.web.feeds.news import BasicNewsRecipe -from 
calibre.ebooks.BeautifulSoup import BeautifulSoup - class Hankyoreh(BasicNewsRecipe): title = u'Hankyoreh' language = 'ko' description = u'The Hankyoreh News articles' - __author__ = 'Seongkyoun Yoo' + __author__ = 'Seongkyoun Yoo' oldest_article = 5 recursions = 1 max_articles_per_feed = 5 no_stylesheets = True keep_only_tags = [ - dict(name='tr', attrs={'height':['60px']}), - dict(id=['fontSzArea']) + dict(name='tr', attrs={'height':['60px']}), + dict(id=['fontSzArea']) ] remove_tags = [ dict(target='_blank'), - dict(name='td', attrs={'style':['padding: 10px 8px 5px 8px;']}), - dict(name='iframe', attrs={'width':['590']}), + dict(name='td', attrs={'style':['padding: 10px 8px 5px 8px;']}), + dict(name='iframe', attrs={'width':['590']}), ] remove_tags_after = [ dict(target='_top') ] feeds = [ - ('All News','http://www.hani.co.kr/rss/'), + ('All News','http://www.hani.co.kr/rss/'), ('Politics','http://www.hani.co.kr/rss/politics/'), ('Economy','http://www.hani.co.kr/rss/economy/'), ('Society','http://www.hani.co.kr/rss/society/'), @@ -47,4 +44,4 @@ class Hankyoreh(BasicNewsRecipe): ('Multihani','http://www.hani.co.kr/rss/multihani/'), ('Lead','http://www.hani.co.kr/rss/lead/'), ('Newsrank','http://www.hani.co.kr/rss/newsrank/'), - ] \ No newline at end of file + ] diff --git a/recipes/hankyoreh21.recipe b/recipes/hankyoreh21.recipe index 85ded3b8e3..f49b4cc1ab 100644 --- a/recipes/hankyoreh21.recipe +++ b/recipes/hankyoreh21.recipe @@ -3,7 +3,6 @@ ''' Profile to download The Hankyoreh ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class Hankyoreh21(BasicNewsRecipe): @@ -23,4 +22,4 @@ class Hankyoreh21(BasicNewsRecipe): feeds = [ ('Hani21','http://h21.hani.co.kr/rss/ '), - ] \ No newline at end of file + ] diff --git a/recipes/japan_times.recipe b/recipes/japan_times.recipe index 229d5e4035..f5b90f2c05 100644 --- a/recipes/japan_times.recipe +++ b/recipes/japan_times.recipe @@ -44,7 +44,11 @@ def get_article_url(self, article): return 
rurl.partition('?')[0] def print_version(self, url): - return url.replace('/cgi-bin/','/print/') + if '/rss/' in url: + return url.replace('.jp/rss/','.jp/print/') + if '/text/' in url: + return url.replace('.jp/text/','.jp/print/') + return url def preprocess_html(self, soup): for item in soup.findAll(style=True): diff --git a/recipes/korben.recipe b/recipes/korben.recipe index cab52f2e32..fb8134b5cb 100644 --- a/recipes/korben.recipe +++ b/recipes/korben.recipe @@ -1,5 +1,7 @@ # -*- coding: utf-8 -*- -class BasicUserRecipe1318619728(AutomaticNewsRecipe): +from calibre.web.feeds.news import BasicNewsRecipe + +class BasicUserRecipe1318619728(BasicNewsRecipe): title = u'Korben' oldest_article = 7 max_articles_per_feed = 100 diff --git a/recipes/lepoint.recipe b/recipes/lepoint.recipe index 9ecc764534..9a4aab01da 100644 --- a/recipes/lepoint.recipe +++ b/recipes/lepoint.recipe @@ -4,7 +4,6 @@ ''' LePoint.fr ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class lepoint(BasicNewsRecipe): diff --git a/recipes/lexpress.recipe b/recipes/lexpress.recipe index 1b48e4778b..3de6226f1b 100644 --- a/recipes/lexpress.recipe +++ b/recipes/lexpress.recipe @@ -4,7 +4,6 @@ ''' Lexpress.fr ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe class lepoint(BasicNewsRecipe): diff --git a/recipes/ming_pao.recipe b/recipes/ming_pao.recipe index da7272ca2e..9e9522f26e 100644 --- a/recipes/ming_pao.recipe +++ b/recipes/ming_pao.recipe @@ -18,10 +18,14 @@ __ParsePFF__ = True # (HK only) Turn below to True if you wish hi-res images (Default: False) __HiResImg__ = False +# Override the date returned by the program if specifying a YYYYMMDD below +__Date__ = '' ''' Change Log: +2011/10/21: fix a bug that hi-res img is unavailable in pages parsed from source txt +2011/10/19: fix a bug in txt source parsing 2011/10/17: disable fetching of premium content, also improved txt source parsing 2011/10/04: option to get hi-res photos for the articles 2011/09/21: 
fetching "column" section is made optional. @@ -170,13 +174,22 @@ def get_dtlocal(self): return dt_local def get_fetchdate(self): - return self.get_dtlocal().strftime("%Y%m%d") + if __Date__ <> '': + return __Date__ + else: + return self.get_dtlocal().strftime("%Y%m%d") def get_fetchformatteddate(self): - return self.get_dtlocal().strftime("%Y-%m-%d") + if __Date__ <> '': + return __Date__[0:4]+'-'+__Date__[4:6]+'-'+__Date__[6:8] + else: + return self.get_dtlocal().strftime("%Y-%m-%d") def get_fetchday(self): - return self.get_dtlocal().strftime("%d") + if __Date__ <> '': + return __Date__[6:8] + else: + return self.get_dtlocal().strftime("%d") def get_cover_url(self): if __Region__ == 'Hong Kong': @@ -477,53 +490,8 @@ def parse_col_section(self, url): # preprocess those .txt and javascript based files def preprocess_raw_html(self, raw_html, url): - #raw_html = raw_html.replace(u'
\u3010', u'\u3010')
- if __HiResImg__ == True:
- # TODO: add a _ in front of an image url
- if url.rfind('news.mingpao.com') > -1:
- imglist = re.findall('src="?.*?jpg"', raw_html)
- br = mechanize.Browser()
- br.set_handle_redirect(False)
- for img in imglist:
- gifimg = img.replace('jpg"', 'gif"')
- try:
- br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
- raw_html = raw_html.replace(img, gifimg)
- except:
- # find the location of the first _
- pos = img.find('_')
- if pos > -1:
- # if found, insert _ after the first _
- newimg = img[0:pos] + '_' + img[pos:]
- raw_html = raw_html.replace(img, newimg)
- else:
- # if not found, insert _ after "
- raw_html = raw_html.replace(img[1:], '"_' + img[1:])
- elif url.rfind('life.mingpao.com') > -1:
- imglist = re.findall('src=\'?.*?jpg\'', raw_html)
- br = mechanize.Browser()
- br.set_handle_redirect(False)
- #print 'Img list: ', imglist, '\n'
- for img in imglist:
- gifimg = img.replace('jpg\'', 'gif\'')
- try:
- #print 'Original: ', url
- #print 'To append: ', "/../" + gifimg[5:len(gifimg)-1]
- gifurl = re.sub(r'dailynews.*txt', '', url)
- #print 'newurl: ', gifurl + gifimg[5:len(gifimg)-1]
- br.open_novisit(gifurl + gifimg[5:len(gifimg)-1])
- #print 'URL: ', url + "/../" + gifimg[5:len(gifimg)-1]
- #br.open_novisit(url + "/../" + gifimg[5:len(gifimg)-1])
- raw_html = raw_html.replace(img, gifimg)
- except:
- #print 'GIF not found'
- pos = img.rfind('/')
- newimg = img[0:pos+1] + '_' + img[pos+1:]
- #print 'newimg: ', newimg
- raw_html = raw_html.replace(img, newimg)
- if url.rfind('ftp') == -1 and url.rfind('_print.htm') == -1:
- return raw_html
- else:
+ new_html = raw_html
+ if url.rfind('ftp') <> -1 or url.rfind('_print.htm') <> -1:
if url.rfind('_print.htm') <> -1:
# javascript based file
splitter = re.compile(r'\n')
@@ -558,49 +526,114 @@ def preprocess_raw_html(self, raw_html, url):
photo = photo.replace('', '
')
photo = photo.replace('class="photo"', '')
new_raw_html = new_raw_html + '