Updated Economist recipe for new website layout

2026-05-05 11:23:52 +02:00 · 2010-06-10 23:09:10 -06:00 · 2010-06-10 23:09:10 -06:00 · 0ed7568ae1
commit 0ed7568ae1
parent efe64efe25
2 changed files with 13 additions and 10 deletions
--- a/resources/recipes/economist.recipe
+++ b/resources/recipes/economist.recipe
@@ -24,9 +24,10 @@ class Economist(BasicNewsRecipe):
    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
-            dict(attrs={'class':['dblClkTrk']})]
-    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+            dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
+    keep_only_tags = [dict(id='ec-article-body')]
    needs_subscription = True
+    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

@@ -87,7 +88,7 @@ def economist_parse_index(self):
                continue
            a = tag.find('a', href=True)
            if a is not None:
-                url=a['href'].replace('displaystory', 'PrinterFriendly').strip()
+                url=a['href'].split('?')[0]+'/print'
                if url.startswith('Printer'):
                    url = '/'+url
                if url.startswith('/'):
--- a/resources/recipes/economist_free.recipe
+++ b/resources/recipes/economist_free.recipe
@@ -17,8 +17,9 @@ class Economist(BasicNewsRecipe):
    oldest_article = 7.0
    cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
    remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
-            dict(attrs={'class':['dblClkTrk']})]
-    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+            dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
+    keep_only_tags = [dict(id='ec-article-body')]
+    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

@@ -88,19 +89,20 @@ def process_eco_feed_article(self, args):
        br = browser()
        ret = br.open(url)
        raw = ret.read()
-        url = br.geturl().replace('displaystory', 'PrinterFriendly').strip()
+        url = br.geturl().split('?')[0]+'/print'
        root = html.fromstring(raw)
-        matches = root.xpath('//*[@class = "article-section"]')
+        matches = root.xpath('//*[@class = "ec-article-info"]')
        feedtitle = 'Miscellaneous'
        if matches:
-            feedtitle = string.capwords(html.tostring(matches[0], method='text',
-                    encoding=unicode))
+            feedtitle = string.capwords(html.tostring(matches[-1], method='text',
+                    encoding=unicode).split('|')[-1].strip())
        return (i, feedtitle, url, title, description, author, published)

    def eco_article_found(self, req, result):
        from calibre.web.feeds import Article
        i, feedtitle, link, title, description, author, published = result
-        self.log('Found print version for article:', title)
+        self.log('Found print version for article:', title, 'in', feedtitle,
+                'at', link)

        a = Article(i, title, link, author, description, published, '')