This commit is contained in:
Kovid Goyal 2012-04-06 09:00:37 +05:30
parent 768928a5d3
commit 921769bd6a
3 changed files with 50 additions and 56 deletions

View file

@ -4,10 +4,8 @@
www.buenosairesherald.com
'''
import re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class BuenosAiresHerald(BasicNewsRecipe):
title = 'Buenos Aires Herald'
@ -62,7 +60,7 @@ def parse_index(self):
lfeeds = self.get_feeds()
for feedobj in lfeeds:
feedtitle, feedurl = feedobj
self.report_progress(0, _('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
self.report_progress(0, ('Fetching feed')+' %s...'%(feedtitle if feedtitle else feedurl))
articles = []
soup = self.index_to_soup(feedurl)
for item in soup.findAll('div', attrs={'class':'nota_texto_seccion'}):

View file

@ -1,4 +1,3 @@
import string, re
import time
import traceback
# above for debugging via stack
@ -6,22 +5,19 @@
# Allows the Python soup converter, which makes parsing easier.
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import os, time, traceback, re, urlparse, sys, cStringIO
from collections import defaultdict
from functools import partial
from contextlib import nested, closing
import os
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.web.feeds import feeds_from_index
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending
# To Do: strip ads and graphics, Current Column lacks a title.
# The News letter archive https://www.billoreilly.com/newsletterarchive is covered by other entries.
# Newsletters: Talking Points Memos covered by cat12
# Newsletters: Talking Points Memos covered by cat12
# ./ebook-convert --username xxx --password xxx
# this is derived from BasicNewsRecipe, so it can only overload those.
# this is derived from BasicNewsRecipe, so it can only overload those.
# Some of what we need is otherwise in article, so we have more copy to do than otherwise.
class OReillyPremium(BasicNewsRecipe):
title = u'OReilly Premium'
@ -42,9 +38,9 @@ class OReillyPremium(BasicNewsRecipe):
# Don't go down
recursions = 0
max_articles_per_feed = 20
debugMessages = True
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
catList = [ ["TV Archives", 'https://www.billoreilly.com/show?action=tvShowArchive', 'a', {'class':['showLinks','homeLinks']}, []],
# ["No Spin Archives", 'https://www.billoreilly.com/blog?categoryID=7', True, {'class':['blogBody'], 'style':['padding-top:10px;']}, []],
@ -53,19 +49,19 @@ class OReillyPremium(BasicNewsRecipe):
# ["Talking Points Memo", 'https://www.billoreilly.com/blog?categoryID=12', 'td', {}, []],
["Current Column", 'https://www.billoreilly.com/currentcolumn', 'span', {'class':['defaultHeader']}, []]
]
feeds = [
(u'No Spin', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=7'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Daily Briefing', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=11'),
(u'Talking Points', u'https://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=12'),
(u'Blog', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=0'),
(u'StratFor', u'http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=5')
]
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
# http://www.billoreilly.com/blog?action=blogArchive&rss=true&categoryID=8 is word for the day.
# Note: Talking Points is broken in the above model; the site changed to more Ajax-y.
# Now using RSS
def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
@ -76,7 +72,7 @@ def get_browser(self):
br['formPasswordField'] = self.password
br.submit()
return br
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, baseURL, pageURL, printString):
@ -90,17 +86,17 @@ def extractPrintURL(self, baseURL, pageURL, printString):
tag = printText.parent
tagURL = baseURL+tag['href']
return tagURL
def stripBadChars(self, inString):
    """Return *inString* with every apostrophe character removed.

    Used to sanitize scraped article text before further processing.
    """
    cleaned = inString.replace("'", "")
    return cleaned
def parseGeneric(self, baseURL):
# Does a generic parsing of the articles. There are six categories (0-5)
# Does a generic parsing of the articles. There are six categories (0-5)
# Name, URL, Soup FindAll Attr if relevant (last two are special case), articleList
# NoSpin and TV are generic
fullReturn = []
for i in range(len(self.catList)) :
for i in range(len(self.catList)) :
articleList = []
print("In "+self.catList[i][0]+", index: "+ str(i))
soup = self.index_to_soup(self.catList[i][1])
@ -110,7 +106,7 @@ def parseGeneric(self, baseURL):
# Problem: 0-2 create many in an array
# 3-5 create one.
# So no for-div for 3-5
if i == 0 :
print("Starting TV Archives")
for div in soup.findAll(self.catList[i][2], self.catList[i][3]):
@ -151,7 +147,7 @@ def parseGeneric(self, baseURL):
print("Returning")
# print fullReturn
return fullReturn
# build_index() starts with:
# try:
@ -161,7 +157,7 @@ def parseGeneric(self, baseURL):
# self.report_progress(0, _('Got feeds from index page'))
# except NotImplementedError:
# feeds = self.parse_feeds()
# which in turn is from __init__.py
#def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100,
# log=default_log):
@ -177,10 +173,10 @@ def parseGeneric(self, baseURL):
# max_articles_per_feed=max_articles_per_feed)
# feeds.append(pfeed)
# return feeds
# use_embedded_content defaults to None, at which point if the content is > 2K, it is used as the article.
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -201,7 +197,7 @@ def parse_index(self):
masterList = self.parseGeneric(baseURL)
#print(masterList)
return masterList
def preprocess_html(self, soup):
print("In preprocess_html")
refresh = soup.find('meta', {'http-equiv':'refresh'})
@ -210,22 +206,22 @@ def preprocess_html(self, soup):
content = refresh.get('content').partition('=')[2]
raw = self.browser.open('https://www.billoreilly.com'+content).read()
return BeautifulSoup(raw.decode('cp1252', 'replace'))
def build_index(self):
print("In OReilly build_index()\n\n")
feedsRSS = []
self.report_progress(0, _('Fetching feeds...'))
self.report_progress(0, ('Fetching feeds...'))
#try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed,
log=self.log)
self.report_progress(0, _('Got feeds from index page'))
self.report_progress(0, ('Got feeds from index page'))
#except NotImplementedError:
# feeds = self.parse_feeds()
# Now add regular feeds.
# Now add regular feeds.
feedsRSS = self.parse_feeds()
print ("feedsRSS is type "+feedsRSS.__class__.__name__)
for articles in feedsRSS:
print("articles is type "+articles.__class__.__name__)
print("Title:" + articles.title)
@ -235,9 +231,9 @@ def build_index(self):
#feeds = FeedCollection(feeds)
self.report_progress(0, _('Trying to download cover...'))
self.report_progress(0, ('Trying to download cover...'))
self.download_cover()
self.report_progress(0, _('Generating masthead...'))
self.report_progress(0, ('Generating masthead...'))
self.masthead_path = None
try:
@ -317,7 +313,7 @@ def build_index(self):
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
self.report_progress(0, ('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll()
@ -331,8 +327,8 @@ def build_index(self):
with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
fi.write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
self.report_progress(1, ('Feeds downloaded to %s')%index)
return index

View file

@ -1,9 +1,9 @@
# Test with "\Program Files\Calibre2\ebook-convert.exe" RealClear.recipe .epub --test -vv --debug-pipeline debug
import string, re
import re
import time
from urlparse import urlparse
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString
from calibre.ebooks.BeautifulSoup import NavigableString
class RealClear(BasicNewsRecipe):
title = u'Real Clear'
@ -23,8 +23,8 @@ class RealClear(BasicNewsRecipe):
recursions = 0
max_articles_per_feed = 400
debugMessages = True
# Numeric parameter is type, controls whether we look for
# Numeric parameter is type, controls whether we look for
feedsets = [
["Politics", "http://www.realclearpolitics.com/index.xml", 0],
["Policy", "http://www.realclearpolicy.com/index.xml", 0],
@ -41,17 +41,17 @@ class RealClear(BasicNewsRecipe):
# Hints to extractPrintURL.
# First column is the URL snippet. Then the string to search for as text, and the attributes to look for above it. Start with attributes and drill down.
phUrlSnip, phLinkText, phMainSearch, phHrefSearch = range(4)
printhints = [ ["realclear", "", '' , 'printpage'],
["billoreilly.com", "Print this entry", 'a', ''],
["billoreilly.com", "Print This Article", 'a', ''],
["politico.com", "Print", 'a', 'share-print'],
["politico.com", "Print", 'a', 'share-print'],
["nationalreview.com", ">Print<", 'a', ''],
["reason.com", "", 'a', 'printer']
# The following are not supported due to JavaScripting, and would require obfuscated_article to handle
# forbes,
# forbes,
# usatoday - just prints with all current crap anyhow
]
# RCP - look for a strange compound. See http://www.realclearpolitics.com/articles/2012/01/24/in_speech_obama_to_call_for_fairness_--_and_four_more_years_112879.html
# The print link isn't obvious, and only the end is needed (the -full append.) So maybe try that first?
@ -64,7 +64,7 @@ class RealClear(BasicNewsRecipe):
# from http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1
# to http://www.nytimes.com/2012/01/22/business/apple-america-and-a-squeezed-middle-class.html?_r=1&pagewanted=all
# which is at link rel="canonical" and at <meta property="og:url" or look for "Single Page"
# Returns the best-guess print url.
# The second parameter (pageURL) is returned if nothing is found.
def extractPrintURL(self, pageURL):
@ -104,7 +104,7 @@ def extractPrintURL(self, pageURL):
# print(soup)
print("end soup\n\n");
continue
print(printFind)
if isinstance(printFind, NavigableString)==False:
if printFind['href'] is not None:
@ -130,7 +130,7 @@ def get_browser(self):
print("In get_browser")
br = BasicNewsRecipe.get_browser()
return br
def parseRSS(self, index) :
if self.debugMessages == True :
print("\n\nStarting "+self.feedsets[index][0])
@ -160,7 +160,7 @@ def parseRSS(self, index) :
pubDateEl = div.find("pubDate")
if pubDateEl is None :
pubDateEl = div.find("pubdate")
if pubDateEl is None :
if pubDateEl is None :
pubDate = time.strftime('%a, %d %b')
else :
pubDate = pubDateEl.contents[0]
@ -176,7 +176,7 @@ def parseRSS(self, index) :
pubdate = time.strftime('%a, %d %b')
articleList.append(dict(title=title, url=url, date=pubdate, description=description, content=''))
return articleList
# calibre.web.feeds.news.BasicNewsRecipe.parse_index() fetches the list of articles.
# returns a list of tuple ('feed title', list of articles)
# {
@ -189,8 +189,8 @@ def parseRSS(self, index) :
# this is used instead of BasicNewsRecipe.parse_feeds().
def parse_index(self):
# Parse the page into Python Soup
articleList = []
#articleList = []
ans = []
feedsCount = len(self.feedsets)
for x in range(0,feedsCount): # should be ,4
@ -200,5 +200,5 @@ def parse_index(self):
if self.debugMessages == True :
print(ans)
return ans