diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index a2d5135045..c656450990 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -5,62 +5,59 @@
'''
nytimes.com
'''
-import re
-import time
-from calibre import entity_to_unicode
+import re, string, time
+from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
-Comment, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe):
- title = 'New York Times Top Stories'
- __author__ = 'GRiker'
- language = 'en'
- requires_version = (0, 7, 5)
- description = 'Top Stories from the New York Times'
+ # set headlinesOnly to True for the headlines-only version
+ headlinesOnly = True
- # List of sections typically included in Top Stories. Use a keyword from the
- # right column in the excludeSectionKeywords[] list to skip downloading that section
- sections = {
- 'arts' : 'Arts',
- 'business' : 'Business',
- 'diningwine' : 'Dining & Wine',
- 'editorials' : 'Editorials',
- 'health' : 'Health',
- 'magazine' : 'Magazine',
- 'mediaadvertising' : 'Media & Advertising',
- 'newyorkregion' : 'New York/Region',
- 'oped' : 'Op-Ed',
- 'politics' : 'Politics',
- 'science' : 'Science',
- 'sports' : 'Sports',
- 'technology' : 'Technology',
- 'topstories' : 'Top Stories',
- 'travel' : 'Travel',
- 'us' : 'U.S.',
- 'world' : 'World'
- }
+ # includeSections: List of sections to include. If empty, all sections found will be included.
+ # Otherwise, only the sections named will be included. For example,
+ #
+ # includeSections = ['Politics','Sports']
+ #
+ # would cause only the Politics and Sports sections to be included.
- # Add section keywords from the right column above to skip that section
- # For example, to skip sections containing the word 'Sports' or 'Dining', use:
- # excludeSectionKeywords = ['Sports', 'Dining']
- # Fetch only Business and Technology
- # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
- # Fetch only Top Stories
- # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
- # By default, no sections are skipped.
- excludeSectionKeywords = []
+ includeSections = [] # by default, all sections included
+
+ # excludeSections: List of sections to exclude. If empty, no sections will be excluded.
+ # Otherwise, the sections named will be excluded. For example,
+ #
+ # excludeSections = ['Politics','Sports']
+ #
+ # would cause the Politics and Sports sections to be excluded. This parameter can be used
+ # in conjunction with includeSections although in most cases using one or the other, but
+ # not both, is sufficient.
+
+ excludeSections = []
# one_picture_per_article specifies that calibre should only use the first image
# from an article (if one exists). If one_picture_per_article = True, the image
# will be moved to a location between the headline and the byline.
# If one_picture_per_article = False, all images from the article will be included
# and shown in their original location.
+
one_picture_per_article = True
# The maximum number of articles that will be downloaded
- max_articles_per_feed = 40
+ max_articles_per_feed = 100
+
+
+ if headlinesOnly:
+ title='New York Times Headlines'
+ description = 'Headlines from the New York Times'
+ else:
+ title='New York Times'
+ description = 'Today\'s New York Times'
+
+ __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+ language = 'en'
+ requires_version = (0, 7, 5)
+
timefmt = ''
needs_subscription = True
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
+ 'metaFootnote',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
'relatedSearchesModule',
'side_tool',
'singleAd',
- 'subNavigation clearfix',
- 'subNavigation tabContent active',
- 'subNavigation tabContent active clearfix',
+ re.compile('^subNavigation'),
+ re.compile('^leaderboard'),
+ re.compile('^module'),
]}),
dict(id=[
'adxLeaderboard',
+ 'adxSponLink',
'archive',
'articleExtras',
'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
'footer',
'header',
'header_search',
+ 'inlineBox',
'login',
'masthead',
'masthead-nav',
'memberTools',
'navigation',
'portfolioInline',
+ 'readerReviews',
+ 'readerReviewsCount',
'relatedArticles',
+ 'relatedTopics',
'respond',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
- dict(name=['script', 'noscript', 'style'])]
-
+ dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True
- extra_css = '.headline {text-align: left;}\n \
- .byline {font-family: monospace; \
- text-align: left; \
- margin-top: 0px; \
- margin-bottom: 0px;}\n \
- .dateline {font-size: small; \
- margin-top: 0px; \
- margin-bottom: 0px;}\n \
- .timestamp {font-size: small; \
- margin-top: 0px; \
- margin-bottom: 0px;}\n \
- .source {text-align: left;}\n \
- .image {text-align: center;}\n \
- .credit {text-align: right; \
- font-size: small; \
- margin-top: 0px; \
- margin-bottom: 0px;}\n \
- .articleBody {text-align: left;}\n \
- .authorId {text-align: left; \
- font-style: italic;}\n '
+ extra_css = '''
+ .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+ .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .timestamp { text-align: left; font-size: small; }
+ .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ a:link {text-decoration: none; }
+ .articleBody { }
+ .authorId {text-align: left; }
+ .image {text-align: center;}
+ .source {text-align: left; }'''
- def dump_ans(self, ans) :
+ def filter_ans(self, ans) :
total_article_count = 0
- for section in ans :
+ idx = 0
+ idx_max = len(ans)-1
+ while idx <= idx_max:
+ if self.includeSections != []:
+ if ans[idx][0] not in self.includeSections:
+ print "SECTION NOT INCLUDED: ",ans[idx][0]
+ del ans[idx]
+ idx_max = idx_max-1
+ continue
+ if ans[idx][0] in self.excludeSections:
+ print "SECTION EXCLUDED: ",ans[idx][0]
+ del ans[idx]
+ idx_max = idx_max-1
+ continue
if self.verbose:
- self.log("section %s: %d articles" % (section[0], len(section[1])) )
- for article in section[1]:
+ self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+ for article in ans[idx][1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
+ idx = idx+1
+
self.log( "Queued %d articles" % total_article_count )
+ return ans
def fixChars(self,string):
# Replace lsquo (\x91)
- fixed = re.sub("\x91","‘",string)
+ fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
- fixed = re.sub("\x92","’",fixed)
+ fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
- fixed = re.sub("\x93","“",fixed)
+ fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
- fixed = re.sub("\x94","”",fixed)
+ fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
- fixed = re.sub("\x96","–",fixed)
+ fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
- fixed = re.sub("\x97","—",fixed)
+ fixed = re.sub("\x97","—",fixed)
return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
- try:
- br.open('http://www.nytimes.com/auth/login')
- br.select_form(name='login')
- br['USERID'] = self.username
- br['PASSWORD'] = self.password
- br.submit()
- except:
- self.log("\nFailed to login")
+ br.open('http://www.nytimes.com/auth/login')
+ br.select_form(name='login')
+ br['USERID'] = self.username
+ br['PASSWORD'] = self.password
+ raw = br.submit().read()
+ if 'Please try again' in raw:
+ raise Exception('Your username and password are incorrect')
return br
def skip_ad_pages(self, soup):
@@ -213,6 +223,9 @@ def get_cover_url(self):
cover = None
return cover
+ def short_title(self):
+ return self.title
+
def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
@@ -255,157 +268,184 @@ def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
- # Replace '&' with '&amp;'
- massaged = re.sub("&","&amp;", massaged)
+ # Replace '&' with '&amp;'
+ massaged = re.sub("&","&amp;", massaged)
return self.fixChars(massaged)
else:
return description
- def parse_index(self):
+ def parse_todays_index(self):
+
+ def feed_title(div):
+ return ''.join(div.findAll(text=True, recursive=True)).strip()
+
+ articles = {}
+ key = None
+ ans = []
+ url_list = []
+
+ def handle_article(div):
+ a = div.find('a', href=True)
+ if not a:
+ return
+ url = re.sub(r'\?.*', '', a['href'])
+ if not url.startswith("http"):
+ return
+ if not url.endswith(".html"):
+ return
+ if 'podcast' in url:
+ return
+ if '/video/' in url:
+ return
+ url += '?pagewanted=all'
+ if url in url_list:
+ return
+ url_list.append(url)
+ title = self.tag_to_string(a, use_alt=True).strip()
+ description = ''
+ pubdate = strftime('%a, %d %b')
+ summary = div.find(True, attrs={'class':'summary'})
+ if summary:
+ description = self.tag_to_string(summary, use_alt=False)
+ author = ''
+ authorAttribution = div.find(True, attrs={'class':'byline'})
+ if authorAttribution:
+ author = self.tag_to_string(authorAttribution, use_alt=False)
+ else:
+ authorAttribution = div.find(True, attrs={'class':'byline'})
+ if authorAttribution:
+ author = self.tag_to_string(authorAttribution, use_alt=False)
+ feed = key if key is not None else 'Uncategorized'
+ if not articles.has_key(feed):
+ ans.append(feed)
+ articles[feed] = []
+ articles[feed].append(
+ dict(title=title, url=url, date=pubdate,
+ description=description, author=author,
+ content=''))
+
+
+ soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+
+ # Find each article
+ for div in soup.findAll(True,
+ attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+
+ if div['class'] in ['section-headline','sectionHeader']:
+ key = string.capwords(feed_title(div))
+ key = key.replace('Op-ed','Op-Ed')
+ key = key.replace('U.s.','U.S.')
+ elif div['class'] in ['story', 'story headline'] :
+ handle_article(div)
+ elif div['class'] == 'headlinesOnly multiline flush':
+ for lidiv in div.findAll('li'):
+ handle_article(lidiv)
+
+ ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return self.filter_ans(ans)
+
+ def parse_headline_index(self):
+
articles = {}
ans = []
-
- feed = key = 'All Top Stories'
- articles[key] = []
- ans.append(key)
- self.log("Scanning 1 section ...")
+ url_list = []
soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
- # Fetch the outer table
- table = soup.find('table')
- previousTable = table
+ # Fetch the content table
+ content_table = soup.find('table',{'id':'content'})
+ if content_table is None:
+ self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+ return None
- # Find the deepest table containing the stories
- while True :
- table = table.find('table')
- if table.find(text=re.compile('top stories start')) :
- previousTable = table
- continue
- else :
- table = previousTable
- break
+ # Within this table are <td> entries, each containing one or more h6 tags which represent sections
- # There are multiple subtables, find the one containing the stories
- for block in table.findAll('table') :
- if block.find(text=re.compile('top stories start')) :
- table = block
- break
- else :
- continue
+ for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+ for div_sec in td_col.findAll('div',recursive=False):
+ for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+ section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+ section_name = re.sub(r'^ *$','',section_name)
+ if section_name == '':
+ continue
+ section_name=string.capwords(section_name)
+ if section_name == 'U.s.':
+ section_name = 'U.S.'
+ elif section_name == 'Op-ed':
+ section_name = 'Op-Ed'
+ pubdate = strftime('%a, %d %b')
- # Again there are multiple subtables, find the one containing the stories
- for storyblock in table.findAll('table') :
- if storyblock.find(text=re.compile('top stories start')) :
- break
- else :
- continue
-
- skipThisSection = False
- todays_article_count = 0
- # Within this table are entries
- self.log("Fetching feed Top Stories")
- for tr in storyblock.findAllNext('tr'):
- if tr.find('span') is not None :
-
- sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
- 'times new roman,times, sans serif',
- 'times new roman, times, sans serif']})
- section = None
- bylines = []
- descriptions = []
- pubdate = None
-
- # Get the Section title
- for (x,i) in enumerate(sectionblock.contents) :
- skipThisSection = False
- # Extract the section title
- if ('Comment' in str(i.__class__)) :
- if 'start(name=' in i :
- section = i[i.find('=')+1:-2]
-
- if not self.sections.has_key(section) :
- skipThisSection = True
+ search_div = div_sec
+ for next_tag in h6_sec_name.findNextSiblings(True):
+ if next_tag.__class__.__name__ == 'Tag':
+ if next_tag.name == 'div':
+ search_div = next_tag
break
- # Check for excluded section
- if len(self.excludeSectionKeywords):
- key = self.sections[section]
- excluded = re.compile('|'.join(self.excludeSectionKeywords))
- if excluded.search(key) or articles.has_key(key):
- skipThisSection = True
- break
-
- # Get the bylines and descriptions
- if not skipThisSection :
- lines = sectionblock.contents
- contentStrings = []
-
- for line in lines:
- if not isinstance(line, Comment) and line.strip and line.strip() > "":
- contentStrings.append(line.strip())
-
- # Gather the byline/description pairs
- bylines = []
- descriptions = []
- for contentString in contentStrings:
- if contentString[0:3] == 'By ' and contentString[3].isupper() :
- bylines.append(contentString)
+ # Get the articles
+ for h3_item in search_div.findAll('h3'):
+ byline = h3_item.h6
+ if byline is not None:
+ author = self.tag_to_string(byline,use_alt=False)
else:
- descriptions.append(contentString)
-
- # Fetch the article titles and URLs
- articleCount = len(sectionblock.findAll('span'))
- todays_article_count += articleCount
- for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
- a = span.find('a', href=True)
+ author = ''
+ a = h3_item.find('a', href=True)
+ if not a:
+ continue
url = re.sub(r'\?.*', '', a['href'])
+ if not url.startswith("http"):
+ continue
+ if not url.endswith(".html"):
+ continue
+ if 'podcast' in url:
+ continue
+ if 'video' in url:
+ continue
url += '?pagewanted=all'
+ if url in url_list:
+ continue
+ url_list.append(url)
+ self.log("URL %s" % url)
+ title = self.tag_to_string(a, use_alt=True).strip()
+ desc = h3_item.find('p')
+ if desc is not None:
+ description = self.tag_to_string(desc,use_alt=False)
+ else:
+ description = ''
+ if not articles.has_key(section_name):
+ ans.append(section_name)
+ articles[section_name] = []
+ articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
- title = self.tag_to_string(a, use_alt=True)
- # prepend the section name
- title = self.sections[section] + " · " + title
- if not isinstance(title, unicode):
- title = title.decode('utf-8', 'replace')
-
- # Allow for unattributed, undescribed entries "Editor's Note"
- if i >= len(descriptions) :
- description = None
- else :
- description = descriptions[i]
-
- if len(bylines) == articleCount :
- author = bylines[i]
- else :
- author = None
-
- # Check for duplicates
- duplicateFound = False
- if len(articles[feed]) > 1:
- for article in articles[feed] :
- if url == article['url'] :
- duplicateFound = True
- break
-
- if duplicateFound:
- # Continue fetching, don't add this article
- todays_article_count -= 1
- continue
-
- if not articles.has_key(feed):
- articles[feed] = []
- articles[feed].append(
- dict(title=title, url=url, date=pubdate,
- description=description, author=author, content=''))
-# self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
-
- ans = self.sort_index_by(ans, {'Top Stories':-1})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
- self.dump_ans(ans)
- return ans
+ return self.filter_ans(ans)
+
+ def parse_index(self):
+ if self.headlinesOnly:
+ return self.parse_headline_index()
+ else:
+ return self.parse_todays_index()
+
+ def strip_anchors(self,soup):
+ paras = soup.findAll(True)
+ for para in paras:
+ aTags = para.findAll('a')
+ for a in aTags:
+ if a.img is None:
+ a.replaceWith(a.renderContents().decode('cp1252','replace'))
+ return soup
+
def preprocess_html(self, soup):
+
+ kicker_tag = soup.find(attrs={'class':'kicker'})
+ if kicker_tag: # remove Op-Ed author head shots
+ tagline = self.tag_to_string(kicker_tag)
+ if tagline=='Op-Ed Columnist':
+ img_div = soup.find('div','inlineImage module')
+ if img_div:
+ img_div.extract()
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
@@ -422,8 +462,9 @@ def postprocess_html(self,soup, True):
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
- # Move firstImg after headline
- cgFirst = soup.find(True, {'class':'columnGroup first'})
+ # Move firstImg before article body
+ #article_body = soup.find(True, {'id':'articleBody'})
+ cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ def postprocess_html(self,soup, True):
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
- self.log(">>> No class:'columnGroup first' found <<<")
- # Change class="kicker" to <h3>
- kicker = soup.find(True, {'class':'kicker'})
- if kicker and kicker.contents[0]:
- h3Tag = Tag(soup, "h3")
- h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
- use_alt=False)))
- kicker.replaceWith(h3Tag)
+ self.log(">>> No class:'columnGroup first' found <<<")
- # Change captions to italic -1
+ # Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
- emTag = Tag(soup, "em")
+ cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
- emTag.insert(0, c)
- #hrTag = Tag(soup, 'hr')
- #hrTag['class'] = 'caption_divider'
- hrTag = Tag(soup, 'div')
- hrTag['class'] = 'divider'
- emTag.insert(1, hrTag)
- caption.replaceWith(emTag)
+ cTag.insert(0, c)
+ caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
@@ -506,17 +535,6 @@ def postprocess_html(self,soup, True):
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
- # Synthesize a section header
- dsk = soup.find('meta', attrs={'name':'dsk'})
- if dsk and dsk.has_key('content'):
- hTag = Tag(soup,'h3')
- hTag['class'] = 'section'
- hTag.insert(0,NavigableString(dsk['content']))
- articleTag = soup.find(True, attrs={'id':'article'})
- if articleTag:
- articleTag.insert(0,hTag)
-
- # Add class="articleBody" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ def postprocess_html(self,soup, True):
return soup
- def strip_anchors(self,soup):
- paras = soup.findAll(True)
- for para in paras:
- aTags = para.findAll('a')
- for a in aTags:
- if a.img is None:
- a.replaceWith(a.renderContents().decode('cp1252','replace'))
- return soup
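
The includeSections/excludeSections filtering that replaces the old excludeSectionKeywords machinery lives in filter_ans(), which walks the list of (section, articles) tuples by index and deletes entries in place. A minimal standalone sketch of that loop (Python 2, to match the recipes; the section names and article dicts below are hypothetical, and plain prints stand in for calibre's logging):

    # Stand-in for the (section, articles) tuples built by parse_index()
    includeSections = []          # empty means: keep every section found
    excludeSections = ['Sports']  # any section named here is dropped

    ans = [('Politics', [{'title': 'A'}]),
           ('Sports',   [{'title': 'B'}]),
           ('World',    [{'title': 'C'}])]

    idx = 0
    idx_max = len(ans) - 1
    while idx <= idx_max:
        if includeSections != [] and ans[idx][0] not in includeSections:
            print "SECTION NOT INCLUDED: ", ans[idx][0]
            del ans[idx]              # the list shrinks in place, so idx stays put
            idx_max = idx_max - 1
            continue
        if ans[idx][0] in excludeSections:
            print "SECTION EXCLUDED: ", ans[idx][0]
            del ans[idx]
            idx_max = idx_max - 1
            continue
        idx = idx + 1

    print ans  # -> [('Politics', [...]), ('World', [...])]

Since both recipes carry an identical copy of filter_ans(), any change to this behaviour has to be applied to nytimes.recipe and nytimes_sub.recipe alike.
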
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 5452ae1c6e..ed1ba75f0f 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -5,52 +5,186 @@
'''
nytimes.com
'''
-import string, re, time
-from calibre import strftime
+import re, string, time
+from calibre import entity_to_unicode, strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-def decode(self, src):
- enc = 'utf-8'
- if 'iso-8859-1' in src:
- enc = 'cp1252'
- return src.decode(enc, 'ignore')
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe):
- title = u'New York Times'
- __author__ = 'Kovid Goyal/Nick Redding'
- language = 'en'
- requires_version = (0, 6, 36)
+ # set headlinesOnly to True for the headlines-only version
+ headlinesOnly = False
- description = 'Daily news from the New York Times (subscription version)'
- timefmt = ' [%b %d]'
+ # includeSections: List of sections to include. If empty, all sections found will be included.
+ # Otherwise, only the sections named will be included. For example,
+ #
+ # includeSections = ['Politics','Sports']
+ #
+ # would cause only the Politics and Sports sections to be included.
+
+ includeSections = [] # by default, all sections included
+
+ # excludeSections: List of sections to exclude. If empty, no sections will be excluded.
+ # Otherwise, the sections named will be excluded. For example,
+ #
+ # excludeSections = ['Politics','Sports']
+ #
+ # would cause the Politics and Sports sections to be excluded. This parameter can be used
+ # in conjunction with includeSections although in most cases using one or the other, but
+ # not both, is sufficient.
+
+ excludeSections = []
+
+ # one_picture_per_article specifies that calibre should only use the first image
+ # from an article (if one exists). If one_picture_per_article = True, the image
+ # will be moved to a location between the headline and the byline.
+ # If one_picture_per_article = False, all images from the article will be included
+ # and shown in their original location.
+
+ one_picture_per_article = True
+
+ # The maximum number of articles that will be downloaded
+ max_articles_per_feed = 100
+
+
+ if headlinesOnly:
+ title='New York Times Headlines'
+ description = 'Headlines from the New York Times'
+ else:
+ title='New York Times'
+ description = 'Today\'s New York Times'
+
+ __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+ language = 'en'
+ requires_version = (0, 7, 5)
+
+
+ timefmt = ''
needs_subscription = True
+ masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+ cover_margins = (18,18,'grey99')
+
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
- remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
- 'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
- 'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
- dict({'class':re.compile('^subNavigation')}),
- dict({'class':re.compile('^leaderboard')}),
- dict({'class':re.compile('^module')}),
- dict({'class':'metaFootnote'}),
- dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
- 'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
- 'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
- 'relatedArticles', 'relatedTopics', 'adxSponLink']),
+ remove_tags = [dict(attrs={'class':[
+ 'articleFooter',
+ 'articleTools',
+ 'columnGroup doubleRule',
+ 'columnGroup singleRule',
+ 'columnGroup last',
+ 'doubleRule',
+ 'dottedLine',
+ 'entry-meta',
+ 'entry-response module',
+ 'icon enlargeThis',
+ 'leftNavTabs',
+ 'metaFootnote',
+ 'module box nav',
+ 'nextArticleLink',
+ 'nextArticleLink clearfix',
+ 'post-tools',
+ 'relatedSearchesModule',
+ 'side_tool',
+ 'singleAd',
+ re.compile('^subNavigation'),
+ re.compile('^leaderboard'),
+ re.compile('^module'),
+ ]}),
+ dict(id=[
+ 'adxLeaderboard',
+ 'adxSponLink',
+ 'archive',
+ 'articleExtras',
+ 'articleInline',
+ 'blog_sidebar',
+ 'businessSearchBar',
+ 'cCol',
+ 'entertainmentSearchBar',
+ 'footer',
+ 'header',
+ 'header_search',
+ 'inlineBox',
+ 'login',
+ 'masthead',
+ 'masthead-nav',
+ 'memberTools',
+ 'navigation',
+ 'portfolioInline',
+ 'readerReviews',
+ 'readerReviewsCount',
+ 'relatedArticles',
+ 'relatedTopics',
+ 'respond',
+ 'side_search',
+ 'side_index',
+ 'side_tool',
+ 'toolsRight',
+ ]),
dict(name=['script', 'noscript', 'style','form','hr'])]
- encoding = decode
no_stylesheets = True
extra_css = '''
- .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
- .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
- .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
- .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+ .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
- .timestamp { font-size: small; }
- .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
- a:link {text-decoration: none; }'''
+ .timestamp { text-align: left; font-size: small; }
+ .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+ a:link {text-decoration: none; }
+ .articleBody { }
+ .authorId {text-align: left; }
+ .image {text-align: center;}
+ .source {text-align: left; }'''
+
+ def filter_ans(self, ans) :
+ total_article_count = 0
+ idx = 0
+ idx_max = len(ans)-1
+ while idx <= idx_max:
+ if self.includeSections != []:
+ if ans[idx][0] not in self.includeSections:
+ print "SECTION NOT INCLUDED: ",ans[idx][0]
+ del ans[idx]
+ idx_max = idx_max-1
+ continue
+ if ans[idx][0] in self.excludeSections:
+ print "SECTION EXCLUDED: ",ans[idx][0]
+ del ans[idx]
+ idx_max = idx_max-1
+ continue
+ if self.verbose:
+ self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+ for article in ans[idx][1]:
+ total_article_count += 1
+ if self.verbose:
+ self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
+ article['url'].encode('cp1252','replace')))
+ idx = idx+1
+
+ self.log( "Queued %d articles" % total_article_count )
+ return ans
+
+ def fixChars(self,string):
+ # Replace lsquo (\x91)
+ fixed = re.sub("\x91","‘",string)
+
+ # Replace rsquo (\x92)
+ fixed = re.sub("\x92","’",fixed)
+
+ # Replace ldquo (\x93)
+ fixed = re.sub("\x93","“",fixed)
+
+ # Replace rdquo (\x94)
+ fixed = re.sub("\x94","”",fixed)
+
+ # Replace ndash (\x96)
+ fixed = re.sub("\x96","–",fixed)
+
+ # Replace mdash (\x97)
+ fixed = re.sub("\x97","—",fixed)
+
+ return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@@ -60,22 +194,19 @@ def get_browser(self):
br['USERID'] = self.username
br['PASSWORD'] = self.password
raw = br.submit().read()
- if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
+ if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
- #open('/t/log.html', 'wb').write(raw)
return br
- def get_masthead_url(self):
- masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
- #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
- br = BasicNewsRecipe.get_browser()
- try:
- br.open(masthead)
- except:
- self.log("\nMasthead unavailable")
- masthead = None
- return masthead
-
+ def skip_ad_pages(self, soup):
+ # Skip ad pages served before actual article
+ skip_tag = soup.find(True, {'name':'skip'})
+ if skip_tag is not None:
+ self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
+ url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+ url += '?pagewanted=all'
+ self.log.warn("Skipping ad to article at '%s'" % url)
+ return self.index_to_soup(url, raw=True)
def get_cover_url(self):
cover = None
@@ -93,12 +224,57 @@ def get_cover_url(self):
return cover
def short_title(self):
- return 'New York Times'
+ return self.title
- def parse_index(self):
- self.encoding = 'cp1252'
- soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
- self.encoding = decode
+ def index_to_soup(self, url_or_raw, raw=False):
+ '''
+ OVERRIDE of class method
+ deals with various page encodings between index and articles
+ '''
+ def get_the_soup(docEncoding, url_or_raw, raw=False) :
+ if re.match(r'\w+://', url_or_raw):
+ f = self.browser.open(url_or_raw)
+ _raw = f.read()
+ f.close()
+ if not _raw:
+ raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+ else:
+ _raw = url_or_raw
+ if raw:
+ return _raw
+
+ if not isinstance(_raw, unicode) and self.encoding:
+ _raw = _raw.decode(docEncoding, 'replace')
+ massage = list(BeautifulSoup.MARKUP_MASSAGE)
+ massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+ return BeautifulSoup(_raw, markupMassage=massage)
+
+ # Entry point
+ soup = get_the_soup( self.encoding, url_or_raw )
+ contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+ docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+ if docEncoding == '' :
+ docEncoding = self.encoding
+
+ if self.verbose > 2:
+ self.log( " document encoding: '%s'" % docEncoding)
+ if docEncoding != self.encoding :
+ soup = get_the_soup(docEncoding, url_or_raw)
+
+ return soup
+
+ def massageNCXText(self, description):
+ # Kindle TOC descriptions won't render certain characters
+ if description:
+ massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+ # Replace '&' with '&amp;'
+ massaged = re.sub("&","&amp;", massaged)
+ return self.fixChars(massaged)
+ else:
+ return description
+
+ def parse_todays_index(self):
def feed_title(div):
return ''.join(div.findAll(text=True, recursive=True)).strip()
@@ -119,12 +295,13 @@ def handle_article(div):
return
if 'podcast' in url:
return
+ if '/video/' in url:
+ return
url += '?pagewanted=all'
if url in url_list:
return
url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
- #self.log("Title: %s" % title)
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ def handle_article(div):
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = key if key is not None else 'Uncategorized'
if not articles.has_key(feed):
+ ans.append(feed)
articles[feed] = []
articles[feed].append(
dict(title=title, url=url, date=pubdate,
@@ -147,46 +325,228 @@ def handle_article(div):
content=''))
+ soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
- # Find each instance of class="section-headline", class="story", class="story headline"
+
+ # Find each article
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
if div['class'] in ['section-headline','sectionHeader']:
key = string.capwords(feed_title(div))
- articles[key] = []
- ans.append(key)
- #self.log('Section: %s' % key)
-
+ key = key.replace('Op-ed','Op-Ed')
+ key = key.replace('U.s.','U.S.')
elif div['class'] in ['story', 'story headline'] :
handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
handle_article(lidiv)
-# ans = self.sort_index_by(ans, {'The Front Page':-1,
-# 'Dining In, Dining Out':1,
-# 'Obituaries':2})
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return self.filter_ans(ans)
+
+ def parse_headline_index(self):
+
+ articles = {}
+ ans = []
+ url_list = []
+
+ soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+ # Fetch the content table
+ content_table = soup.find('table',{'id':'content'})
+ if content_table is None:
+ self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+ return None
+
+ # Within this table are <td> entries, each containing one or more h6 tags which represent sections
+
+ for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+ for div_sec in td_col.findAll('div',recursive=False):
+ for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+ section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+ section_name = re.sub(r'^ *$','',section_name)
+ if section_name == '':
+ continue
+ section_name=string.capwords(section_name)
+ if section_name == 'U.s.':
+ section_name = 'U.S.'
+ elif section_name == 'Op-ed':
+ section_name = 'Op-Ed'
+ pubdate = strftime('%a, %d %b')
+
+ search_div = div_sec
+ for next_tag in h6_sec_name.findNextSiblings(True):
+ if next_tag.__class__.__name__ == 'Tag':
+ if next_tag.name == 'div':
+ search_div = next_tag
+ break
+
+ # Get the articles
+ for h3_item in search_div.findAll('h3'):
+ byline = h3_item.h6
+ if byline is not None:
+ author = self.tag_to_string(byline,use_alt=False)
+ else:
+ author = ''
+ a = h3_item.find('a', href=True)
+ if not a:
+ continue
+ url = re.sub(r'\?.*', '', a['href'])
+ if not url.startswith("http"):
+ continue
+ if not url.endswith(".html"):
+ continue
+ if 'podcast' in url:
+ continue
+ if 'video' in url:
+ continue
+ url += '?pagewanted=all'
+ if url in url_list:
+ continue
+ url_list.append(url)
+ self.log("URL %s" % url)
+ title = self.tag_to_string(a, use_alt=True).strip()
+ desc = h3_item.find('p')
+ if desc is not None:
+ description = self.tag_to_string(desc,use_alt=False)
+ else:
+ description = ''
+ if not articles.has_key(section_name):
+ ans.append(section_name)
+ articles[section_name] = []
+ articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+
+
+ ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+ return self.filter_ans(ans)
+
+ def parse_index(self):
+ if self.headlinesOnly:
+ return self.parse_headline_index()
+ else:
+ return self.parse_todays_index()
+
+ def strip_anchors(self,soup):
+ paras = soup.findAll(True)
+ for para in paras:
+ aTags = para.findAll('a')
+ for a in aTags:
+ if a.img is None:
+ a.replaceWith(a.renderContents().decode('cp1252','replace'))
+ return soup
- return ans
def preprocess_html(self, soup):
+
kicker_tag = soup.find(attrs={'class':'kicker'})
- if kicker_tag:
- if kicker_tag: # remove Op-Ed author head shots
tagline = self.tag_to_string(kicker_tag)
- #self.log("FOUND KICKER %s" % tagline)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
- #self.log("Searching for photo")
if img_div:
img_div.extract()
- #self.log("Photo deleted")
- refresh = soup.find('meta', {'http-equiv':'refresh'})
- if refresh is None:
- return soup
- content = refresh.get('content').partition('=')[2]
- raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
- return BeautifulSoup(raw.decode('cp1252', 'replace'))
+ return self.strip_anchors(soup)
+ def postprocess_html(self,soup, True):
+
+ if self.one_picture_per_article:
+ # Remove all images after first
+ largeImg = soup.find(True, {'class':'articleSpanImage'})
+ inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+ if largeImg:
+ for inlineImg in inlineImgs:
+ inlineImg.extract()
+ else:
+ if inlineImgs:
+ firstImg = inlineImgs[0]
+ for inlineImg in inlineImgs[1:]:
+ inlineImg.extract()
+ # Move firstImg before article body
+ #article_body = soup.find(True, {'id':'articleBody'})
+ cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
+ if cgFirst:
+ # Strip all sibling NavigableStrings: noise
+ navstrings = cgFirst.findAll(text=True, recursive=False)
+ [ns.extract() for ns in navstrings]
+ headline_found = False
+ tag = cgFirst.find(True)
+ insertLoc = 0
+ while True:
+ insertLoc += 1
+ if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+ headline_found = True
+ break
+ tag = tag.nextSibling
+ if not tag:
+ headline_found = False
+ break
+ if headline_found:
+ cgFirst.insert(insertLoc,firstImg)
+ else:
+ self.log(">>> No class:'columnGroup first' found <<<")
+
+ # Change captions to italic
+ for caption in soup.findAll(True, {'class':'caption'}) :
+ if caption and caption.contents[0]:
+ cTag = Tag(soup, "p", [("class", "caption")])
+ c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+ mp_off = c.find("More Photos")
+ if mp_off >= 0:
+ c = c[:mp_off]
+ cTag.insert(0, c)
+ caption.replaceWith(cTag)
+
+ # Change <nyt_headline> to <h2>
+ h1 = soup.find('h1')
+ if h1:
+ headline = h1.find("nyt_headline")
+ if headline:
+ tag = Tag(soup, "h2")
+ tag['class'] = "headline"
+ tag.insert(0, self.fixChars(headline.contents[0]))
+ h1.replaceWith(tag)
+ else:
+ # Blog entry - replace headline, remove tags
+ headline = soup.find('title')
+ if headline:
+ tag = Tag(soup, "h2")
+ tag['class'] = "headline"
+ tag.insert(0, self.fixChars(headline.contents[0]))
+ soup.insert(0, tag)
+ hrs = soup.findAll('hr')
+ for hr in hrs:
+ hr.extract()
+
+ # Change <h1> to <h3> - used in editorial blogs
+ masthead = soup.find("h1")
+ if masthead:
+ # Nuke the href
+ if masthead.a:
+ del(masthead.a['href'])
+ tag = Tag(soup, "h3")
+ tag.insert(0, self.fixChars(masthead.contents[0]))
+ masthead.replaceWith(tag)
+
+ # Change <span class="bold"> to <b>
+ for subhead in soup.findAll(True, {'class':'bold'}) :
+ if subhead.contents:
+ bTag = Tag(soup, "b")
+ bTag.insert(0, subhead.contents[0])
+ subhead.replaceWith(bTag)
+
+ divTag = soup.find('div',attrs={'id':'articleBody'})
+ if divTag:
+ divTag['class'] = divTag['id']
+
+ # Add class="authorId" to <div> so we can format with CSS
+ divTag = soup.find('div',attrs={'id':'authorId'})
+ if divTag and divTag.contents[0]:
+ tag = Tag(soup, "p")
+ tag['class'] = "authorId"
+ tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+ use_alt=False)))
+ divTag.replaceWith(tag)
+
+ return soup
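
Both recipes also share the overridden index_to_soup(): fetch the page once with the recipe's default encoding, read the charset the page itself declares in its <meta http-equiv="Content-Type"> tag, and re-soup with that charset when the two disagree (the docstring's "various page encodings between index and articles"). A minimal sketch of that two-pass sniff (Python 2; sniff_charset() is a hypothetical stand-in for the recipe's string-slicing of the rendered meta tag):

    import re

    def sniff_charset(raw_html, default):
        # Pull "charset=..." out of the Content-Type meta tag, roughly what
        # the override does by slicing str(contentType) around 'charset='.
        m = re.search(r'charset=([\w-]+)', raw_html)
        if m:
            return m.group(1)
        return default

    recipe_encoding = 'utf-8'   # first pass decodes with the recipe default
    raw = '<meta http-equiv="Content-Type" content="text/html; charset=cp1252">'
    doc_encoding = sniff_charset(raw, recipe_encoding)
    if doc_encoding != recipe_encoding:
        # at this point the recipe calls get_the_soup() a second time,
        # passing doc_encoding so the page is decoded correctly
        print "re-souping with %s" % doc_encoding

The cp1252 case is the one fixChars() then cleans up, mapping the Windows smart quotes and dashes (\x91 through \x97) onto their proper Unicode equivalents.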