mirror of
git://github.com/kovidgoyal/calibre.git
synced 2026-05-08 12:53:37 +02:00
Merge from trunk
This commit is contained in:
commit
90be73fe5b
5 changed files with 117 additions and 46 deletions
|
|
@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
|
|||
__author__ = 'Krittika Goyal'
|
||||
description = 'Canadian national newspaper'
|
||||
timefmt = ' [%d %b, %Y]'
|
||||
needs_subscription = False
|
||||
language = 'en_CA'
|
||||
needs_subscription = False
|
||||
|
||||
no_stylesheets = True
|
||||
#remove_tags_before = dict(name='h1', attrs={'class':'heading'})
|
||||
#remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
|
||||
remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
|
||||
remove_tags = [
|
||||
dict(name='iframe'),
|
||||
dict(name='div', attrs={'class':'story-tools'}),
|
||||
dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
|
||||
#dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
|
||||
#dict(name='form', attrs={'onsubmit':''}),
|
||||
#dict(name='table', attrs={'cellspacing':'0'}),
|
||||
dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
|
||||
]
|
||||
|
||||
# def preprocess_html(self, soup):
|
||||
|
|
@ -37,7 +37,7 @@ def nejm_get_index(self):
|
|||
def parse_index(self):
|
||||
soup = self.nejm_get_index()
|
||||
|
||||
div = soup.find(id='LegoText4')
|
||||
div = soup.find(id='npContentMain')
|
||||
|
||||
current_section = None
|
||||
current_articles = []
|
||||
|
|
@ -50,7 +50,7 @@ def parse_index(self):
|
|||
current_section = self.tag_to_string(x)
|
||||
current_articles = []
|
||||
self.log('\tFound section:', current_section)
|
||||
if current_section is not None and x.name == 'h3':
|
||||
if current_section is not None and x.name == 'h5':
|
||||
# Article found
|
||||
title = self.tag_to_string(x)
|
||||
a = x.find('a', href=lambda x: x and 'story' in x)
|
||||
|
|
@ -59,8 +59,8 @@ def parse_index(self):
|
|||
url = a.get('href', False)
|
||||
if not url or not title:
|
||||
continue
|
||||
if url.startswith('story'):
|
||||
url = 'http://www.nationalpost.com/todays-paper/'+url
|
||||
#if url.startswith('story'):
|
||||
url = 'http://www.nationalpost.com/todays-paper/'+url
|
||||
self.log('\t\tFound article:', title)
|
||||
self.log('\t\t\t', url)
|
||||
current_articles.append({'title': title, 'url':url,
|
||||
|
|
@ -70,28 +70,11 @@ def parse_index(self):
|
|||
feeds.append((current_section, current_articles))
|
||||
|
||||
return feeds
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
story = soup.find(name='div', attrs={'class':'triline'})
|
||||
page2_link = soup.find('p','pagenav')
|
||||
if page2_link:
|
||||
atag = page2_link.find('a',href=True)
|
||||
if atag:
|
||||
page2_url = atag['href']
|
||||
if page2_url.startswith('story'):
|
||||
page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
|
||||
elif page2_url.startswith( '/todays-paper/story.html'):
|
||||
page2_url = 'http://www.nationalpost.com/'+page2_url
|
||||
page2_soup = self.index_to_soup(page2_url)
|
||||
if page2_soup:
|
||||
page2_content = page2_soup.find('div','story-content')
|
||||
if page2_content:
|
||||
full_story = BeautifulSoup('<div></div>')
|
||||
full_story.insert(0,story)
|
||||
full_story.insert(1,page2_content)
|
||||
story = full_story
|
||||
story = soup.find(name='div', attrs={'id':'npContentMain'})
|
||||
##td = heading.findParent(name='td')
|
||||
##td.extract()
|
||||
soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
|
||||
body = soup.find(name='body')
|
||||
body.insert(0, story)
|
||||
return soup
|
||||
|
||||
|
|
|
|||
|
|
@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
|
|||
}
|
||||
preprocess_regexps = [(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')]
|
||||
|
||||
keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
|
||||
keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
|
||||
|
||||
remove_tags = [
|
||||
dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
|
||||
,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools']})
|
||||
,dict(name='div' , attrs={'id' :['compnl','artIssueInfo','artTools','comments','blgsocial']})
|
||||
,dict(name='p' , attrs={'class':['marker','infotext' ]})
|
||||
,dict(name='meta' , attrs={'name' :'description' })
|
||||
,dict(name='a' , attrs={'rel' :'tag' })
|
||||
]
|
||||
remove_tags_after = dict(attrs={'class':'nbpcopy'})
|
||||
remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
|
||||
remove_attributes = ['height','width']
|
||||
|
||||
feeds = [
|
||||
|
|
|
|||
|
|
@ -3,17 +3,18 @@
|
|||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import traceback, sys, textwrap, re
|
||||
import traceback, sys, textwrap, re, urllib2
|
||||
from threading import Thread
|
||||
|
||||
from calibre import prints
|
||||
from calibre import prints, browser
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.customize import Plugin
|
||||
from calibre.ebooks.metadata.library_thing import OPENLIBRARY
|
||||
|
||||
metadata_config = None
|
||||
|
||||
class MetadataSource(Plugin):
|
||||
class MetadataSource(Plugin): # {{{
|
||||
|
||||
author = 'Kovid Goyal'
|
||||
|
||||
|
|
@ -130,7 +131,9 @@ def save_settings(self, w):
|
|||
def customization_help(self):
|
||||
return 'This plugin can only be customized using the GUI'
|
||||
|
||||
class GoogleBooks(MetadataSource):
|
||||
# }}}
|
||||
|
||||
class GoogleBooks(MetadataSource): # {{{
|
||||
|
||||
name = 'Google Books'
|
||||
description = _('Downloads metadata from Google Books')
|
||||
|
|
@ -145,8 +148,9 @@ def fetch(self):
|
|||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
|
||||
# }}}
|
||||
|
||||
class ISBNDB(MetadataSource):
|
||||
class ISBNDB(MetadataSource): # {{{
|
||||
|
||||
name = 'IsbnDB'
|
||||
description = _('Downloads metadata from isbndb.com')
|
||||
|
|
@ -181,7 +185,9 @@ def string_customization_help(self):
|
|||
'and enter your access key below.')
|
||||
return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')
|
||||
|
||||
class Amazon(MetadataSource):
|
||||
# }}}
|
||||
|
||||
class Amazon(MetadataSource): # {{{
|
||||
|
||||
name = 'Amazon'
|
||||
metadata_type = 'social'
|
||||
|
|
@ -198,7 +204,9 @@ def fetch(self):
|
|||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
|
||||
class LibraryThing(MetadataSource):
|
||||
# }}}
|
||||
|
||||
class LibraryThing(MetadataSource): # {{{
|
||||
|
||||
name = 'LibraryThing'
|
||||
metadata_type = 'social'
|
||||
|
|
@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
|
|||
def fetch(self):
|
||||
if not self.isbn:
|
||||
return
|
||||
from calibre import browser
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
import json
|
||||
br = browser()
|
||||
|
|
@ -228,6 +235,7 @@ def fetch(self):
|
|||
except Exception, e:
|
||||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
# }}}
|
||||
|
||||
|
||||
def result_index(source, result):
|
||||
|
|
@ -268,6 +276,31 @@ def join(self):
|
|||
for s in self.sources:
|
||||
s.join()
|
||||
|
||||
def filter_metadata_results(item):
    """Tell whether a metadata result should be kept.

    A result is rejected (``False``) when its publisher, compared
    case-insensitively, contains any of the words "audio", "tape",
    "cassette", "abridged" or "playaway". Results with no publisher are
    always kept (``True``).
    """
    publisher = item.publisher
    if not publisher:
        return True
    lowered = publisher.lower()
    unwanted = ("audio", "tape", "cassette", "abridged", "playaway")
    return not any(word in lowered for word in unwanted)
|
||||
|
||||
class HeadRequest(urllib2.Request):
    """A ``urllib2.Request`` whose HTTP method is reported as HEAD."""

    def get_method(self):
        # Override the method name reported by the base Request class.
        return 'HEAD'
|
||||
|
||||
def do_cover_check(item):
    """Record on ``item.has_cover`` whether a cover can be found for the item.

    Sends a HEAD request (5 second timeout) to ``OPENLIBRARY % item.isbn``.
    ``item.has_cover`` is set to ``True`` when the request succeeds and to
    ``False`` otherwise. The function returns nothing; ordinary request
    failures are treated as "no cover" and are not propagated.

    :param item: object with an ``isbn`` attribute; gains a ``has_cover``
                 attribute.
    """
    item.has_cover = False

    isbn = getattr(item, 'isbn', None)
    if not isbn:
        # Without an ISBN there is nothing sensible to look up, so skip the
        # network round trip entirely.
        return

    opener = browser()
    try:
        opener.open(HeadRequest(OPENLIBRARY%isbn), timeout=5)
    except Exception:
        # Best effort: any request failure means "cover not found".
        # Catching Exception rather than using a bare ``except:`` lets
        # SystemExit and KeyboardInterrupt propagate.
        return
    item.has_cover = True
|
||||
|
||||
def check_for_covers(items):
    """Run ``do_cover_check`` on every item, one thread per item.

    All threads are created first, then all are started, and the call
    returns only after every thread has been joined.
    """
    workers = []
    for item in items:
        workers.append(Thread(target=do_cover_check, args=(item,)))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
|
||||
|
||||
def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
|
||||
verbose=0):
|
||||
assert not(title is None and author is None and publisher is None and \
|
||||
|
|
@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
|
|||
for fetcher in fetchers[1:]:
|
||||
merge_results(results, fetcher.results)
|
||||
|
||||
results = sorted(results, cmp=lambda x, y : cmp(
|
||||
(x.comments.strip() if x.comments else ''),
|
||||
(y.comments.strip() if y.comments else '')
|
||||
), reverse=True)
|
||||
results = list(filter(filter_metadata_results, results))
|
||||
|
||||
check_for_covers(results)
|
||||
|
||||
words = ("the", "a", "an", "of", "and")
|
||||
prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
|
||||
trailing_paren_pat = re.compile(r'\(.*\)$')
|
||||
whitespace_pat = re.compile(r'\s+')
|
||||
|
||||
def sort_func(x, y):
|
||||
|
||||
def cleanup_title(s):
|
||||
s = s.strip().lower()
|
||||
s = prefix_pat.sub(' ', s)
|
||||
s = trailing_paren_pat.sub('', s)
|
||||
s = whitespace_pat.sub(' ', s)
|
||||
return s.strip()
|
||||
|
||||
t = cleanup_title(title)
|
||||
x_title = cleanup_title(x.title)
|
||||
y_title = cleanup_title(y.title)
|
||||
|
||||
# prefer titles that exactly match the search title (after cleanup)
|
||||
tx = cmp(t, x_title)
|
||||
ty = cmp(t, y_title)
|
||||
result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
|
||||
|
||||
# then prefer titles that have a cover image
|
||||
if result == 0:
|
||||
result = -cmp(x.has_cover, y.has_cover)
|
||||
|
||||
# then prefer titles with the longest comment, treating lengths within 10% of their average as equal
|
||||
if result == 0:
|
||||
cx = len(x.comments.strip() if x.comments else '')
|
||||
cy = len(y.comments.strip() if y.comments else '')
|
||||
t = (cx + cy) / 20
|
||||
result = cy - cx
|
||||
if abs(result) < t:
|
||||
result = 0
|
||||
|
||||
return result
|
||||
|
||||
results = sorted(results, cmp=sort_func)
|
||||
|
||||
# if for some reason there is no comment in the top selection, go looking for one
|
||||
if len(results) > 1:
|
||||
if not results[0].comments or len(results[0].comments) == 0:
|
||||
for r in results[1:]:
|
||||
if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
|
||||
results[0].comments = r.comments
|
||||
break
|
||||
|
||||
# for r in results:
|
||||
# print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
|
||||
|
||||
return results, [(x.name, x.exception, x.tb) for x in fetchers]
|
||||
|
||||
|
|
|
|||
|
|
@ -181,14 +181,14 @@ Title match ignores leading indefinite articles ("the", "a",
|
|||
<item>
|
||||
<widget class="QCheckBox" name="preserve_user_collections">
|
||||
<property name="text">
|
||||
<string>Preserve user collections.</string>
|
||||
<string>Preserve device collections.</string>
|
||||
</property>
|
||||
</widget>
|
||||
</item>
|
||||
<item>
|
||||
<widget class="QLabel" name="label_41">
|
||||
<property name="text">
|
||||
<string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections on the device view will be enabled.</string>
|
||||
<string>If checked, collections will not be deleted even if a book with changed metadata is resent and the collection is not in the book's metadata. In addition, editing collections in the device view will be enabled. If unchecked, collections will always reflect only the metadata in the calibre library.</string>
|
||||
</property>
|
||||
<property name="wordWrap">
|
||||
<bool>true</bool>
|
||||
|
|
|
|||
|
|
@ -325,6 +325,10 @@ Post any output you see in a help message on the `Forum <http://www.mobileread.c
|
|||
|app| is not starting on OS X?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
One common cause of failures on OS X is the use of accessibility technologies that are incompatible with the graphics toolkit |app| uses.
|
||||
Try turning off VoiceOver if you have it on. Also go to System Preferences->System->Universal Access and turn off the setting for enabling
|
||||
access for assistive devices in all the tabs.
|
||||
|
||||
You can obtain debug output about why |app| is not starting by running `Console.app`. Debug output will
|
||||
be printed to it. If the debug output contains a line that looks like::
|
||||
|
||||
|
|
@ -334,9 +338,9 @@ then the problem is probably a corrupted font cache. You can clear the cache by
|
|||
`instructions <http://www.macworld.com/article/139383/2009/03/fontcacheclear.html>`_. If that doesn't
|
||||
solve it, look for a corrupted font file on your system, in ~/Library/Fonts or the like.
|
||||
|
||||
|
||||
My antivirus program claims |app| is a virus/trojan?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Your antivirus program is wrong. |app| is a completely open source product. You can actually browse the source code yourself (or hire someone to do it for you) to verify that it is not a virus. Please report the false identification to whatever company you buy your antivirus software from. If the antivirus program is preventing you from downloading/installing |app|, disable it temporarily, install |app| and then re-enable it.
|
||||
|
||||
How do I use purchased EPUB books with |app|?
|
||||
|
|
|
|||
Loading…
Reference in a new issue