Open Library covers plugin migrated. The Google plugin now adds ratings and can detect when an entry has a cover

This commit is contained in:
Kovid Goyal 2011-03-23 19:10:22 -06:00
parent 2848e0d2f1
commit d8e1dcf8e5
6 changed files with 104 additions and 42 deletions

View file

@ -1032,7 +1032,8 @@ class Misc(PreferencesPlugin):
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
plugins += [GoogleBooks, Amazon]
plugins += [GoogleBooks, Amazon, OpenLibrary]
# }}}

View file

@ -468,7 +468,7 @@ def download_cover(self, log, result_queue, abort, # {{{
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found for')
log.info('No cover found')
return
if abort.is_set():

View file

@ -47,12 +47,12 @@ class InternalMetadataCompareKeyGen(object):
The algorithm is:
1. Prefer results that have the same ISBN as specified in the query
2. Prefer results with all available fields filled in
3. Prefer results that are an exact title match to the query
4. Prefer results with longer comments (greater than 10 % longer)
5. Prefer results with a cached cover URL
6. Use the relevance of the result as reported by the metadata source's search
* Prefer results that have the same ISBN as specified in the query
* Prefer results with all available fields filled in
* Prefer results that are an exact title match to the query
* Prefer results with a cached cover URL
* Prefer results with longer comments (greater than 10 % longer)
* Use the relevance of the result as reported by the metadata source's search
engine
'''
@ -67,9 +67,9 @@ def __init__(self, mi, source_plugin, title, authors, identifiers):
has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\
is None else 1
self.base = (isbn, all_fields, exact_title)
self.base = (isbn, all_fields, exact_title, has_cover)
self.comments_len = len(mi.comments.strip() if mi.comments else '')
self.extra = (has_cover, getattr(mi, 'source_relevance', 0))
self.extra = (getattr(mi, 'source_relevance', 0), )
def __cmp__(self, other):
result = cmp(self.base, other.base)
@ -130,6 +130,12 @@ def browser(self):
# Utility functions {{{
def get_related_isbns(self, id_):
    '''
    Yield every ISBN that has been cached as mapping to the source
    identifier *id_* (see :meth:`cache_isbn_to_identifier`).
    '''
    with self.cache_lock:
        # Snapshot the matching ISBNs while holding the lock and release it
        # before yielding: the original yielded inside the ``with`` block,
        # which kept the lock held across yields — a consumer that touched
        # the cache while iterating would deadlock on a non-reentrant lock,
        # and the dict could mutate under the iteration.
        related = [isbn for isbn, q in
                self._isbn_to_identifier_cache.items() if q == id_]
    for isbn in related:
        yield isbn
def cache_isbn_to_identifier(self, isbn, identifier):
    # Record the mapping isbn -> source-specific identifier in the
    # in-memory cache. Guarded by self.cache_lock so it is safe to call
    # from concurrent download threads.
    with self.cache_lock:
        self._isbn_to_identifier_cache[isbn] = identifier

View file

@ -25,7 +25,8 @@
NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom',
'dc': 'http://purl.org/dc/terms'
'dc' : 'http://purl.org/dc/terms',
'gd' : 'http://schemas.google.com/g/2005'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
@ -42,6 +43,7 @@
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
rating = XPath('descendant::gd:rating[@average]')
def get_details(browser, url, timeout): # {{{
try:
@ -114,8 +116,10 @@ def get_text(extra, x):
btags = [x.text for x in subject(extra) if x.text]
tags = []
for t in btags:
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
atags = [y.strip() for y in t.split('/')]
for tag in atags:
if tag not in tags:
tags.append(tag)
except:
log.exception('Failed to parse tags:')
tags = []
@ -131,6 +135,18 @@ def get_text(extra, x):
except:
log.exception('Failed to parse pubdate')
# Ratings
for x in rating(extra):
try:
mi.rating = float(x.get('average'))
if mi.rating > 5:
mi.rating /= 2
except:
log.exception('Failed to parse rating')
# Cover
mi.has_google_cover = len(extra.xpath(
'//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
return mi
# }}}
@ -142,9 +158,11 @@ class GoogleBooks(Source):
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn',
'comments', 'publisher', 'identifier:isbn', 'rating',
'identifier:google']) # language currently disabled
GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
isbn = check_isbn(identifiers.get('isbn', None))
@ -175,18 +193,9 @@ def build_term(prefix, parts):
})
# }}}
def cover_url_from_identifiers(self, identifiers):
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
goog)
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.cover_url_from_identifiers(identifiers)
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
@ -215,32 +224,38 @@ def download_cover(self, log, result_queue, abort, # {{{
br = self.browser
try:
cdata = br.open_novisit(cached_url, timeout=timeout).read()
if self.is_cover_image_valid(cdata):
result_queue.put(cdata)
else:
log.error('No cover found for %r'%identifiers)
result_queue.put(cdata)
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
url = self.cached_identifier_to_cover_url(goog)
def is_cover_image_valid(self, raw):
    # Google serves a small PNG placeholder saying "image not available"
    # when a volume has no real cover (try google identifier
    # llNqPwAACAAJ); real covers have not been observed in PNG format.
    # Reject empty, small, or PNG responses.
    if not raw:
        # Preserve the truthiness-chain behavior: falsy input is
        # returned as-is rather than coerced to False.
        return raw
    if len(raw) <= 17000:
        return False
    return raw[1:4] != b'PNG'
return url
# }}}
def get_all_details(self, br, log, entries, abort, result_queue, timeout):
def get_all_details(self, br, log, entries, abort, # {{{
result_queue, timeout):
for relevance, i in enumerate(entries):
try:
ans = to_metadata(br, log, i, timeout)
if isinstance(ans, Metadata):
ans.source_relevance = relevance
goog = ans.identifiers['google']
for isbn in getattr(ans, 'all_isbns', []):
self.cache_isbn_to_identifier(isbn,
ans.identifiers['google'])
self.cache_isbn_to_identifier(isbn, goog)
if ans.has_google_cover:
self.cache_identifier_to_cover_url(goog,
self.GOOGLE_COVER%goog)
result_queue.put(ans)
except:
log.exception(
@ -248,6 +263,7 @@ def get_all_details(self, br, log, entries, abort, result_queue, timeout):
etree.tostring(i))
if abort.is_set():
break
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
@ -281,7 +297,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
return None
# }}}
if __name__ == '__main__':
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)
@ -296,8 +312,10 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
authors_test(['Francis Scott Fitzgerald'])]
),
#(
# {'title': 'Great Expectations', 'authors':['Charles Dickens']},
# [title_test('Great Expectations', exact=True)]
#),
(
{'title': 'Flatland', 'authors':['Abbott']},
[title_test('Flatland', exact=False)]
),
])
# }}}

View file

@ -0,0 +1,35 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.metadata.sources.base import Source
class OpenLibrary(Source):
    '''
    Cover-only metadata source that fetches cover images from the Open
    Library covers API, keyed by ISBN.
    '''

    name = 'Open Library'
    description = _('Downloads metadata from The Open Library')

    # This plugin supplies only covers, no bibliographic metadata.
    capabilities = frozenset(['cover'])

    # Large (-L) cover by ISBN; default=false makes the server answer
    # with an HTTP error instead of a placeholder image when no cover
    # exists for the ISBN.
    OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30):
        # Without an ISBN there is nothing to look up: this source can
        # only query by ISBN.
        if 'isbn' not in identifiers:
            return
        isbn = identifiers['isbn']
        url = self.OPENLIBRARY % isbn
        try:
            cdata = self.browser.open_novisit(url, timeout=timeout).read()
            result_queue.put(cdata)
        except Exception as err:
            # The covers API signals "no cover available" with a 404.
            getcode = getattr(err, 'getcode', None)
            if callable(getcode) and getcode() == 404:
                log.error('No cover for ISBN: %r found'%isbn)
            else:
                log.exception('Failed to download cover for ISBN:', isbn)

View file

@ -99,6 +99,8 @@ def test_identify_plugin(name, tests):
for i, mi in enumerate(results):
prints('*'*30, 'Relevance:', i, '*'*30)
prints(mi)
prints('\nCached cover URL :',
plugin.get_cached_cover_url(mi.identifiers))
prints('*'*75, '\n\n')
possibles = []