Open Library covers plugin migrated. The Google plugin now adds ratings and can detect when an entry has a cover

This commit is contained in:
Kovid Goyal 2011-03-23 19:10:22 -06:00
parent 2848e0d2f1
commit d8e1dcf8e5
6 changed files with 104 additions and 42 deletions

View file

@ -1032,7 +1032,8 @@ class Misc(PreferencesPlugin):
# New metadata download plugins {{{
from calibre.ebooks.metadata.sources.google import GoogleBooks
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
plugins += [GoogleBooks, Amazon]
plugins += [GoogleBooks, Amazon, OpenLibrary]
# }}}

View file

@ -468,7 +468,7 @@ def download_cover(self, log, result_queue, abort, # {{{
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found for')
log.info('No cover found')
return
if abort.is_set():

View file

@ -47,12 +47,12 @@ class InternalMetadataCompareKeyGen(object):
The algorithm is:
1. Prefer results that have the same ISBN as specified in the query
2. Prefer results with all available fields filled in
3. Prefer results that are an exact title match to the query
4. Prefer results with longer comments (greater than 10 % longer)
5. Prefer results with a cached cover URL
6. Use the relevance of the result as reported by the metadata source's search
* Prefer results that have the same ISBN as specified in the query
* Prefer results with all available fields filled in
* Prefer results that are an exact title match to the query
* Prefer results with a cached cover URL
* Prefer results with longer comments (greater than 10 % longer)
* Use the relevance of the result as reported by the metadata source's search
engine
'''
@ -67,9 +67,9 @@ def __init__(self, mi, source_plugin, title, authors, identifiers):
has_cover = 2 if source_plugin.get_cached_cover_url(mi.identifiers)\
is None else 1
self.base = (isbn, all_fields, exact_title)
self.base = (isbn, all_fields, exact_title, has_cover)
self.comments_len = len(mi.comments.strip() if mi.comments else '')
self.extra = (has_cover, getattr(mi, 'source_relevance', 0))
self.extra = (getattr(mi, 'source_relevance', 0), )
def __cmp__(self, other):
result = cmp(self.base, other.base)
@ -130,6 +130,12 @@ def browser(self):
# Utility functions {{{
def get_related_isbns(self, id_):
    '''
    Yield every ISBN that has been cached as mapping to the source
    identifier *id_* (see :meth:`cache_isbn_to_identifier`).
    '''
    with self.cache_lock:
        # Snapshot the matching ISBNs while holding the lock and release it
        # before yielding: the original yielded inside the ``with`` block,
        # which kept the lock held across yields — a consumer that touched
        # the cache while iterating would deadlock on a non-reentrant lock,
        # and the dict could mutate under the iteration.
        related = [isbn for isbn, q in
                self._isbn_to_identifier_cache.items() if q == id_]
    for isbn in related:
        yield isbn
def cache_isbn_to_identifier(self, isbn, identifier):
    # Record the mapping isbn -> source-specific identifier in the
    # in-memory cache. Guarded by self.cache_lock so it is safe to call
    # from concurrent download threads.
    with self.cache_lock:
        self._isbn_to_identifier_cache[isbn] = identifier

View file

@ -25,7 +25,8 @@
NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
'atom' : 'http://www.w3.org/2005/Atom',
'dc': 'http://purl.org/dc/terms'
'dc' : 'http://purl.org/dc/terms',
'gd' : 'http://schemas.google.com/g/2005'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)
@ -42,6 +43,7 @@
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
rating = XPath('descendant::gd:rating[@average]')
def get_details(browser, url, timeout): # {{{
try:
@ -114,8 +116,10 @@ def get_text(extra, x):
btags = [x.text for x in subject(extra) if x.text]
tags = []
for t in btags:
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
atags = [y.strip() for y in t.split('/')]
for tag in atags:
if tag not in tags:
tags.append(tag)
except:
log.exception('Failed to parse tags:')
tags = []
@ -131,6 +135,18 @@ def get_text(extra, x):
except:
log.exception('Failed to parse pubdate')
# Ratings
for x in rating(extra):
try:
mi.rating = float(x.get('average'))
if mi.rating > 5:
mi.rating /= 2
except:
log.exception('Failed to parse rating')
# Cover
mi.has_google_cover = len(extra.xpath(
'//*[@rel="http://schemas.google.com/books/2008/thumbnail"]')) > 0
return mi
# }}}
@ -142,9 +158,11 @@ class GoogleBooks(Source):
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn',
'comments', 'publisher', 'identifier:isbn', 'rating',
'identifier:google']) # language currently disabled
GOOGLE_COVER = 'http://books.google.com/books?id=%s&printsec=frontcover&img=1'
def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
isbn = check_isbn(identifiers.get('isbn', None))
@ -175,18 +193,9 @@ def build_term(prefix, parts):
})
# }}}
def cover_url_from_identifiers(self, identifiers):
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
return ('http://books.google.com/books?id=%s&printsec=frontcover&img=1' %
goog)
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.cover_url_from_identifiers(identifiers)
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
@ -215,32 +224,38 @@ def download_cover(self, log, result_queue, abort, # {{{
br = self.browser
try:
cdata = br.open_novisit(cached_url, timeout=timeout).read()
if self.is_cover_image_valid(cdata):
result_queue.put(cdata)
else:
log.error('No cover found for %r'%identifiers)
result_queue.put(cdata)
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
goog = identifiers.get('google', None)
if goog is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
goog = self.cached_isbn_to_identifier(isbn)
if goog is not None:
url = self.cached_identifier_to_cover_url(goog)
def is_cover_image_valid(self, raw):
    # Google serves a small PNG placeholder saying "image not available"
    # when a volume has no real cover (try google identifier
    # llNqPwAACAAJ); real covers have not been observed in PNG format.
    # Reject empty, small, or PNG responses.
    if not raw:
        # Preserve the truthiness-chain behavior: falsy input is
        # returned as-is rather than coerced to False.
        return raw
    if len(raw) <= 17000:
        return False
    return raw[1:4] != b'PNG'
return url
# }}}
def get_all_details(self, br, log, entries, abort, result_queue, timeout):
def get_all_details(self, br, log, entries, abort, # {{{
result_queue, timeout):
for relevance, i in enumerate(entries):
try:
ans = to_metadata(br, log, i, timeout)
if isinstance(ans, Metadata):
ans.source_relevance = relevance
goog = ans.identifiers['google']
for isbn in getattr(ans, 'all_isbns', []):
self.cache_isbn_to_identifier(isbn,
ans.identifiers['google'])
self.cache_isbn_to_identifier(isbn, goog)
if ans.has_google_cover:
self.cache_identifier_to_cover_url(goog,
self.GOOGLE_COVER%goog)
result_queue.put(ans)
except:
log.exception(
@ -248,6 +263,7 @@ def get_all_details(self, br, log, entries, abort, result_queue, timeout):
etree.tostring(i))
if abort.is_set():
break
# }}}
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
@ -281,7 +297,7 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
return None
# }}}
if __name__ == '__main__':
if __name__ == '__main__': # tests {{{
# To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/google.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)
@ -296,8 +312,10 @@ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
authors_test(['Francis Scott Fitzgerald'])]
),
#(
# {'title': 'Great Expectations', 'authors':['Charles Dickens']},
# [title_test('Great Expectations', exact=True)]
#),
(
{'title': 'Flatland', 'authors':['Abbott']},
[title_test('Flatland', exact=False)]
),
])
# }}}

View file

@ -0,0 +1,35 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.ebooks.metadata.sources.base import Source
class OpenLibrary(Source):
    '''
    Cover-only metadata source that fetches cover images from the Open
    Library covers API, keyed by ISBN.
    '''

    name = 'Open Library'
    description = _('Downloads metadata from The Open Library')

    # This plugin supplies only covers, no bibliographic metadata.
    capabilities = frozenset(['cover'])

    # Large (-L) cover by ISBN; default=false makes the server answer
    # with an HTTP error instead of a placeholder image when no cover
    # exists for the ISBN.
    OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30):
        # Without an ISBN there is nothing to look up: this source can
        # only query by ISBN.
        if 'isbn' not in identifiers:
            return
        isbn = identifiers['isbn']
        url = self.OPENLIBRARY % isbn
        try:
            cdata = self.browser.open_novisit(url, timeout=timeout).read()
            result_queue.put(cdata)
        except Exception as err:
            # The covers API signals "no cover available" with a 404.
            getcode = getattr(err, 'getcode', None)
            if callable(getcode) and getcode() == 404:
                log.error('No cover for ISBN: %r found'%isbn)
            else:
                log.exception('Failed to download cover for ISBN:', isbn)

View file

@ -99,6 +99,8 @@ def test_identify_plugin(name, tests):
for i, mi in enumerate(results):
prints('*'*30, 'Relevance:', i, '*'*30)
prints(mi)
prints('\nCached cover URL :',
plugin.get_cached_cover_url(mi.identifiers))
prints('*'*75, '\n\n')
possibles = []