From a7eace81d4cfac520e72eb8afb1cd3535d9c5159 Mon Sep 17 00:00:00 2001
From: Tom Jaspers <tomjaspers@gmail.com>
Date: Tue, 19 May 2015 09:57:54 +0200
Subject: [PATCH] fetchart: improve Wikipedia art source

Artists with non-typical casing (e.g., alt-J, dEUS) would not get matched on
DBPedia, as the RDFS:label uses arbitrary casing, and SPARQL provides only exact
matches. The FOAF:name attribute is always title-cased (e.g., Alt-J, Deus).

Due to a bug in DBPedia, the cover filename is truncated when it contains
parentheses, (e.g., 'Foo bar (band).jpg' gets truncated to 'Foo bar .jpg').
To work around this, an additional Wikipedia call gets made for all its
images, in which we try to match our truncated image.

The Wikipedia art source now catches the correct exceptions, instead of
a broad catch-all.

Wikipedia album images can be gifs, so these are now added to the list of
accepted content types.
---
 beetsplug/fetchart.py | 59 +++++++++++++++++++++++++++++++++++--------
 docs/changelog.rst    |  2 ++
 2 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py
index badac9d79..4a94ca643 100644
--- a/beetsplug/fetchart.py
+++ b/beetsplug/fetchart.py
@@ -38,7 +38,7 @@ except ImportError:
     HAVE_ITUNES = False
 
 IMAGE_EXTENSIONS = ['png', 'jpg', 'jpeg']
-CONTENT_TYPES = ('image/jpeg',)
+CONTENT_TYPES = ('image/jpeg', 'image/gif')
 DOWNLOAD_EXTENSION = '.jpg'
 
 requests_session = requests.Session()
@@ -171,13 +171,16 @@ class Wikipedia(ArtSource):
                  PREFIX dbpprop: <http://dbpedia.org/property/>
                  PREFIX owl: <http://dbpedia.org/ontology/>
                  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-                 SELECT DISTINCT ?coverFilename WHERE {{
+                 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+
+                 SELECT DISTINCT ?pageId ?coverFilename WHERE {{
+                   ?subject owl:wikiPageID ?pageId .
                    ?subject dbpprop:name ?name .
                    ?subject rdfs:label ?label .
                    {{ ?subject dbpprop:artist ?artist }}
                      UNION
                    {{ ?subject owl:artist ?artist }}
-                   {{ ?artist rdfs:label "{artist}"@en }}
+                   {{ ?artist foaf:name "{artist}"@en }}
                      UNION
                    {{ ?artist dbpprop:name "{artist}"@en }}
                    ?subject rdf:type <http://dbpedia.org/ontology/Album> .
@@ -191,29 +194,62 @@ class Wikipedia(ArtSource):
             return
 
         # Find the name of the cover art filename on DBpedia
-        cover_filename = None
+        cover_filename, page_id = None, None
         dbpedia_response = requests.get(
             self.DBPEDIA_URL,
             params={
                 'format': 'application/sparql-results+json',
                 'timeout': 2500,
-                'query': self.SPARQL_QUERY.format(artist=album.albumartist,
-                                                  album=album.album)
+                'query': self.SPARQL_QUERY.format(
+                    artist=album.albumartist.title(), album=album.album)
             }, headers={'content-type': 'application/json'})
         try:
             data = dbpedia_response.json()
             results = data['results']['bindings']
             if results:
-                cover_filename = results[0]['coverFilename']['value']
+                cover_filename = 'File:' + results[0]['coverFilename']['value']
+                page_id = results[0]['pageId']['value']
             else:
                 self._log.debug(u'album not found on dbpedia')
-        except:
+        except (ValueError, KeyError, IndexError):
             self._log.debug(u'error scraping dbpedia album page')
 
         # Ensure we have a filename before attempting to query wikipedia
-        if not cover_filename:
+        if not (cover_filename and page_id):
             return
 
+        # DBPedia sometimes provides an incomplete cover_filename, indicated
+        # by the filename having a space before the extension, e.g., 'foo .bar'
+        # An additional Wikipedia call can help to find the real filename.
+        # This may be removed once the DBPedia issue is resolved, see:
+        # https://github.com/dbpedia/extraction-framework/issues/396
+        if '.' not in cover_filename.split(' .')[-1]:
+            self._log.debug(u'dbpedia provided incomplete cover_filename')
+            lpart, rpart = cover_filename.rsplit(' .', 1)
+
+            # Query all the images in the page
+            wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={
+                'format': 'json',
+                'action': 'query',
+                'continue': '',
+                'prop': 'images',
+                'pageids': page_id},
+                headers={'content-type': 'application/json'})
+
+            # Try to see if one of the images on the pages matches our
+            # imcomplete cover_filename
+            try:
+                data = wikipedia_response.json()
+                results = data['query']['pages'][page_id]['images']
+                for result in results:
+                    if re.match(re.escape(lpart) + r'.*?\.' + re.escape(rpart),
+                                result['title']):
+                        cover_filename = result['title']
+                        break
+            except (ValueError, KeyError):
+                self._log.debug(u'failed to retrieve a cover_filename')
+                return
+
         # Find the absolute url of the cover art on Wikipedia
         wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={
             'format': 'json',
@@ -221,15 +257,16 @@ class Wikipedia(ArtSource):
             'continue': '',
             'prop': 'imageinfo',
             'iiprop': 'url',
-            'titles': ('File:' + cover_filename).encode('utf-8')},
+            'titles': cover_filename.encode('utf-8')},
             headers={'content-type': 'application/json'})
+
         try:
             data = wikipedia_response.json()
             results = data['query']['pages']
             for _, result in results.iteritems():
                 image_url = result['imageinfo'][0]['url']
                 yield image_url
-        except:
+        except (ValueError, KeyError, IndexError):
             self._log.debug(u'error scraping wikipedia imageinfo')
             return
 
diff --git a/docs/changelog.rst b/docs/changelog.rst
index eb80cddbc..487c27ad9 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -27,6 +27,8 @@ Fixes:
 * Fix sorting by paths when case-insensitive. :bug:`1451`
 * :doc:`/plugins/embedart`: Avoid an error when trying to embed invalid images
   into MPEG-4 files.
+* :doc:`/plugins/fetchart`: The Wikipedia source is now able to better deal
+  with non-typical cased artists (e.g., alt-J, dEUS).
 
 
 1.3.13 (April 24, 2015)