fetchart: improve Wikipedia art source

Artists with non-typical casing (e.g., alt-J, dEUS) would not get matched on DBPedia, because the RDFS:label uses arbitrary casing and SPARQL provides only exact matches. The FOAF:name attribute is always title-cased (e.g., Alt-J, Deus), so the query now matches on FOAF:name using the title-cased artist name. Due to a bug in DBPedia, the cover filename is truncated when it contains parentheses (e.g., 'Foo bar (band).jpg' gets truncated to 'Foo bar .jpg'). To work around this, an additional Wikipedia call is made to list all images on the album's page, and we try to match the truncated filename against them. The Wikipedia art source now catches the specific exceptions it expects, instead of using a broad catch-all. Wikipedia album images can be GIFs, so 'image/gif' is now added to the list of accepted content types.
2026-02-24 08:12:54 +01:00 · 2015-05-19 09:57:54 +02:00 · 2015-05-19 09:57:54 +02:00 · a7eace81d4
commit a7eace81d4
parent a82dee35cb
2 changed files with 50 additions and 11 deletions
--- a/beetsplug/fetchart.py
+++ b/beetsplug/fetchart.py
@ -38,7 +38,7 @@ except ImportError:
    HAVE_ITUNES = False

 IMAGE_EXTENSIONS = ['png', 'jpg', 'jpeg']
-CONTENT_TYPES = ('image/jpeg',)
+CONTENT_TYPES = ('image/jpeg', 'image/gif')
 DOWNLOAD_EXTENSION = '.jpg'

 requests_session = requests.Session()
@ -171,13 +171,16 @@ class Wikipedia(ArtSource):
                 PREFIX dbpprop: <http://dbpedia.org/property/>
                 PREFIX owl: <http://dbpedia.org/ontology/>
                 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-                 SELECT DISTINCT ?coverFilename WHERE {{
+                 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+
+                 SELECT DISTINCT ?pageId ?coverFilename WHERE {{
+                   ?subject owl:wikiPageID ?pageId .
                   ?subject dbpprop:name ?name .
                   ?subject rdfs:label ?label .
                   {{ ?subject dbpprop:artist ?artist }}
                     UNION
                   {{ ?subject owl:artist ?artist }}
-                   {{ ?artist rdfs:label "{artist}"@en }}
+                   {{ ?artist foaf:name "{artist}"@en }}
                     UNION
                   {{ ?artist dbpprop:name "{artist}"@en }}
                   ?subject rdf:type <http://dbpedia.org/ontology/Album> .
@ -191,29 +194,62 @@ class Wikipedia(ArtSource):
            return

        # Find the name of the cover art filename on DBpedia
-        cover_filename = None
+        cover_filename, page_id = None, None
        dbpedia_response = requests.get(
            self.DBPEDIA_URL,
            params={
                'format': 'application/sparql-results+json',
                'timeout': 2500,
-                'query': self.SPARQL_QUERY.format(artist=album.albumartist,
-                                                  album=album.album)
+                'query': self.SPARQL_QUERY.format(
+                    artist=album.albumartist.title(), album=album.album)
            }, headers={'content-type': 'application/json'})
        try:
            data = dbpedia_response.json()
            results = data['results']['bindings']
            if results:
-                cover_filename = results[0]['coverFilename']['value']
+                cover_filename = 'File:' + results[0]['coverFilename']['value']
+                page_id = results[0]['pageId']['value']
            else:
                self._log.debug(u'album not found on dbpedia')
-        except:
+        except (ValueError, KeyError, IndexError):
            self._log.debug(u'error scraping dbpedia album page')

        # Ensure we have a filename before attempting to query wikipedia
-        if not cover_filename:
+        if not (cover_filename and page_id):
            return

+        # DBPedia sometimes provides an incomplete cover_filename, indicated
+        # by the filename having a space before the extension, e.g., 'foo .bar'
+        # An additional Wikipedia call can help to find the real filename.
+        # This may be removed once the DBPedia issue is resolved, see:
+        # https://github.com/dbpedia/extraction-framework/issues/396
+        if '.' not in cover_filename.split(' .')[-1]:
+            self._log.debug(u'dbpedia provided incomplete cover_filename')
+            lpart, rpart = cover_filename.rsplit(' .', 1)
+
+            # Query all the images in the page
+            wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={
+                'format': 'json',
+                'action': 'query',
+                'continue': '',
+                'prop': 'images',
+                'pageids': page_id},
+                headers={'content-type': 'application/json'})
+
+            # Try to see if one of the images on the page matches our
+            # incomplete cover_filename
+            try:
+                data = wikipedia_response.json()
+                results = data['query']['pages'][page_id]['images']
+                for result in results:
+                    if re.match(re.escape(lpart) + r'.*?\.' + re.escape(rpart),
+                                result['title']):
+                        cover_filename = result['title']
+                        break
+            except (ValueError, KeyError):
+                self._log.debug(u'failed to retrieve a cover_filename')
+                return
+
        # Find the absolute url of the cover art on Wikipedia
        wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={
            'format': 'json',
@ -221,15 +257,16 @@ class Wikipedia(ArtSource):
            'continue': '',
            'prop': 'imageinfo',
            'iiprop': 'url',
-            'titles': ('File:' + cover_filename).encode('utf-8')},
+            'titles': cover_filename.encode('utf-8')},
            headers={'content-type': 'application/json'})
+
        try:
            data = wikipedia_response.json()
            results = data['query']['pages']
            for _, result in results.iteritems():
                image_url = result['imageinfo'][0]['url']
                yield image_url
-        except:
+        except (ValueError, KeyError, IndexError):
            self._log.debug(u'error scraping wikipedia imageinfo')
            return

--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -27,6 +27,8 @@ Fixes:
 * Fix sorting by paths when case-insensitive. :bug:`1451`
 * :doc:`/plugins/embedart`: Avoid an error when trying to embed invalid images
  into MPEG-4 files.
+* :doc:`/plugins/fetchart`: The Wikipedia source now better handles artists
+  with non-typical casing (e.g., alt-J, dEUS).


 1.3.13 (April 24, 2015)