From a7eace81d4cfac520e72eb8afb1cd3535d9c5159 Mon Sep 17 00:00:00 2001 From: Tom Jaspers Date: Tue, 19 May 2015 09:57:54 +0200 Subject: [PATCH] fetchart: improve Wikipedia art source Artists with non-typical casing (e.g., alt-J, dEUS) would not get matched on DBPedia, as the RDFS:label uses arbitrary casing, and SPARQL provides only exact matches. The FOAF:name attribute is always title-cased (e.g., Alt-J, Deus). Due to a bug in DBPedia, the cover filename is truncated when it contains parentheses (e.g., 'Foo bar (band).jpg' gets truncated to 'Foo bar .jpg'). To work around this, an additional Wikipedia call is made to list all images on the page, among which we try to match the truncated filename. The Wikipedia art source now catches the correct exceptions, instead of a broad catch-all. Wikipedia album images can be gifs, so these are now added to the list of accepted content types. --- beetsplug/fetchart.py | 59 +++++++++++++++++++++++++++++++++++-------- docs/changelog.rst | 2 ++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index badac9d79..4a94ca643 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -38,7 +38,7 @@ except ImportError: HAVE_ITUNES = False IMAGE_EXTENSIONS = ['png', 'jpg', 'jpeg'] -CONTENT_TYPES = ('image/jpeg',) +CONTENT_TYPES = ('image/jpeg', 'image/gif') DOWNLOAD_EXTENSION = '.jpg' requests_session = requests.Session() @@ -171,13 +171,16 @@ class Wikipedia(ArtSource): PREFIX dbpprop: PREFIX owl: PREFIX rdfs: - SELECT DISTINCT ?coverFilename WHERE {{ + PREFIX foaf: + + SELECT DISTINCT ?pageId ?coverFilename WHERE {{ + ?subject owl:wikiPageID ?pageId . ?subject dbpprop:name ?name . ?subject rdfs:label ?label . {{ ?subject dbpprop:artist ?artist }} UNION {{ ?subject owl:artist ?artist }} - {{ ?artist rdfs:label "{artist}"@en }} + {{ ?artist foaf:name "{artist}"@en }} UNION {{ ?artist dbpprop:name "{artist}"@en }} ?subject rdf:type . 
@@ -191,29 +194,62 @@ class Wikipedia(ArtSource): return # Find the name of the cover art filename on DBpedia - cover_filename = None + cover_filename, page_id = None, None dbpedia_response = requests.get( self.DBPEDIA_URL, params={ 'format': 'application/sparql-results+json', 'timeout': 2500, - 'query': self.SPARQL_QUERY.format(artist=album.albumartist, - album=album.album) + 'query': self.SPARQL_QUERY.format( + artist=album.albumartist.title(), album=album.album) }, headers={'content-type': 'application/json'}) try: data = dbpedia_response.json() results = data['results']['bindings'] if results: - cover_filename = results[0]['coverFilename']['value'] + cover_filename = 'File:' + results[0]['coverFilename']['value'] + page_id = results[0]['pageId']['value'] else: self._log.debug(u'album not found on dbpedia') - except: + except (ValueError, KeyError, IndexError): self._log.debug(u'error scraping dbpedia album page') # Ensure we have a filename before attempting to query wikipedia - if not cover_filename: + if not (cover_filename and page_id): return + # DBPedia sometimes provides an incomplete cover_filename, indicated + # by the filename having a space before the extension, e.g., 'foo .bar' + # An additional Wikipedia call can help to find the real filename. + # This may be removed once the DBPedia issue is resolved, see: + # https://github.com/dbpedia/extraction-framework/issues/396 + if '.' 
not in cover_filename.split(' .')[-1]: + self._log.debug(u'dbpedia provided incomplete cover_filename') + lpart, rpart = cover_filename.rsplit(' .', 1) + + # Query all the images in the page + wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={ + 'format': 'json', + 'action': 'query', + 'continue': '', + 'prop': 'images', + 'pageids': page_id}, + headers={'content-type': 'application/json'}) + + # Try to see if one of the images on the pages matches our + # incomplete cover_filename + try: + data = wikipedia_response.json() + results = data['query']['pages'][page_id]['images'] + for result in results: + if re.match(re.escape(lpart) + r'.*?\.' + re.escape(rpart), + result['title']): + cover_filename = result['title'] + break + except (ValueError, KeyError): + self._log.debug(u'failed to retrieve a cover_filename') + return + # Find the absolute url of the cover art on Wikipedia wikipedia_response = requests.get(self.WIKIPEDIA_URL, params={ 'format': 'json', @@ -221,15 +257,16 @@ class Wikipedia(ArtSource): 'continue': '', 'prop': 'imageinfo', 'iiprop': 'url', - 'titles': ('File:' + cover_filename).encode('utf-8')}, + 'titles': cover_filename.encode('utf-8')}, headers={'content-type': 'application/json'}) + try: data = wikipedia_response.json() results = data['query']['pages'] for _, result in results.iteritems(): image_url = result['imageinfo'][0]['url'] yield image_url - except: + except (ValueError, KeyError, IndexError): self._log.debug(u'error scraping wikipedia imageinfo') return diff --git a/docs/changelog.rst b/docs/changelog.rst index eb80cddbc..487c27ad9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -27,6 +27,8 @@ Fixes: * Fix sorting by paths when case-insensitive. :bug:`1451` * :doc:`/plugins/embedart`: Avoid an error when trying to embed invalid images into MPEG-4 files. +* :doc:`/plugins/fetchart`: The Wikipedia source is now able to better deal + with non-typical cased artists (e.g., alt-J, dEUS). 1.3.13 (April 24, 2015)