From 700c7cd9f8859b403d38d2268c169d9cc105f0aa Mon Sep 17 00:00:00 2001 From: Adrian Sampson Date: Wed, 7 Dec 2011 11:11:35 -0800 Subject: [PATCH] albumart.org scraper art source (#272) --- beets/autotag/art.py | 63 +++++++++++++++++++++++++++++++++++--------- docs/changelog.rst | 3 +++ test/test_art.py | 56 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+), 12 deletions(-) diff --git a/beets/autotag/art.py b/beets/autotag/art.py index 768704b3d..682f22a98 100644 --- a/beets/autotag/art.py +++ b/beets/autotag/art.py @@ -18,6 +18,7 @@ import urllib import sys import logging import os +import re from beets.autotag.mb import album_for_id @@ -28,29 +29,64 @@ COVER_NAMES = ['cover', 'front', 'art', 'album', 'folder'] log = logging.getLogger('beets') +CONTENT_TYPES = ('image/jpeg',) +def _fetch_image(url): + """Downloads an image from a URL and checks whether it seems to + actually be an image. If so, returns a path to the downloaded image. + Otherwise, returns None. + """ + log.debug('Downloading art: %s' % url) + try: + fn, headers = urllib.urlretrieve(url) + except IOError: + log.debug('error fetching art') + return + + # Make sure it's actually an image. + if headers.gettype() in CONTENT_TYPES: + log.debug('Downloaded art to: %s' % fn) + return fn + else: + log.debug('Not an image.') + + # Art from Amazon. AMAZON_URL = 'http://images.amazon.com/images/P/%s.%02i.LZZZZZZZ.jpg' AMAZON_INDICES = (1,2) -AMAZON_CONTENT_TYPE = 'image/jpeg' def art_for_asin(asin): """Fetches art for an Amazon ID (ASIN) string.""" for index in AMAZON_INDICES: # Fetch the image. url = AMAZON_URL % (asin, index) - try: - log.debug('Downloading art: %s' % url) - fn, headers = urllib.urlretrieve(url) - except IOError: - log.debug('error fetching art at URL %s' % url) - continue - - # Make sure it's actually an image. - if headers.gettype() == AMAZON_CONTENT_TYPE: - log.debug('Downloaded art to: %s' % fn) + fn = _fetch_image(url) + if fn: return fn +# AlbumArt.org scraper. 
+ +AAO_URL = 'http://www.albumart.org/index_detail.php' +AAO_PAT = r'href\s*=\s*"([^>"]*)"[^>]*title\s*=\s*"View larger image"' +def aao_art(asin): + # Get the page from albumart.org. + url = '%s?%s' % (AAO_URL, urllib.urlencode({'asin': asin})) + try: + log.debug('Scraping art URL: %s' % url) + page = urllib.urlopen(url).read() + except IOError: + log.debug('Error scraping art page') + return + + # Search the page for the image URL. + m = re.search(AAO_PAT, page) + if m: + image_url = m.group(1) + return _fetch_image(image_url) + else: + log.debug('No image found on page') + + # Art from the filesystem. def art_in_path(path): @@ -91,7 +127,10 @@ def art_for_album(album, path): if album.asin: log.debug('Fetching album art for ASIN %s.' % album.asin) - return art_for_asin(album.asin) + out = art_for_asin(album.asin) + if out: + return out + return aao_art(album.asin) else: log.debug('No ASIN available: no art found.') return None diff --git a/docs/changelog.rst b/docs/changelog.rst index c9bec389c..34e8f0216 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -41,6 +41,8 @@ Changelog via the ``ignore`` setting; see :doc:`/reference/config`. * The database now keeps track of files' modification times so that, during an ``update``, unmodified files can be skipped. (Thanks to Jos van der Til.) +* The album art fetcher now uses `albumart.org`_ as a fallback when the Amazon + art downloader fails. * A new ``timeout`` config value avoids database locking errors on slow systems. * Fix a crash after using the "as Tracks" option during import. * Fix a Unicode error when tagging items with missing titles. @@ -53,6 +55,7 @@ Changelog .. _acoustid: http://acoustid.org/ .. _Peter Brunner: https://github.com/Lugoues .. _Simon Chopin: https://github.com/laarmen +.. 
_albumart.org: http://www.albumart.org/ 1.0b10 (September 22, 2011) --------------------------- diff --git a/test/test_art.py b/test/test_art.py index 9833d24dc..4e9d5bab9 100644 --- a/test/test_art.py +++ b/test/test_art.py @@ -21,6 +21,7 @@ from beets.autotag import art from beets.autotag import AlbumInfo import os import shutil +import StringIO class MockHeaders(object): def __init__(self, typeval): @@ -31,7 +32,9 @@ class MockUrlRetrieve(object): def __init__(self, pathval, typeval): self.pathval = pathval self.headers = MockHeaders(typeval) + self.fetched = None def __call__(self, url): + self.fetched = url return self.pathval, self.headers class AmazonArtTest(unittest.TestCase): @@ -72,8 +75,16 @@ class CombinedTest(unittest.TestCase): def setUp(self): self.dpath = os.path.join(_common.RSRC, 'arttest') os.mkdir(self.dpath) + self.old_urlopen = art.urllib.urlopen + art.urllib.urlopen = self._urlopen + self.page_text = "" def tearDown(self): shutil.rmtree(self.dpath) + art.urllib.urlopen = self.old_urlopen + + def _urlopen(self, url): + self.urlopen_called = True + return StringIO.StringIO(self.page_text) def test_main_interface_returns_amazon_art(self): art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'image/jpeg') @@ -99,6 +110,51 @@ class CombinedTest(unittest.TestCase): artpath = art.art_for_album(album, self.dpath) self.assertEqual(artpath, 'anotherpath') + def test_main_interface_tries_amazon_before_aao(self): + self.urlopen_called = False + art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'image/jpeg') + album = AlbumInfo(None, None, None, None, None, asin='xxxx') + art.art_for_album(album, self.dpath) + self.assertFalse(self.urlopen_called) + + def test_main_interface_falls_back_to_aao(self): + self.urlopen_called = False + art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'text/html') + album = AlbumInfo(None, None, None, None, None, asin='xxxx') + art.art_for_album(album, self.dpath) + self.assertTrue(self.urlopen_called) + +class 
AAOTest(unittest.TestCase): + def setUp(self): + self.old_urlopen = art.urllib.urlopen + self.old_urlretrieve = art.urllib.urlretrieve + art.urllib.urlopen = self._urlopen + self.retriever = MockUrlRetrieve('somepath', 'image/jpeg') + art.urllib.urlretrieve = self.retriever + self.page_text = '' + def tearDown(self): + art.urllib.urlopen = self.old_urlopen + art.urllib.urlretrieve = self.old_urlretrieve + + def _urlopen(self, url): + return StringIO.StringIO(self.page_text) + + def test_aao_scraper_finds_image(self): + self.page_text = """ +
+ <a href="TARGET_URL" title="View larger image"> + View larger image</a> + """ + res = art.aao_art('x') + self.assertEqual(self.retriever.fetched, 'TARGET_URL') + self.assertEqual(res, 'somepath') + + def test_aao_scraper_returns_none_when_no_image_present(self): + self.page_text = "blah blah" + res = art.aao_art('x') + self.assertEqual(self.retriever.fetched, None) + self.assertEqual(res, None) + def suite(): return unittest.TestLoader().loadTestsFromName(__name__)