From 700c7cd9f8859b403d38d2268c169d9cc105f0aa Mon Sep 17 00:00:00 2001
From: Adrian Sampson <adrian@radbox.org>
Date: Wed, 7 Dec 2011 11:11:35 -0800
Subject: [PATCH] albumart.org scraper art source (#272)

---
 beets/autotag/art.py | 63 +++++++++++++++++++++++++++++++++++---------
 docs/changelog.rst   |  3 +++
 test/test_art.py     | 56 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 110 insertions(+), 12 deletions(-)

diff --git a/beets/autotag/art.py b/beets/autotag/art.py
index 768704b3d..682f22a98 100644
--- a/beets/autotag/art.py
+++ b/beets/autotag/art.py
@@ -18,6 +18,7 @@ import urllib
 import sys
 import logging
 import os
+import re
 
 from beets.autotag.mb import album_for_id
 
@@ -28,29 +29,64 @@ COVER_NAMES = ['cover', 'front', 'art', 'album', 'folder']
 log = logging.getLogger('beets')
 
 
+CONTENT_TYPES = ('image/jpeg',)
+def _fetch_image(url):
+    """Download `url` and return the path of the saved file, provided
+    the response's content type is one of CONTENT_TYPES. Returns None
+    when the download fails or the content type is not an image type.
+    """
+    log.debug('Downloading art: %s' % url)
+    try:
+        path, info = urllib.urlretrieve(url)
+    except IOError:
+        log.debug('error fetching art')
+        return None
+
+    # Anything not labelled with an accepted content type is rejected.
+    if info.gettype() not in CONTENT_TYPES:
+        log.debug('Not an image.')
+        return None
+    log.debug('Downloaded art to: %s' % path)
+    return path
+
+
 # Art from Amazon.
 
 AMAZON_URL = 'http://images.amazon.com/images/P/%s.%02i.LZZZZZZZ.jpg'
 AMAZON_INDICES = (1,2)
-AMAZON_CONTENT_TYPE = 'image/jpeg'
 def art_for_asin(asin):
     """Fetches art for an Amazon ID (ASIN) string."""
     for index in AMAZON_INDICES:
         # Fetch the image.
         url = AMAZON_URL % (asin, index)
-        try:
-            log.debug('Downloading art: %s' % url)
-            fn, headers = urllib.urlretrieve(url)
-        except IOError:
-            log.debug('error fetching art at URL %s' % url)
-            continue
-            
-        # Make sure it's actually an image.
-        if headers.gettype() == AMAZON_CONTENT_TYPE:
-            log.debug('Downloaded art to: %s' % fn)
+        fn = _fetch_image(url)
+        if fn:
             return fn
 
 
+# AlbumArt.org scraper.
+
+AAO_URL = 'http://www.albumart.org/index_detail.php'
+AAO_PAT = r'href\s*=\s*"([^>"]*)"[^>]*title\s*=\s*"View larger image"'
+def aao_art(asin):
+    """Scrape albumart.org for `asin`; return an image path or None."""
+    query = urllib.urlencode({'asin': asin})
+    url = '%s?%s' % (AAO_URL, query)
+    try:
+        log.debug('Scraping art URL: %s' % url)
+        page = urllib.urlopen(url).read()
+    except IOError:
+        log.debug('Error scraping art page')
+        return None
+
+    # AAO_PAT captures the href of the "View larger image" link.
+    found = re.search(AAO_PAT, page)
+    if found is None:
+        log.debug('No image found on page')
+        return None
+    return _fetch_image(found.group(1))
+
+
 # Art from the filesystem.
 
 def art_in_path(path):
@@ -91,7 +127,10 @@ def art_for_album(album, path):
 
     if album.asin:
         log.debug('Fetching album art for ASIN %s.' % album.asin)
-        return art_for_asin(album.asin)
+        out = art_for_asin(album.asin)
+        if out:
+            return out
+        return aao_art(album.asin)
     else:
         log.debug('No ASIN available: no art found.')
         return None
diff --git a/docs/changelog.rst b/docs/changelog.rst
index c9bec389c..34e8f0216 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -41,6 +41,8 @@ Changelog
   via the ``ignore`` setting; see :doc:`/reference/config`.
 * The database now keeps track of files' modification times so that, during
   an ``update``, unmodified files can be skipped. (Thanks to Jos van der Til.)
+* The album art fetcher now uses `albumart.org`_ as a fallback when the Amazon
+  art downloader fails.
 * A new ``timeout`` config value avoids database locking errors on slow systems.
 * Fix a crash after using the "as Tracks" option during import.
 * Fix a Unicode error when tagging items with missing titles.
@@ -53,6 +55,7 @@ Changelog
 .. _acoustid: http://acoustid.org/
 .. _Peter Brunner: https://github.com/Lugoues
 .. _Simon Chopin: https://github.com/laarmen
+.. _albumart.org: http://www.albumart.org/
 
 1.0b10 (September 22, 2011)
 ---------------------------
diff --git a/test/test_art.py b/test/test_art.py
index 9833d24dc..4e9d5bab9 100644
--- a/test/test_art.py
+++ b/test/test_art.py
@@ -21,6 +21,7 @@ from beets.autotag import art
 from beets.autotag import AlbumInfo
 import os
 import shutil
+import StringIO
 
 class MockHeaders(object):
     def __init__(self, typeval):
@@ -31,7 +32,9 @@ class MockUrlRetrieve(object):
     def __init__(self, pathval, typeval):
         self.pathval = pathval
         self.headers = MockHeaders(typeval)
+        self.fetched = None
     def __call__(self, url):
+        self.fetched = url
         return self.pathval, self.headers
 
 class AmazonArtTest(unittest.TestCase):
@@ -72,8 +75,16 @@ class CombinedTest(unittest.TestCase):
     def setUp(self):
         self.dpath = os.path.join(_common.RSRC, 'arttest')
         os.mkdir(self.dpath)
+        self.old_urlopen = art.urllib.urlopen
+        art.urllib.urlopen = self._urlopen
+        self.page_text = ""
     def tearDown(self):
         shutil.rmtree(self.dpath)
+        art.urllib.urlopen = self.old_urlopen
+
+    def _urlopen(self, url):
+        self.urlopen_called = True
+        return StringIO.StringIO(self.page_text)
 
     def test_main_interface_returns_amazon_art(self):
         art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'image/jpeg')
@@ -99,6 +110,51 @@ class CombinedTest(unittest.TestCase):
         artpath = art.art_for_album(album, self.dpath)
         self.assertEqual(artpath, 'anotherpath')
 
+    def test_main_interface_tries_amazon_before_aao(self):
+        art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'image/jpeg')
+        self.urlopen_called = False
+        info = AlbumInfo(None, None, None, None, None, asin='xxxx')
+        art.art_for_album(info, self.dpath)
+        self.assertFalse(self.urlopen_called)
+
+    def test_main_interface_falls_back_to_aao(self):
+        art.urllib.urlretrieve = MockUrlRetrieve('anotherpath', 'text/html')
+        self.urlopen_called = False
+        info = AlbumInfo(None, None, None, None, None, asin='xxxx')
+        art.art_for_album(info, self.dpath)
+        self.assertTrue(self.urlopen_called)
+
+class AAOTest(unittest.TestCase):
+    """Exercise art.aao_art with urlopen and urlretrieve stubbed out."""
+    def setUp(self):
+        self.page_text = ''
+        self.retriever = MockUrlRetrieve('somepath', 'image/jpeg')
+        self.old_urlopen = art.urllib.urlopen
+        self.old_urlretrieve = art.urllib.urlretrieve
+        art.urllib.urlopen = self._urlopen
+        art.urllib.urlretrieve = self.retriever
+    def tearDown(self):
+        art.urllib.urlretrieve = self.old_urlretrieve
+        art.urllib.urlopen = self.old_urlopen
+
+    def _urlopen(self, url):
+        # Serve the canned page regardless of the requested URL.
+        return StringIO.StringIO(self.page_text)
+
+    def test_aao_scraper_finds_image(self):
+        self.page_text = """
+        <br />
+        <a href="TARGET_URL" title="View larger image" class="thickbox" style="color: #7E9DA2; text-decoration:none;">
+        <img src="http://www.albumart.org/images/zoom-icon.jpg" alt="View larger image" width="17" height="15"  border="0"/></a>
+        """
+        self.assertEqual(art.aao_art('x'), 'somepath')
+        self.assertEqual(self.retriever.fetched, 'TARGET_URL')
+
+    def test_aao_scraper_returns_none_when_no_image_present(self):
+        self.page_text = "blah blah"
+        self.assertEqual(art.aao_art('x'), None)
+        self.assertEqual(self.retriever.fetched, None)
+
 def suite():
     return unittest.TestLoader().loadTestsFromName(__name__)