From d47182a3e5f758aaee5adca7398fed1b3d9145c8 Mon Sep 17 00:00:00 2001 From: wordofglass Date: Thu, 23 Jun 2016 16:48:17 +0200 Subject: [PATCH 1/8] fetchart: do not rely on the server-provided Content-Type, but determine the image format from the file magic --- beetsplug/fetchart.py | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index a2ce1661d..c629aeca6 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -29,6 +29,7 @@ from beets import importer from beets import ui from beets import util from beets import config +from beets.mediafile import _image_mime_type from beets.util.artresizer import ArtResizer from beets.util import confit from beets.util import syspath, bytestring_path @@ -231,21 +232,39 @@ class RemoteArtSource(ArtSource): message=u'downloading image')) as resp: ct = resp.headers.get('Content-Type', None) if ct not in CONTENT_TYPES: - self._log.debug( - u'not a supported image: {}', - resp.headers.get('Content-Type') or u'no content type', - ) + self._log.debug(u'not a supported image: {}', + ct or u'no content type') candidate.path = None return - # Generate a temporary file with the correct extension. - with NamedTemporaryFile(suffix=b'.' + CONTENT_TYPES[ct][0], - delete=False) as fh: - for chunk in resp.iter_content(chunk_size=1024): + # Generate a temporary file and guess the extension based on + # the Content-Type header. This may be wrong for badly + # configured servers. E.g. fanart.tv only allows .jpg uploads + # and ALWAYS returns a image/jpeg Content-Type, but other + # formats with a erroneous .jp(e)g extension apparently can + # sneak through the upload filter. Therefore validate the type + # using the file magic. + data = resp.iter_content(chunk_size=1024) + try: + chunk = next(data) + except StopIteration: + pass + else: + real_ct = _image_mime_type(chunk) + ext = b'.' + CONTENT_TYPES[real_ct][0] + if real_ct != ct: + self._log.warn(u'Server specified {}, but returned a ' + u'{} image. Correcting the extension ' + u'to {}', + ct, real_ct, ext) + with NamedTemporaryFile(suffix=ext, delete=False) as fh: fh.write(chunk) - self._log.debug(u'downloaded art to: {0}', - util.displayable_path(fh.name)) - candidate.path = util.bytestring_path(fh.name) + for chunk in data: + fh.write(chunk) + self._log.debug(u'downloaded art to: {0}', + util.displayable_path(fh.name)) + candidate.path = util.bytestring_path(fh.name) + return except (IOError, requests.RequestException, TypeError) as exc: From 34cdf0f3f382acc855e87c680bb231fad27777dc Mon Sep 17 00:00:00 2001 From: wordofglass Date: Thu, 23 Jun 2016 16:58:57 +0200 Subject: [PATCH 2/8] fetchart: complain about unsupported images only after validating the Content-Type --- beetsplug/fetchart.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index c629aeca6..9614b17f2 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -231,11 +231,6 @@ class RemoteArtSource(ArtSource): with closing(self.request(candidate.url, stream=True, message=u'downloading image')) as resp: ct = resp.headers.get('Content-Type', None) - if ct not in CONTENT_TYPES: - self._log.debug(u'not a supported image: {}', - ct or u'no content type') - candidate.path = None - return # Generate a temporary file and guess the extension based on # the Content-Type header. This may be wrong for badly @@ -251,12 +246,19 @@ class RemoteArtSource(ArtSource): pass else: real_ct = _image_mime_type(chunk) + if real_ct not in CONTENT_TYPES: + self._log.debug(u'not a supported image: {}', + real_ct or u'unknown content type') + candidate.path = None + return + ext = b'.' + CONTENT_TYPES[real_ct][0] if real_ct != ct: self._log.warn(u'Server specified {}, but returned a ' u'{} image. Correcting the extension ' u'to {}', ct, real_ct, ext) + with NamedTemporaryFile(suffix=ext, delete=False) as fh: fh.write(chunk) for chunk in data: From c789d31614a94009f68778bb1fa6bc4edf26a7eb Mon Sep 17 00:00:00 2001 From: wordofglass Date: Thu, 23 Jun 2016 17:30:10 +0200 Subject: [PATCH 3/8] fetchart: fix and add tests for the new behaviour --- test/test_art.py | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/test/test_art.py b/test/test_art.py index 58b15d8e5..e2867a7ec 100644 --- a/test/test_art.py +++ b/test/test_art.py @@ -45,16 +45,26 @@ class UseThePlugin(_common.TestCase): self.plugin = fetchart.FetchArtPlugin() -class FetchImageTest(UseThePlugin): - URL = 'http://example.com/test.jpg' - +class FetchImageHelper(_common.TestCase): + """Helper mixin for mocking requests when fetching images + with remote art sources. + """ @responses.activate def run(self, *args, **kwargs): - super(FetchImageTest, self).run(*args, **kwargs) + super(FetchImageHelper, self).run(*args, **kwargs) - def mock_response(self, content_type): - responses.add(responses.GET, self.URL, - content_type=content_type) + def mock_response(self, url, content_type='image/jpeg', file_type=None): + IMAGEHEADER = {'image/jpeg': b'\x00' * 6 + b'JFIF', + 'image/png': b'\211PNG\r\n\032\n', } + if file_type is None: + file_type = content_type + responses.add(responses.GET, url, + content_type=content_type, + body=IMAGEHEADER.get(file_type, b'\x00' * 32)) + + +class FetchImageTest(FetchImageHelper, UseThePlugin): + URL = 'http://example.com/test.jpg' def setUp(self): super(FetchImageTest, self).setUp() @@ -64,17 +74,23 @@ class FetchImageTest(UseThePlugin): self.candidate = fetchart.Candidate(logger, url=self.URL) def test_invalid_type_returns_none(self): - self.mock_response('image/watercolour') + self.mock_response(self.URL, 'image/watercolour') self.source.fetch_image(self.candidate, self.extra) self.assertEqual(self.candidate.path, None) def test_jpeg_type_returns_path(self): - self.mock_response('image/jpeg') + self.mock_response(self.URL, 'image/jpeg') self.source.fetch_image(self.candidate, self.extra) self.assertNotEqual(self.candidate.path, None) def test_extension_set_by_content_type(self): - self.mock_response('image/png') + self.mock_response(self.URL, 'image/png') + self.source.fetch_image(self.candidate, self.extra) + self.assertEqual(os.path.splitext(self.candidate.path)[1], b'.png') + self.assertExists(self.candidate.path) + + def test_does_not_rely_on_server_content_type(self): + self.mock_response(self.URL, 'image/jpeg', 'image/png') self.source.fetch_image(self.candidate, self.extra) self.assertEqual(os.path.splitext(self.candidate.path)[1], b'.png') self.assertExists(self.candidate.path) @@ -128,7 +144,7 @@ class FSArtTest(UseThePlugin): self.assertEqual(candidates, paths) -class CombinedTest(UseThePlugin): +class CombinedTest(FetchImageHelper, UseThePlugin): ASIN = 'xxxx' MBID = 'releaseid' AMAZON_URL = 'http://images.amazon.com/images/P/{0}.01.LZZZZZZZ.jpg' \ @@ -143,13 +159,6 @@ class CombinedTest(UseThePlugin): self.dpath = os.path.join(self.temp_dir, b'arttest') os.mkdir(self.dpath) - @responses.activate - def run(self, *args, **kwargs): - super(CombinedTest, self).run(*args, **kwargs) - - def mock_response(self, url, content_type='image/jpeg'): - responses.add(responses.GET, url, content_type=content_type) - def test_main_interface_returns_amazon_art(self): self.mock_response(self.AMAZON_URL) album = _common.Bag(asin=self.ASIN) From 9968288358784bb8a6d051cb9acc93eadeea8215 Mon Sep 17 00:00:00 2001 From: wordofglass Date: Sat, 25 Jun 2016 13:25:29 +0200 Subject: [PATCH 4/8] fetchart: update comments --- beetsplug/fetchart.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index 9614b17f2..2ac7fb8ab 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -232,20 +232,25 @@ class RemoteArtSource(ArtSource): message=u'downloading image')) as resp: ct = resp.headers.get('Content-Type', None) - # Generate a temporary file and guess the extension based on - # the Content-Type header. This may be wrong for badly - # configured servers. E.g. fanart.tv only allows .jpg uploads - # and ALWAYS returns a image/jpeg Content-Type, but other - # formats with a erroneous .jp(e)g extension apparently can - # sneak through the upload filter. Therefore validate the type - # using the file magic. + # Download the image to a temporary file. As some servers + # (notably fanart.tv) have proven to return wrong Content-Types + # when images were uploaded with a bad file extension, do not + # rely on it. Instead validate the type using the file magic + # and only then determine the extension. data = resp.iter_content(chunk_size=1024) try: + # stream only a small part of the image to get its header chunk = next(data) except StopIteration: pass else: real_ct = _image_mime_type(chunk) + if real_ct is None: + # detection by file magic failed, fall back to the + # server-supplied Content-Type + # Is our type detection failsafe enough to drop this? + real_ct = ct + if real_ct not in CONTENT_TYPES: self._log.debug(u'not a supported image: {}', real_ct or u'unknown content type') @@ -260,7 +265,9 @@ class RemoteArtSource(ArtSource): ct, real_ct, ext) with NamedTemporaryFile(suffix=ext, delete=False) as fh: + # write the first already loaded part of the image fh.write(chunk) + # download the remaining part of the image for chunk in data: fh.write(chunk) self._log.debug(u'downloaded art to: {0}', From 697291b04e34dcbb2e8297ed10191381e5d9b077 Mon Sep 17 00:00:00 2001 From: wordofglass Date: Sat, 25 Jun 2016 13:43:20 +0200 Subject: [PATCH 5/8] fetchart: improve error resilience of the Content-Type detection by file magic --- beetsplug/fetchart.py | 67 +++++++++++++++++++++++-------------------- test/test_art.py | 3 +- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index 2ac7fb8ab..af44a7660 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -238,42 +238,47 @@ class RemoteArtSource(ArtSource): # rely on it. Instead validate the type using the file magic # and only then determine the extension. data = resp.iter_content(chunk_size=1024) - try: - # stream only a small part of the image to get its header - chunk = next(data) - except StopIteration: - pass + header = b'' + for chunk in data: + header += chunk + if len(header) >= 32: + # The imghdr module will only read 32 bytes, and our + # own additions in mediafile even less. + break else: - real_ct = _image_mime_type(chunk) - if real_ct is None: - # detection by file magic failed, fall back to the - # server-supplied Content-Type - # Is our type detection failsafe enough to drop this? - real_ct = ct + # server didn't return enough data, i.e. corrupt image + return - if real_ct not in CONTENT_TYPES: - self._log.debug(u'not a supported image: {}', - real_ct or u'unknown content type') - candidate.path = None - return + real_ct = _image_mime_type(header) + if real_ct is None: + # detection by file magic failed, fall back to the + # server-supplied Content-Type + # Is our type detection failsafe enough to drop this? + real_ct = ct - ext = b'.' + CONTENT_TYPES[real_ct][0] - if real_ct != ct: - self._log.warn(u'Server specified {}, but returned a ' - u'{} image. Correcting the extension ' - u'to {}', - ct, real_ct, ext) + if real_ct not in CONTENT_TYPES: + self._log.debug(u'not a supported image: {}', + real_ct or u'unknown content type') + candidate.path = None + return - with NamedTemporaryFile(suffix=ext, delete=False) as fh: - # write the first already loaded part of the image + ext = b'.' + CONTENT_TYPES[real_ct][0] + + if real_ct != ct: + self._log.warn(u'Server specified {}, but returned a ' + u'{} image. Correcting the extension ' + u'to {}', + ct, real_ct, ext) + + with NamedTemporaryFile(suffix=ext, delete=False) as fh: + # write the first already loaded part of the image + fh.write(header) + # download the remaining part of the image + for chunk in data: fh.write(chunk) - # download the remaining part of the image - for chunk in data: - fh.write(chunk) - self._log.debug(u'downloaded art to: {0}', - util.displayable_path(fh.name)) - candidate.path = util.bytestring_path(fh.name) - + self._log.debug(u'downloaded art to: {0}', + util.displayable_path(fh.name)) + candidate.path = util.bytestring_path(fh.name) return except (IOError, requests.RequestException, TypeError) as exc: diff --git a/test/test_art.py b/test/test_art.py index e2867a7ec..1388282f8 100644 --- a/test/test_art.py +++ b/test/test_art.py @@ -60,7 +60,8 @@ class FetchImageHelper(_common.TestCase): file_type = content_type responses.add(responses.GET, url, content_type=content_type, - body=IMAGEHEADER.get(file_type, b'\x00' * 32)) + # imghdr reads 32 bytes + body=IMAGEHEADER.get(file_type, b'').ljust(32, b'\x00')) class FetchImageTest(FetchImageHelper, UseThePlugin): From 5d00ca8bdaa97ab35e895bcef627eba67b30d587 Mon Sep 17 00:00:00 2001 From: wordofglass Date: Sat, 25 Jun 2016 13:46:26 +0200 Subject: [PATCH 6/8] fetchart: do not unnecessarily reset Candidate.path on download failure --- beetsplug/fetchart.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index af44a7660..4d4158ec1 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -259,7 +259,6 @@ class RemoteArtSource(ArtSource): if real_ct not in CONTENT_TYPES: self._log.debug(u'not a supported image: {}', real_ct or u'unknown content type') - candidate.path = None return ext = b'.' + CONTENT_TYPES[real_ct][0] @@ -285,7 +284,6 @@ class RemoteArtSource(ArtSource): # Handling TypeError works around a urllib3 bug: # https://github.com/shazow/urllib3/issues/556 self._log.debug(u'error fetching art: {}', exc) - candidate.path = None return From 2db1530ef8d3a09b3436ceab296dbbedec1ecf17 Mon Sep 17 00:00:00 2001 From: wordofglass Date: Sat, 25 Jun 2016 21:55:44 +0200 Subject: [PATCH 7/8] fetchart: flake8 fixes --- test/test_art.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_art.py b/test/test_art.py index 1388282f8..8659cf12a 100644 --- a/test/test_art.py +++ b/test/test_art.py @@ -46,22 +46,24 @@ class UseThePlugin(_common.TestCase): class FetchImageHelper(_common.TestCase): - """Helper mixin for mocking requests when fetching images + """Helper mixin for mocking requests when fetching images with remote art sources. """ @responses.activate def run(self, *args, **kwargs): super(FetchImageHelper, self).run(*args, **kwargs) + IMAGEHEADER = {'image/jpeg': b'\x00' * 6 + b'JFIF', + 'image/png': b'\211PNG\r\n\032\n', } + def mock_response(self, url, content_type='image/jpeg', file_type=None): - IMAGEHEADER = {'image/jpeg': b'\x00' * 6 + b'JFIF', - 'image/png': b'\211PNG\r\n\032\n', } if file_type is None: file_type = content_type responses.add(responses.GET, url, content_type=content_type, # imghdr reads 32 bytes - body=IMAGEHEADER.get(file_type, b'').ljust(32, b'\x00')) + body=self.IMAGEHEADER.get( + file_type, b'').ljust(32, b'\x00')) class FetchImageTest(FetchImageHelper, UseThePlugin): From 7bf6554cdc8031662795132dbedcf35dbe97debe Mon Sep 17 00:00:00 2001 From: wordofglass Date: Sat, 25 Jun 2016 21:57:10 +0200 Subject: [PATCH 8/8] fetchart: update changelog --- docs/changelog.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 4cd44372d..b01af1495 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -40,9 +40,9 @@ Fixes: * Fix a crash when specifying non-ASCII format strings on the command line with the ``-f`` option to many commands. :bug:`2063` * :doc:`/plugins/fetchart`: Determine the file extension for downloaded images - based on the ``Content-Type`` header instead of hardcoding it to .jpg. - Reported in :bug:`2053`, which for now it does not fix due to - server-side issues, though. + based on the image's magic bytes and warn if result is not consistent with + the server-supplied ``Content-Type`` header instead of + hardcoding it to .jpg. :bug:`2053` 1.3.18 (May 31, 2016) ---------------------