Clean up code and tests for Genius Lyrics plugin backend (#3641)

* clean-up code & add tests for genius lyrics backend * add genius fetch tests * organize imports: standard lib -> pip -> local * check in sample genius lyrics page * fix mock import * force utf-8 encoding for opened files * use io.open to force utf-8 encoding w/ python2.7
2026-02-18 21:36:35 +01:00 · 2020-07-06 08:41:27 -05:00 · 2020-07-06 08:41:27 -05:00 · 45abc9ed7a
commit 45abc9ed7a
parent cb668ccdab
4 changed files with 2356 additions and 82 deletions
--- a/.gitignore
+++ b/.gitignore
@ -7,7 +7,6 @@

 # Project Specific patterns
 man
-test/rsrc/lyrics/*

 # The rest is from https://www.gitignore.io/api/python

--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -359,16 +359,54 @@ class Genius(Backend):
            'User-Agent': USER_AGENT,
        }

-    def lyrics_from_song_page(self, page_url):
-        # Gotta go regular html scraping... come on Genius.
-        self._log.debug(u'fetching lyrics from: {0}', page_url)
-        try:
-            page = requests.get(page_url)
-        except requests.RequestException as exc:
-            self._log.debug(u'Genius page request for {0} failed: {1}',
-                            page_url, exc)
+    def fetch(self, artist, title):
+        """Fetch lyrics from genius.com
+
+        Because genius doesn't allow accesssing lyrics via the api,
+        we first query the api for a url matching our artist & title,
+        then attempt to scrape that url for the lyrics.
+        """
+        json = self._search(artist, title)
+        if not json:
+            self._log.debug(u'Genius API request returned invalid JSON')
            return None
-        html = BeautifulSoup(page.text, "html.parser")
+
+        # find a matching artist in the json
+        for hit in json["response"]["hits"]:
+            hit_artist = hit["result"]["primary_artist"]["name"]
+
+            if slug(hit_artist) == slug(artist):
+                return self._scrape_lyrics_from_html(
+                    self.fetch_url(hit["result"]["url"]))
+
+        self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
+                        artist)
+
+    def _search(self, artist, title):
+        """Searches the genius api for a given artist and title
+
+        https://docs.genius.com/#search-h2
+
+        :returns: json response
+        """
+        search_url = self.base_url + "/search"
+        data = {'q': title + " " + artist.lower()}
+        try:
+            response = requests.get(
+                search_url, data=data, headers=self.headers)
+        except requests.RequestException as exc:
+            self._log.debug(u'Genius API request failed: {0}', exc)
+            return None
+
+        try:
+            return response.json()
+        except ValueError:
+            return None
+
+    def _scrape_lyrics_from_html(self, html):
+        """Scrape lyrics from a given genius.com html"""
+
+        html = BeautifulSoup(html, "html.parser")

        # Remove script tags that they put in the middle of the lyrics.
        [h.extract() for h in html('script')]
@ -402,31 +440,6 @@ class Genius(Backend):

        return lyrics_div.get_text()

-    def fetch(self, artist, title):
-        search_url = self.base_url + "/search"
-        data = {'q': title + " " + artist.lower()}
-        try:
-            response = requests.get(search_url, data=data,
-                                    headers=self.headers)
-        except requests.RequestException as exc:
-            self._log.debug(u'Genius API request failed: {0}', exc)
-            return None
-
-        try:
-            json = response.json()
-        except ValueError:
-            self._log.debug(u'Genius API request returned invalid JSON')
-            return None
-
-        for hit in json["response"]["hits"]:
-            hit_artist = hit["result"]["primary_artist"]["name"]
-
-            if slug(hit_artist) == slug(artist):
-                return self.lyrics_from_song_page(hit["result"]["url"])
-
-        self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
-                        artist)
-

 class LyricsWiki(SymbolsReplaced):
    """Fetch lyrics from LyricsWiki."""
--- a/test/rsrc/lyrics/geniuscom/Wutangclancreamlyrics.txt
+++ b/test/rsrc/lyrics/geniuscom/Wutangclancreamlyrics.txt
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -18,23 +18,21 @@
 from __future__ import absolute_import, division, print_function

 import itertools
+from io import open
 import os
 import re
 import six
 import sys
 import unittest

-from mock import patch
-from test import _common
+import confuse
+from mock import MagicMock, patch

 from beets import logging
 from beets.library import Item
 from beets.util import bytestring_path
-import confuse
-
 from beetsplug import lyrics
-
-from mock import MagicMock
+from test import _common


 log = logging.getLogger('beets.test_lyrics')
@ -232,38 +230,11 @@ class MockFetchUrl(object):
    def __call__(self, url, filename=None):
        self.fetched = url
        fn = url_to_filename(url)
-        with open(fn, 'r') as f:
+        with open(fn, 'r', encoding="utf8") as f:
            content = f.read()
        return content


-class GeniusMockGet(object):
-
-    def __init__(self, pathval='fetched_path'):
-        self.pathval = pathval
-        self.fetched = None
-
-    def __call__(self, url, headers=False):
-        from requests.models import Response
-        # for the first requests.get() return a path
-        if headers:
-            response = Response()
-            response.status_code = 200
-            response._content = b'{"meta":{"status":200},\
-                                "response":{"song":{"path":"/lyrics/sample"}}}'
-            return response
-        # for the second requests.get() return the genius page
-        else:
-            from mock import PropertyMock
-            self.fetched = url
-            fn = url_to_filename(url)
-            with open(fn, 'r') as f:
-                content = f.read()
-            response = Response()
-            type(response).text = PropertyMock(return_value=content)
-            return response
-
-
 def is_lyrics_content_ok(title, text):
    """Compare lyrics text to expected lyrics for given title."""
    if not text:
@ -445,8 +416,9 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
        google.is_page_candidate(url, url_title, s['title'], u'Sunn O)))')


-class LyricsGeniusBaseTest(unittest.TestCase):
+# test Genius backend

+class GeniusBaseTest(unittest.TestCase):
    def setUp(self):
        """Set up configuration."""
        try:
@ -457,28 +429,91 @@ class LyricsGeniusBaseTest(unittest.TestCase):
            self.skipTest("Python's built-in HTML parser is not good enough")


-class LyricsGeniusScrapeTest(LyricsGeniusBaseTest):
-
-    """Checks that Genius backend works as intended.
-    """
-    import requests
+class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
+    """tests Genius._scrape_lyrics_from_html()"""

    def setUp(self):
        """Set up configuration"""
-        LyricsGeniusBaseTest.setUp(self)
+        GeniusBaseTest.setUp(self)
        self.plugin = lyrics.LyricsPlugin()

-    @patch.object(requests, 'get', GeniusMockGet())
    def test_no_lyrics_div(self):
-        """Ensure that `lyrics_from_song_page` doesn't crash when the html
-        for a Genius page doesn't contain <div class="lyrics"></div>
+        """Ensure we don't crash when the scraping the html for a genius page
+        doesn't contain <div class="lyrics"></div>
        """
        # https://github.com/beetbox/beets/issues/3535
        # expected return value None
-        song_url = 'https://genius.com/sample'
-        self.assertEqual(genius.lyrics_from_song_page(song_url),
-                         None)
+        url = 'https://genius.com/sample'
+        mock = MockFetchUrl()
+        self.assertEqual(genius._scrape_lyrics_from_html(mock(url)), None)

+    def test_good_lyrics(self):
+        """Ensure we are able to scrape a page with lyrics"""
+        url = 'https://genius.com/Wu-tang-clan-cream-lyrics'
+        mock = MockFetchUrl()
+        self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
+
+    # TODO: find an example of a lyrics page with multiple divs and test it
+
+
+class GeniusFetchTest(GeniusBaseTest):
+    """tests Genius.fetch()"""
+
+    def setUp(self):
+        """Set up configuration"""
+        GeniusBaseTest.setUp(self)
+        self.plugin = lyrics.LyricsPlugin()
+
+    @patch.object(lyrics.Genius, '_scrape_lyrics_from_html')
+    @patch.object(lyrics.Backend, 'fetch_url', return_value=True)
+    def test_json(self, mock_fetch_url, mock_scrape):
+        """Ensure we're finding artist matches"""
+        with patch.object(
+            lyrics.Genius, '_search', return_value={
+                "response": {
+                    "hits": [
+                        {
+                            "result": {
+                                "primary_artist": {
+                                    "name": u"\u200Bblackbear",
+                                    },
+                                "url": "blackbear_url"
+                                }
+                        },
+                        {
+                            "result": {
+                                "primary_artist": {
+                                    "name": u"El\u002Dp"
+                                    },
+                                "url": "El-p_url"
+                                }
+                        }
+                    ]
+                }
+            }
+        ) as mock_json:
+            # genius uses zero-width-spaces (\u200B) for lowercase
+            # artists so we make sure we can match those
+            self.assertIsNotNone(genius.fetch('blackbear', 'Idfc'))
+            mock_fetch_url.assert_called_once_with("blackbear_url")
+            mock_scrape.assert_called_once_with(True)
+
+            # genius uses the hypen minus (\u002D) as their dash
+            self.assertIsNotNone(genius.fetch('El-p', 'Idfc'))
+            mock_fetch_url.assert_called_with('El-p_url')
+            mock_scrape.assert_called_with(True)
+
+            # test no matching artist
+            self.assertIsNone(genius.fetch('doesntexist', 'none'))
+
+            # test invalid json
+            mock_json.return_value = None
+            self.assertIsNone(genius.fetch('blackbear', 'Idfc'))
+
+    # TODO: add integration test hitting real api
+
+
+# test utilties

 class SlugTests(unittest.TestCase):