From ac043f9be0094089ed29ec8c0552781994945607 Mon Sep 17 00:00:00 2001 From: stlutz Date: Sat, 16 May 2020 13:14:39 +0200 Subject: [PATCH 1/5] When fetching lyrics from Genius, search for title and artist. Searching only for the title and just verifying the artist afterwards leads to songs with very common titles not being found, since Genius limits the amount of returned hits. An example would be 'Saviour' by 'Circa Waves'. --- beetsplug/lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 20e1c8858..3a56cc738 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -390,7 +390,7 @@ class Genius(Backend): def fetch(self, artist, title): search_url = self.base_url + "/search" - data = {'q': title} + data = {'q': title + " " + artist.lower()} try: response = requests.get(search_url, data=data, headers=self.headers) From 46143d97629c48777e82dc147ecfff08f7c5e683 Mon Sep 17 00:00:00 2001 From: stlutz Date: Sat, 16 May 2020 16:28:17 +0200 Subject: [PATCH 2/5] Remove unnecessary intermediate web request to genius.com when fetching lyrics. The search results already include the correct song page url, making it superfluous to do another request via the /song api just to get it. --- beetsplug/lyrics.py | 20 ++++---------------- test/test_lyrics.py | 8 ++++---- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 3a56cc738..71969cab3 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -359,14 +359,9 @@ class Genius(Backend): 'User-Agent': USER_AGENT, } - def lyrics_from_song_api_path(self, song_api_path): - song_url = self.base_url + song_api_path - response = requests.get(song_url, headers=self.headers) - json = response.json() - path = json["response"]["song"]["path"] - + def lyrics_from_song_page(self, page_url): # Gotta go regular html scraping... come on Genius. - page_url = "https://genius.com" + path + self._log.debug(u'fetching lyrics from: {0}', page_url) try: page = requests.get(page_url) except requests.RequestException as exc: @@ -404,7 +399,6 @@ class Genius(Backend): self._log.debug(u'Genius API request returned invalid JSON') return None - song_info = None for hit in json["response"]["hits"]: # Genius uses zero-width characters to denote lowercase # artist names. @@ -412,15 +406,9 @@ class Genius(Backend): strip(u'\u200b').lower() if hit_artist == artist.lower(): - song_info = hit - break + return self.lyrics_from_song_page(hit["result"]["url"]) - if song_info: - self._log.debug(u'fetched: {0}', song_info["result"]["url"]) - song_api_path = song_info["result"]["api_path"] - return self.lyrics_from_song_api_path(song_api_path) - else: - self._log.debug(u'genius: no matching artist') + self._log.debug(u'genius: no matching artist') class LyricsWiki(SymbolsReplaced): diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 81d62fea9..d0507c16f 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -456,7 +456,7 @@ class LyricsGeniusBaseTest(unittest.TestCase): self.skipTest("Python's built-in HTML parser is not good enough") -class LyricsGeniusScrapTest(LyricsGeniusBaseTest): +class LyricsGeniusScrapeTest(LyricsGeniusBaseTest): """Checks that Genius backend works as intended. """ @@ -469,12 +469,12 @@ class LyricsGeniusScrapTest(LyricsGeniusBaseTest): @patch.object(requests, 'get', GeniusMockGet()) def test_no_lyrics_div(self): - """Ensure that `lyrics_from_song_api_path` doesn't crash when the html - for a Genius page contain
+ """Ensure that `lyrics_from_song_page` doesn't crash when the html + for a Genius page doesn't contain
""" # https://github.com/beetbox/beets/issues/3535 # expected return value None - self.assertEqual(genius.lyrics_from_song_api_path('/nolyric'), + self.assertEqual(genius.lyrics_from_song_page('https://genius.com/sample'), None) From 15402f6aa7bceceef8f3909b5e5939342bd549ca Mon Sep 17 00:00:00 2001 From: stlutz Date: Sat, 16 May 2020 17:15:45 +0200 Subject: [PATCH 3/5] Add alternative scraping algorithm to deal with Genius's new song page html layout. --- beetsplug/lyrics.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 71969cab3..27a29e149 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -373,13 +373,34 @@ class Genius(Backend): # Remove script tags that they put in the middle of the lyrics. [h.extract() for h in html('script')] - # At least Genius is nice and has a tag called 'lyrics'! - # Updated css where the lyrics are based in HTML. + # Most of the time, the page contains a div with class="lyrics" where + # all of the lyrics can be found already correctly formatted + # Sometimes, though, it packages the lyrics into separate divs, most + # likely for easier ad placement lyrics_div = html.find("div", class_="lyrics") - if lyrics_div is None: - self._log.debug(u'Genius lyrics for {0} not found', - page_url) - return None + if not lyrics_div: + self._log.debug(u'Received unusual song page html') + verse_div = html.find("div", + class_=re.compile("Lyrics__Container")) + if not verse_div: + with open('instrumental.html', 'w') as text_file: + text_file.write(str(html)) + if html.find("div", + class_=re.compile("LyricsPlaceholder__Message"), + string="This song is an instrumental"): + self._log.debug('Detected instrumental') + return "[Instrumental]" + else: + self._log.debug("Couldn't scrape page using known layouts") + return None + + lyrics_div = verse_div.parent + for br in lyrics_div.find_all("br"): + br.replace_with("\n") + ads = lyrics_div.find_all("div", + class_=re.compile("InreadAd__Container")) + for ad in ads: + ad.replace_with("\n") return lyrics_div.get_text() From 5d306d6fd9b48b51cbf25934cd5ded90a83b1b02 Mon Sep 17 00:00:00 2001 From: stlutz Date: Sat, 16 May 2020 17:48:56 +0200 Subject: [PATCH 4/5] Remove debugging statement and conform to line length. --- beetsplug/lyrics.py | 2 -- test/test_lyrics.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 27a29e149..71e1d4ef3 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -383,8 +383,6 @@ class Genius(Backend): verse_div = html.find("div", class_=re.compile("Lyrics__Container")) if not verse_div: - with open('instrumental.html', 'w') as text_file: - text_file.write(str(html)) if html.find("div", class_=re.compile("LyricsPlaceholder__Message"), string="This song is an instrumental"): diff --git a/test/test_lyrics.py b/test/test_lyrics.py index d0507c16f..d31116284 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -474,7 +474,8 @@ class LyricsGeniusScrapeTest(LyricsGeniusBaseTest): """ # https://github.com/beetbox/beets/issues/3535 # expected return value None - self.assertEqual(genius.lyrics_from_song_page('https://genius.com/sample'), + song_url = 'https://genius.com/sample' + self.assertEqual(genius.lyrics_from_song_page(song_url), None) From 5298e1a97656ab5c8f33f95a5a537c645e35cca1 Mon Sep 17 00:00:00 2001 From: stlutz Date: Sun, 17 May 2020 18:42:23 +0200 Subject: [PATCH 5/5] Add changelog entry. --- docs/changelog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index f00380a2c..bd1a77973 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -202,6 +202,7 @@ Fixes: The log message has also been rewritten for to improve clarity. Thanks to :user:`autrimpo`. :bug:`3533` +* :doc:`/plugins/lyrics`: Reduce the failure rate of the Genius backend. For plugin developers: