diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 20e1c8858..71e1d4ef3 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -359,14 +359,9 @@ class Genius(Backend): 'User-Agent': USER_AGENT, } - def lyrics_from_song_api_path(self, song_api_path): - song_url = self.base_url + song_api_path - response = requests.get(song_url, headers=self.headers) - json = response.json() - path = json["response"]["song"]["path"] - + def lyrics_from_song_page(self, page_url): # Gotta go regular html scraping... come on Genius. - page_url = "https://genius.com" + path + self._log.debug(u'fetching lyrics from: {0}', page_url) try: page = requests.get(page_url) except requests.RequestException as exc: @@ -378,19 +373,38 @@ class Genius(Backend): # Remove script tags that they put in the middle of the lyrics. [h.extract() for h in html('script')] - # At least Genius is nice and has a tag called 'lyrics'! - # Updated css where the lyrics are based in HTML. + # Most of the time, the page contains a div with class="lyrics" where + # all of the lyrics can be found already correctly formatted + # Sometimes, though, it packages the lyrics into separate divs, most + # likely for easier ad placement lyrics_div = html.find("div", class_="lyrics") - if lyrics_div is None: - self._log.debug(u'Genius lyrics for {0} not found', - page_url) - return None + if not lyrics_div: + self._log.debug(u'Received unusual song page html') + verse_div = html.find("div", + class_=re.compile("Lyrics__Container")) + if not verse_div: + if html.find("div", + class_=re.compile("LyricsPlaceholder__Message"), + string="This song is an instrumental"): + self._log.debug('Detected instrumental') + return "[Instrumental]" + else: + self._log.debug("Couldn't scrape page using known layouts") + return None + + lyrics_div = verse_div.parent + for br in lyrics_div.find_all("br"): + br.replace_with("\n") + ads = lyrics_div.find_all("div", + class_=re.compile("InreadAd__Container")) + for ad in ads: + ad.replace_with("\n") return lyrics_div.get_text() def fetch(self, artist, title): search_url = self.base_url + "/search" - data = {'q': title} + data = {'q': title + " " + artist.lower()} try: response = requests.get(search_url, data=data, headers=self.headers) @@ -404,7 +418,6 @@ class Genius(Backend): self._log.debug(u'Genius API request returned invalid JSON') return None - song_info = None for hit in json["response"]["hits"]: # Genius uses zero-width characters to denote lowercase # artist names. @@ -412,15 +425,9 @@ class Genius(Backend): strip(u'\u200b').lower() if hit_artist == artist.lower(): - song_info = hit - break + return self.lyrics_from_song_page(hit["result"]["url"]) - if song_info: - self._log.debug(u'fetched: {0}', song_info["result"]["url"]) - song_api_path = song_info["result"]["api_path"] - return self.lyrics_from_song_api_path(song_api_path) - else: - self._log.debug(u'genius: no matching artist') + self._log.debug(u'genius: no matching artist') class LyricsWiki(SymbolsReplaced): diff --git a/docs/changelog.rst b/docs/changelog.rst index f00380a2c..bd1a77973 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -202,6 +202,7 @@ Fixes: The log message has also been rewritten for to improve clarity. Thanks to :user:`autrimpo`. :bug:`3533` +* :doc:`/plugins/lyrics`: Reduce the failure rate of the Genius backend. For plugin developers: diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 81d62fea9..d31116284 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -456,7 +456,7 @@ class LyricsGeniusBaseTest(unittest.TestCase): self.skipTest("Python's built-in HTML parser is not good enough") -class LyricsGeniusScrapTest(LyricsGeniusBaseTest): +class LyricsGeniusScrapeTest(LyricsGeniusBaseTest): """Checks that Genius backend works as intended. """ @@ -469,12 +469,13 @@ class LyricsGeniusScrapTest(LyricsGeniusBaseTest): @patch.object(requests, 'get', GeniusMockGet()) def test_no_lyrics_div(self): - """Ensure that `lyrics_from_song_api_path` doesn't crash when the html - for a Genius page contain
+ """Ensure that `lyrics_from_song_page` doesn't crash when the html + for a Genius page doesn't contain """ # https://github.com/beetbox/beets/issues/3535 # expected return value None - self.assertEqual(genius.lyrics_from_song_api_path('/nolyric'), + song_url = 'https://genius.com/sample' + self.assertEqual(genius.lyrics_from_song_page(song_url), None)