Fix lyrics plugin fetching only part of the lyrics from Genius.com (bug #4815) (#5352)

2026-02-28 10:15:23 +01:00 · 2024-07-11 15:20:01 +10:00 · 2024-07-11 15:20:01 +10:00 · 61e885c85a
commit 61e885c85a
parent 9122722283 79449b0851
4 changed files with 1196 additions and 31 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -448,40 +448,51 @@ class Genius(Backend):
        # Sometimes, though, it packages the lyrics into separate divs, most
        # likely for easier ad placement

-        lyrics_div = soup.find("div", {"data-lyrics-container": True})
-
-        if lyrics_div:
-            self.replace_br(lyrics_div)
-
-        if not lyrics_div:
+        lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
+        if not lyrics_divs:
            self._log.debug("Received unusual song page html")
-            verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
-            if not verse_div:
-                if soup.find(
-                    "div",
-                    class_=re.compile("LyricsPlaceholder__Message"),
-                    string="This song is an instrumental",
-                ):
-                    self._log.debug("Detected instrumental")
-                    return "[Instrumental]"
-                else:
-                    self._log.debug("Couldn't scrape page using known layouts")
-                    return None
-
-            lyrics_div = verse_div.parent
+            return self._try_extracting_lyrics_from_non_data_lyrics_container(
+                soup
+            )
+        lyrics_parts = []
+        for lyrics_div in lyrics_divs:
            self.replace_br(lyrics_div)
+            lyrics_parts.append(lyrics_div.get_text())
+        # rstrip, unlike indexing lyrics[-1], is safe when all divs are empty
+        lyrics = "\n\n".join(lyrics_parts)
+        return lyrics.rstrip("\n")

-            ads = lyrics_div.find_all(
-                "div", class_=re.compile("InreadAd__Container")
-            )
-            for ad in ads:
-                ad.replace_with("\n")
+    def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
+        """Extract lyrics from a div without attribute data-lyrics-container
+        This is the second most common layout on genius.com
+        """
+        verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
+        if not verse_div:
+            if soup.find(
+                "div",
+                class_=re.compile("LyricsPlaceholder__Message"),
+                string="This song is an instrumental",
+            ):
+                self._log.debug("Detected instrumental")
+                return "[Instrumental]"
+            else:
+                self._log.debug("Couldn't scrape page using known layouts")
+                return None

-            footers = lyrics_div.find_all(
-                "div", class_=re.compile("Lyrics__Footer")
-            )
-            for footer in footers:
-                footer.replace_with("")
+        lyrics_div = verse_div.parent
+        self.replace_br(lyrics_div)
+
+        ads = lyrics_div.find_all(
+            "div", class_=re.compile("InreadAd__Container")
+        )
+        for ad in ads:
+            ad.replace_with("\n")
+
+        footers = lyrics_div.find_all(
+            "div", class_=re.compile("Lyrics__Footer")
+        )
+        for footer in footers:
+            footer.replace_with("")
        return lyrics_div.get_text()


--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -17,6 +17,7 @@ Bug fixes:

 * Improved naming of temporary files by separating the random part with the file extension.
 * Fixed the ``auto`` value for the :ref:`reflink` config option.
+* Fixed the lyrics plugin fetching only part of the lyrics from ``Genius.com``. :bug:`4815`

 For packagers:

--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -492,7 +492,17 @@ class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
        """Ensure we are able to scrape a page with lyrics"""
        url = "https://genius.com/Ttng-chinchilla-lyrics"
        mock = MockFetchUrl()
-        self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
+        lyrics = genius._scrape_lyrics_from_html(mock(url))
+        self.assertIsNotNone(lyrics)
+        self.assertEqual(lyrics.count("\n"), 28)
+
+    def test_good_lyrics_multiple_divs(self):
+        """Ensure we are able to scrape a page with lyrics in multiple divs"""
+        url = "https://genius.com/2pac-all-eyez-on-me-lyrics"
+        mock = MockFetchUrl()
+        lyrics = genius._scrape_lyrics_from_html(mock(url))
+        self.assertIsNotNone(lyrics)
+        self.assertEqual(lyrics.count("\n"), 133)

    # TODO: find an example of a lyrics page with multiple divs and test it

--- a/test/rsrc/lyrics/geniuscom/2pacalleyezonmelyrics.txt
+++ b/test/rsrc/lyrics/geniuscom/2pacalleyezonmelyrics.txt