This commit is contained in:
Serene 2024-07-11 15:20:01 +10:00 committed by GitHub
commit 61e885c85a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 1196 additions and 31 deletions

View file

@ -448,40 +448,51 @@ class Genius(Backend):
# Sometimes, though, it packages the lyrics into separate divs, most
# likely for easier ad placement
lyrics_div = soup.find("div", {"data-lyrics-container": True})
if lyrics_div:
self.replace_br(lyrics_div)
if not lyrics_div:
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
if not lyrics_divs:
self._log.debug("Received unusual song page html")
verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
if not verse_div:
if soup.find(
"div",
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental",
):
self._log.debug("Detected instrumental")
return "[Instrumental]"
else:
self._log.debug("Couldn't scrape page using known layouts")
return None
lyrics_div = verse_div.parent
return self._try_extracting_lyrics_from_non_data_lyrics_container(
soup
)
lyrics = ""
for lyrics_div in lyrics_divs:
self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
return lyrics
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
)
for ad in ads:
ad.replace_with("\n")
def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
    """Extract lyrics from a div without attribute data-lyrics-container.

    This is the second most common layout on genius.com.

    :param soup: parsed song page exposing ``find`` (presumably a
        BeautifulSoup document -- confirm against the caller).
    :returns: the text of the parent of the first ``Lyrics__Container``
        div, with ad containers replaced by newlines and footers removed;
        ``"[Instrumental]"`` when no such div exists but the instrumental
        placeholder message is present; ``None`` when neither is found.
    """
    verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
    if not verse_div:
        # No lyrics container at all: either the page explicitly marks
        # the song as an instrumental, or the layout is unknown.
        if soup.find(
            "div",
            class_=re.compile("LyricsPlaceholder__Message"),
            string="This song is an instrumental",
        ):
            self._log.debug("Detected instrumental")
            return "[Instrumental]"
        self._log.debug("Couldn't scrape page using known layouts")
        return None

    # Work on the parent of the first matching container; it must be bound
    # before any of the clean-up passes below touch it.
    lyrics_div = verse_div.parent
    self.replace_br(lyrics_div)

    # Ad containers become a newline so the text on either side of an ad
    # is not glued together.
    ads = lyrics_div.find_all(
        "div", class_=re.compile("InreadAd__Container")
    )
    for ad in ads:
        ad.replace_with("\n")

    # Footer divs are dropped from the output entirely.
    footers = lyrics_div.find_all(
        "div", class_=re.compile("Lyrics__Footer")
    )
    for footer in footers:
        footer.replace_with("")

    return lyrics_div.get_text()

View file

@ -17,6 +17,7 @@ Bug fixes:
* Improved naming of temporary files by separating the random part with the file extension.
* Fixed the ``auto`` value for the :ref:`reflink` config option.
* Fixed lyrics plugin only getting part of the lyrics from ``Genius.com`` :bug:`4815`
For packagers:

View file

@ -492,7 +492,17 @@ class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
"""Ensure we are able to scrape a page with lyrics"""
url = "https://genius.com/Ttng-chinchilla-lyrics"
mock = MockFetchUrl()
self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
lyrics = genius._scrape_lyrics_from_html(mock(url))
self.assertIsNotNone(lyrics)
self.assertEqual(lyrics.count("\n"), 28)
def test_good_lyrics_multiple_divs(self):
    """Check scraping of the page this test uses as its multiple-divs case."""
    song_url = "https://genius.com/2pac-all-eyez-on-me-lyrics"
    fetch_page = MockFetchUrl()
    page_html = fetch_page(song_url)
    scraped = genius._scrape_lyrics_from_html(page_html)
    self.assertIsNotNone(scraped)
    # The scraped text for this fixture is expected to hold 133 newlines.
    newline_total = scraped.count("\n")
    self.assertEqual(newline_total, 133)
# TODO: find an example of a lyrics page with multiple divs and test it

File diff suppressed because one or more lines are too long