mirror of
https://github.com/beetbox/beets.git
synced 2026-01-08 08:56:56 +01:00
commit
61e885c85a
4 changed files with 1196 additions and 31 deletions
|
|
@ -448,40 +448,51 @@ class Genius(Backend):
|
|||
# Sometimes, though, it packages the lyrics into separate divs, most
|
||||
# likely for easier ad placement
|
||||
|
||||
lyrics_div = soup.find("div", {"data-lyrics-container": True})
|
||||
|
||||
if lyrics_div:
|
||||
self.replace_br(lyrics_div)
|
||||
|
||||
if not lyrics_div:
|
||||
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
|
||||
if not lyrics_divs:
|
||||
self._log.debug("Received unusual song page html")
|
||||
verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
|
||||
if not verse_div:
|
||||
if soup.find(
|
||||
"div",
|
||||
class_=re.compile("LyricsPlaceholder__Message"),
|
||||
string="This song is an instrumental",
|
||||
):
|
||||
self._log.debug("Detected instrumental")
|
||||
return "[Instrumental]"
|
||||
else:
|
||||
self._log.debug("Couldn't scrape page using known layouts")
|
||||
return None
|
||||
|
||||
lyrics_div = verse_div.parent
|
||||
return self._try_extracting_lyrics_from_non_data_lyrics_container(
|
||||
soup
|
||||
)
|
||||
lyrics = ""
|
||||
for lyrics_div in lyrics_divs:
|
||||
self.replace_br(lyrics_div)
|
||||
lyrics += lyrics_div.get_text() + "\n\n"
|
||||
while lyrics[-1] == "\n":
|
||||
lyrics = lyrics[:-1]
|
||||
return lyrics
|
||||
|
||||
ads = lyrics_div.find_all(
|
||||
"div", class_=re.compile("InreadAd__Container")
|
||||
)
|
||||
for ad in ads:
|
||||
ad.replace_with("\n")
|
||||
def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
|
||||
"""Extract lyrics from a div without attribute data-lyrics-container
|
||||
This is the second most common layout on genius.com
|
||||
"""
|
||||
verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
|
||||
if not verse_div:
|
||||
if soup.find(
|
||||
"div",
|
||||
class_=re.compile("LyricsPlaceholder__Message"),
|
||||
string="This song is an instrumental",
|
||||
):
|
||||
self._log.debug("Detected instrumental")
|
||||
return "[Instrumental]"
|
||||
else:
|
||||
self._log.debug("Couldn't scrape page using known layouts")
|
||||
return None
|
||||
|
||||
footers = lyrics_div.find_all(
|
||||
"div", class_=re.compile("Lyrics__Footer")
|
||||
)
|
||||
for footer in footers:
|
||||
footer.replace_with("")
|
||||
lyrics_div = verse_div.parent
|
||||
self.replace_br(lyrics_div)
|
||||
|
||||
ads = lyrics_div.find_all(
|
||||
"div", class_=re.compile("InreadAd__Container")
|
||||
)
|
||||
for ad in ads:
|
||||
ad.replace_with("\n")
|
||||
|
||||
footers = lyrics_div.find_all(
|
||||
"div", class_=re.compile("Lyrics__Footer")
|
||||
)
|
||||
for footer in footers:
|
||||
footer.replace_with("")
|
||||
return lyrics_div.get_text()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ Bug fixes:
|
|||
|
||||
* Improved naming of temporary files by separating the random part from the file extension.
|
||||
* Fixed the ``auto`` value for the :ref:`reflink` config option.
|
||||
* Fixed lyrics plugin only getting part of the lyrics from ``Genius.com`` :bug:`4815`
|
||||
|
||||
For packagers:
|
||||
|
||||
|
|
|
|||
|
|
@ -492,7 +492,17 @@ class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
|
|||
"""Ensure we are able to scrape a page with lyrics"""
|
||||
url = "https://genius.com/Ttng-chinchilla-lyrics"
|
||||
mock = MockFetchUrl()
|
||||
self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
|
||||
lyrics = genius._scrape_lyrics_from_html(mock(url))
|
||||
self.assertIsNotNone(lyrics)
|
||||
self.assertEqual(lyrics.count("\n"), 28)
|
||||
|
||||
def test_good_lyrics_multiple_divs(self):
|
||||
"""Ensure we are able to scrape a page with lyrics"""
|
||||
url = "https://genius.com/2pac-all-eyez-on-me-lyrics"
|
||||
mock = MockFetchUrl()
|
||||
lyrics = genius._scrape_lyrics_from_html(mock(url))
|
||||
self.assertIsNotNone(lyrics)
|
||||
self.assertEqual(lyrics.count("\n"), 133)
|
||||
|
||||
# TODO: find an example of a lyrics page with multiple divs and test it
|
||||
|
||||
|
|
|
|||
1143
test/rsrc/lyrics/geniuscom/2pacalleyezonmelyrics.txt
Normal file
1143
test/rsrc/lyrics/geniuscom/2pacalleyezonmelyrics.txt
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue