Merge pull request #3594 from stlutz/master

Scrape new Genius song page html
This commit is contained in:
Adrian Sampson 2020-05-17 14:44:57 -04:00
commit 485abb08c0
3 changed files with 36 additions and 27 deletions

View file

@ -359,14 +359,9 @@ class Genius(Backend):
'User-Agent': USER_AGENT,
}
def lyrics_from_song_api_path(self, song_api_path):
song_url = self.base_url + song_api_path
response = requests.get(song_url, headers=self.headers)
json = response.json()
path = json["response"]["song"]["path"]
def lyrics_from_song_page(self, page_url):
# Gotta go regular html scraping... come on Genius.
page_url = "https://genius.com" + path
self._log.debug(u'fetching lyrics from: {0}', page_url)
try:
page = requests.get(page_url)
except requests.RequestException as exc:
@ -378,19 +373,38 @@ class Genius(Backend):
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in html('script')]
# At least Genius is nice and has a tag called 'lyrics'!
# Updated css where the lyrics are based in HTML.
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
# Sometimes, though, it packages the lyrics into separate divs, most
# likely for easier ad placement
lyrics_div = html.find("div", class_="lyrics")
if lyrics_div is None:
self._log.debug(u'Genius lyrics for {0} not found',
page_url)
return None
if not lyrics_div:
self._log.debug(u'Received unusual song page html')
verse_div = html.find("div",
class_=re.compile("Lyrics__Container"))
if not verse_div:
if html.find("div",
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental"):
self._log.debug('Detected instrumental')
return "[Instrumental]"
else:
self._log.debug("Couldn't scrape page using known layouts")
return None
lyrics_div = verse_div.parent
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
ads = lyrics_div.find_all("div",
class_=re.compile("InreadAd__Container"))
for ad in ads:
ad.replace_with("\n")
return lyrics_div.get_text()
def fetch(self, artist, title):
search_url = self.base_url + "/search"
data = {'q': title}
data = {'q': title + " " + artist.lower()}
try:
response = requests.get(search_url, data=data,
headers=self.headers)
@ -404,7 +418,6 @@ class Genius(Backend):
self._log.debug(u'Genius API request returned invalid JSON')
return None
song_info = None
for hit in json["response"]["hits"]:
# Genius uses zero-width characters to denote lowercase
# artist names.
@ -412,15 +425,9 @@ class Genius(Backend):
strip(u'\u200b').lower()
if hit_artist == artist.lower():
song_info = hit
break
return self.lyrics_from_song_page(hit["result"]["url"])
if song_info:
self._log.debug(u'fetched: {0}', song_info["result"]["url"])
song_api_path = song_info["result"]["api_path"]
return self.lyrics_from_song_api_path(song_api_path)
else:
self._log.debug(u'genius: no matching artist')
self._log.debug(u'genius: no matching artist')
class LyricsWiki(SymbolsReplaced):

View file

@ -202,6 +202,7 @@ Fixes:
The log message has also been rewritten for to improve clarity.
Thanks to :user:`autrimpo`.
:bug:`3533`
* :doc:`/plugins/lyrics`: Reduce the failure rate of the Genius backend.
For plugin developers:

View file

@ -456,7 +456,7 @@ class LyricsGeniusBaseTest(unittest.TestCase):
self.skipTest("Python's built-in HTML parser is not good enough")
class LyricsGeniusScrapTest(LyricsGeniusBaseTest):
class LyricsGeniusScrapeTest(LyricsGeniusBaseTest):
"""Checks that Genius backend works as intended.
"""
@ -469,12 +469,13 @@ class LyricsGeniusScrapTest(LyricsGeniusBaseTest):
@patch.object(requests, 'get', GeniusMockGet())
def test_no_lyrics_div(self):
"""Ensure that `lyrics_from_song_api_path` doesn't crash when the html
for a Genius page contain <div class="lyrics"></div>
"""Ensure that `lyrics_from_song_page` doesn't crash when the html
for a Genius page doesn't contain <div class="lyrics"></div>
"""
# https://github.com/beetbox/beets/issues/3535
# expected return value None
self.assertEqual(genius.lyrics_from_song_api_path('/nolyric'),
song_url = 'https://genius.com/sample'
self.assertEqual(genius.lyrics_from_song_page(song_url),
None)