mirror of
https://github.com/beetbox/beets.git
synced 2026-01-04 15:03:22 +01:00
Merge pull request #3594 from stlutz/master
Scrape new Genius song page html
This commit is contained in:
commit
485abb08c0
3 changed files with 36 additions and 27 deletions
|
|
@ -359,14 +359,9 @@ class Genius(Backend):
|
|||
'User-Agent': USER_AGENT,
|
||||
}
|
||||
|
||||
def lyrics_from_song_api_path(self, song_api_path):
|
||||
song_url = self.base_url + song_api_path
|
||||
response = requests.get(song_url, headers=self.headers)
|
||||
json = response.json()
|
||||
path = json["response"]["song"]["path"]
|
||||
|
||||
def lyrics_from_song_page(self, page_url):
|
||||
# Gotta go regular html scraping... come on Genius.
|
||||
page_url = "https://genius.com" + path
|
||||
self._log.debug(u'fetching lyrics from: {0}', page_url)
|
||||
try:
|
||||
page = requests.get(page_url)
|
||||
except requests.RequestException as exc:
|
||||
|
|
@ -378,19 +373,38 @@ class Genius(Backend):
|
|||
# Remove script tags that they put in the middle of the lyrics.
|
||||
[h.extract() for h in html('script')]
|
||||
|
||||
# At least Genius is nice and has a tag called 'lyrics'!
|
||||
# Updated css where the lyrics are based in HTML.
|
||||
# Most of the time, the page contains a div with class="lyrics" where
|
||||
# all of the lyrics can be found already correctly formatted
|
||||
# Sometimes, though, it packages the lyrics into separate divs, most
|
||||
# likely for easier ad placement
|
||||
lyrics_div = html.find("div", class_="lyrics")
|
||||
if lyrics_div is None:
|
||||
self._log.debug(u'Genius lyrics for {0} not found',
|
||||
page_url)
|
||||
return None
|
||||
if not lyrics_div:
|
||||
self._log.debug(u'Received unusual song page html')
|
||||
verse_div = html.find("div",
|
||||
class_=re.compile("Lyrics__Container"))
|
||||
if not verse_div:
|
||||
if html.find("div",
|
||||
class_=re.compile("LyricsPlaceholder__Message"),
|
||||
string="This song is an instrumental"):
|
||||
self._log.debug('Detected instrumental')
|
||||
return "[Instrumental]"
|
||||
else:
|
||||
self._log.debug("Couldn't scrape page using known layouts")
|
||||
return None
|
||||
|
||||
lyrics_div = verse_div.parent
|
||||
for br in lyrics_div.find_all("br"):
|
||||
br.replace_with("\n")
|
||||
ads = lyrics_div.find_all("div",
|
||||
class_=re.compile("InreadAd__Container"))
|
||||
for ad in ads:
|
||||
ad.replace_with("\n")
|
||||
|
||||
return lyrics_div.get_text()
|
||||
|
||||
def fetch(self, artist, title):
|
||||
search_url = self.base_url + "/search"
|
||||
data = {'q': title}
|
||||
data = {'q': title + " " + artist.lower()}
|
||||
try:
|
||||
response = requests.get(search_url, data=data,
|
||||
headers=self.headers)
|
||||
|
|
@ -404,7 +418,6 @@ class Genius(Backend):
|
|||
self._log.debug(u'Genius API request returned invalid JSON')
|
||||
return None
|
||||
|
||||
song_info = None
|
||||
for hit in json["response"]["hits"]:
|
||||
# Genius uses zero-width characters to denote lowercase
|
||||
# artist names.
|
||||
|
|
@ -412,15 +425,9 @@ class Genius(Backend):
|
|||
strip(u'\u200b').lower()
|
||||
|
||||
if hit_artist == artist.lower():
|
||||
song_info = hit
|
||||
break
|
||||
return self.lyrics_from_song_page(hit["result"]["url"])
|
||||
|
||||
if song_info:
|
||||
self._log.debug(u'fetched: {0}', song_info["result"]["url"])
|
||||
song_api_path = song_info["result"]["api_path"]
|
||||
return self.lyrics_from_song_api_path(song_api_path)
|
||||
else:
|
||||
self._log.debug(u'genius: no matching artist')
|
||||
self._log.debug(u'genius: no matching artist')
|
||||
|
||||
|
||||
class LyricsWiki(SymbolsReplaced):
|
||||
|
|
|
|||
|
|
@ -202,6 +202,7 @@ Fixes:
|
|||
The log message has also been rewritten for to improve clarity.
|
||||
Thanks to :user:`autrimpo`.
|
||||
:bug:`3533`
|
||||
* :doc:`/plugins/lyrics`: Reduce the failure rate of the Genius backend.
|
||||
|
||||
For plugin developers:
|
||||
|
||||
|
|
|
|||
|
|
@ -456,7 +456,7 @@ class LyricsGeniusBaseTest(unittest.TestCase):
|
|||
self.skipTest("Python's built-in HTML parser is not good enough")
|
||||
|
||||
|
||||
class LyricsGeniusScrapTest(LyricsGeniusBaseTest):
|
||||
class LyricsGeniusScrapeTest(LyricsGeniusBaseTest):
|
||||
|
||||
"""Checks that Genius backend works as intended.
|
||||
"""
|
||||
|
|
@ -469,12 +469,13 @@ class LyricsGeniusScrapTest(LyricsGeniusBaseTest):
|
|||
|
||||
@patch.object(requests, 'get', GeniusMockGet())
|
||||
def test_no_lyrics_div(self):
|
||||
"""Ensure that `lyrics_from_song_api_path` doesn't crash when the html
|
||||
for a Genius page contain <div class="lyrics"></div>
|
||||
"""Ensure that `lyrics_from_song_page` doesn't crash when the html
|
||||
for a Genius page doesn't contain <div class="lyrics"></div>
|
||||
"""
|
||||
# https://github.com/beetbox/beets/issues/3535
|
||||
# expected return value None
|
||||
self.assertEqual(genius.lyrics_from_song_api_path('/nolyric'),
|
||||
song_url = 'https://genius.com/sample'
|
||||
self.assertEqual(genius.lyrics_from_song_page(song_url),
|
||||
None)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue