From 1b35a5df0db707150e4a790ec2e3dfc8f4f2357b Mon Sep 17 00:00:00 2001
From: Lucas Magno
Date: Sun, 8 Oct 2017 09:13:51 -0300
Subject: [PATCH 1/2] Fetch lyrics from Genius through scraper

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
 * The line structure of this series was rebuilt from a copy in which all
   newlines had been collapsed. Hunk line counts match the @@ headers, but
   the indentation of the pre-PEP8 lines added by this patch (and removed
   again by patch 2/2) is an approximation.
 * Genius.fetch() now passes the search term with `params=` instead of
   `data=`, so `q` is sent in the URL query string rather than in the body
   of a GET request. The `index` blob ids below are those of the original
   series and do not reflect this change.
 * The class docstring no longer says "via genius-api": this patch removes
   the only call to genius-api.com.
 * NOTE(review): unlike the removed search_genius()/get_lyrics(), the new
   code does not catch requests.RequestException or invalid JSON, and
   html.find("div", class_="lyrics") returning None would raise
   AttributeError. Confirm that callers tolerate this.

 beetsplug/lyrics.py | 113 +++++++++++++-------------------------------
 1 file changed, 33 insertions(+), 80 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d7fca27c2..025a1374c 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -335,8 +335,11 @@ class MusiXmatch(SymbolsReplaced):
 
 
 class Genius(Backend):
-    """Fetch lyrics from Genius via genius-api."""
-
+    """Fetch lyrics from Genius via its API and page scraping.
+    Simply adapted from https://bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/"""
+    
+    base_url = "https://api.genius.com"
+    
     def __init__(self, config, log):
         super(Genius, self).__init__(config, log)
         self.api_key = config['genius_api_key'].as_str()
@@ -345,85 +348,35 @@ class Genius(Backend):
             'User-Agent': USER_AGENT,
         }
 
-    def search_genius(self, artist, title):
-        query = u"%s %s" % (artist, title)
-        url = u'https://api.genius.com/search?q=%s' \
-            % (urllib.parse.quote(query.encode('utf-8')))
-
-        self._log.debug(u'genius: requesting search {}', url)
-        try:
-            req = requests.get(
-                url,
-                headers=self.headers,
-                allow_redirects=True
-            )
-            req.raise_for_status()
-        except requests.RequestException as exc:
-            self._log.debug(u'genius: request error: {}', exc)
-            return None
-
-        try:
-            return req.json()
-        except ValueError:
-            self._log.debug(u'genius: invalid response: {}', req.text)
-            return None
-
-    def get_lyrics(self, link):
-        url = u'http://genius-api.com/api/lyricsInfo'
-
-        self._log.debug(u'genius: requesting lyrics for link {}', link)
-        try:
-            req = requests.post(
-                url,
-                data={'link': link},
-                headers=self.headers,
-                allow_redirects=True
-            )
-            req.raise_for_status()
-        except requests.RequestException as exc:
-            self._log.debug(u'genius: request error: {}', exc)
-            return None
-
-        try:
-            return req.json()
-        except ValueError:
-            self._log.debug(u'genius: invalid response: {}', req.text)
-            return None
-
-    def build_lyric_string(self, lyrics):
-        if 'lyrics' not in lyrics:
-            return
-        sections = lyrics['lyrics']['sections']
-
-        lyrics_list = []
-        for section in sections:
-            lyrics_list.append(section['name'])
-            lyrics_list.append('\n')
-            for verse in section['verses']:
-                if 'content' in verse:
-                    lyrics_list.append(verse['content'])
-
-        return ''.join(lyrics_list)
+    def lyrics_from_song_api_path(self, song_api_path):
+      song_url = self.base_url + song_api_path
+      response = requests.get(song_url, headers=self.headers)
+      json = response.json()
+      path = json["response"]["song"]["path"]
+      #gotta go regular html scraping... come on Genius
+      page_url = "https://genius.com" + path
+      page = requests.get(page_url)
+      html = BeautifulSoup(page.text, "html.parser")
+      #remove script tags that they put in the middle of the lyrics
+      [h.extract() for h in html('script')]
+      #at least Genius is nice and has a tag called 'lyrics'!
+      lyrics = html.find("div", class_="lyrics").get_text() #updated css where the lyrics are based in HTML
+      return lyrics
 
     def fetch(self, artist, title):
-        search_data = self.search_genius(artist, title)
-        if not search_data:
-            return
-
-        if not search_data['meta']['status'] == 200:
-            return
-        else:
-            records = search_data['response']['hits']
-            if not records:
-                return
-
-            record_url = records[0]['result']['url']
-            lyric_data = self.get_lyrics(record_url)
-            if not lyric_data:
-                return
-            lyrics = self.build_lyric_string(lyric_data)
-
-            return lyrics
+      search_url = self.base_url + "/search"
+      data = {'q': title}
+      response = requests.get(search_url, params=data, headers=self.headers)
+      json = response.json()
+      
+      song_info = None
+      for hit in json["response"]["hits"]:
+        if hit["result"]["primary_artist"]["name"] == artist:
+          song_info = hit
+          break
+      if song_info:
+        song_api_path = song_info["result"]["api_path"]
+        return self.lyrics_from_song_api_path(song_api_path)
 
 
 class LyricsWiki(SymbolsReplaced):
@@ -638,7 +591,7 @@ class Google(Backend):
 
 
 class LyricsPlugin(plugins.BeetsPlugin):
-    SOURCES = ['google', 'lyricwiki', 'musixmatch']
+    SOURCES = ['google', 'lyricwiki', 'musixmatch', 'genius']
     SOURCE_BACKENDS = {
         'google': Google,
         'lyricwiki': LyricsWiki,

From fc2d379fb529000f825017e635a0baf0e86a1242 Mon Sep 17 00:00:00 2001
From: Lucas Magno
Date: Mon, 9 Oct 2017 06:22:42 -0300
Subject: [PATCH 2/2] Comply with PEP8

---
Review notes (text between the three-dash line and the diff is not part of
the commit message):
 * Formatting-only patch. The removed lines mirror the lines added by
   patch 1/2, including the `params=` argument in Genius.fetch() and the
   approximated pre-PEP8 indentation, so the series applies in order.

 beetsplug/lyrics.py | 67 +++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 025a1374c..1987b67e7 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -336,10 +336,13 @@ class MusiXmatch(SymbolsReplaced):
 
 class Genius(Backend):
     """Fetch lyrics from Genius via its API and page scraping.
-    Simply adapted from https://bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/"""
-    
+
+    Simply adapted from
+    bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+    """
+
     base_url = "https://api.genius.com"
-    
+
     def __init__(self, config, log):
         super(Genius, self).__init__(config, log)
         self.api_key = config['genius_api_key'].as_str()
@@ -349,34 +352,40 @@ class Genius(Backend):
         }
 
     def lyrics_from_song_api_path(self, song_api_path):
-      song_url = self.base_url + song_api_path
-      response = requests.get(song_url, headers=self.headers)
-      json = response.json()
-      path = json["response"]["song"]["path"]
-      #gotta go regular html scraping... come on Genius
-      page_url = "https://genius.com" + path
-      page = requests.get(page_url)
-      html = BeautifulSoup(page.text, "html.parser")
-      #remove script tags that they put in the middle of the lyrics
-      [h.extract() for h in html('script')]
-      #at least Genius is nice and has a tag called 'lyrics'!
-      lyrics = html.find("div", class_="lyrics").get_text() #updated css where the lyrics are based in HTML
-      return lyrics
+        song_url = self.base_url + song_api_path
+        response = requests.get(song_url, headers=self.headers)
+        json = response.json()
+        path = json["response"]["song"]["path"]
+
+        # Gotta go regular html scraping... come on Genius.
+        page_url = "https://genius.com" + path
+        page = requests.get(page_url)
+        html = BeautifulSoup(page.text, "html.parser")
+
+        # Remove script tags that they put in the middle of the lyrics.
+        [h.extract() for h in html('script')]
+
+        # At least Genius is nice and has a tag called 'lyrics'!
+        # Updated css where the lyrics are based in HTML.
+        lyrics = html.find("div", class_="lyrics").get_text()
+
+        return lyrics
 
     def fetch(self, artist, title):
-      search_url = self.base_url + "/search"
-      data = {'q': title}
-      response = requests.get(search_url, params=data, headers=self.headers)
-      json = response.json()
-      
-      song_info = None
-      for hit in json["response"]["hits"]:
-        if hit["result"]["primary_artist"]["name"] == artist:
-          song_info = hit
-          break
-      if song_info:
-        song_api_path = song_info["result"]["api_path"]
-        return self.lyrics_from_song_api_path(song_api_path)
+        search_url = self.base_url + "/search"
+        data = {'q': title}
+        response = requests.get(search_url, params=data, headers=self.headers)
+        json = response.json()
+
+        song_info = None
+        for hit in json["response"]["hits"]:
+            if hit["result"]["primary_artist"]["name"] == artist:
+                song_info = hit
+                break
+
+        if song_info:
+            song_api_path = song_info["result"]["api_path"]
+            return self.lyrics_from_song_api_path(song_api_path)
 
 
 class LyricsWiki(SymbolsReplaced):