mirror of
https://github.com/beetbox/beets.git
synced 2026-01-04 15:03:22 +01:00
Clean up code and tests for Genius Lyrics plugin backend (#3641)
* clean-up code & add tests for genius lyrics backend
* add genius fetch tests
* organize imports: standard lib -> pip -> local
* check in sample genius lyrics page
* fix mock import
* force utf-8 encoding for opened files
* use io.open to force utf-8 encoding w/ python2.7
This commit is contained in:
parent
cb668ccdab
commit
45abc9ed7a
4 changed files with 2356 additions and 82 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -7,7 +7,6 @@
|
|||
|
||||
# Project Specific patterns
|
||||
man
|
||||
test/rsrc/lyrics/*
|
||||
|
||||
# The rest is from https://www.gitignore.io/api/python
|
||||
|
||||
|
|
|
|||
|
|
@ -359,16 +359,54 @@ class Genius(Backend):
|
|||
'User-Agent': USER_AGENT,
|
||||
}
|
||||
|
||||
def lyrics_from_song_page(self, page_url):
|
||||
# Gotta go regular html scraping... come on Genius.
|
||||
self._log.debug(u'fetching lyrics from: {0}', page_url)
|
||||
try:
|
||||
page = requests.get(page_url)
|
||||
except requests.RequestException as exc:
|
||||
self._log.debug(u'Genius page request for {0} failed: {1}',
|
||||
page_url, exc)
|
||||
def fetch(self, artist, title):
|
||||
"""Fetch lyrics from genius.com
|
||||
|
||||
Because genius doesn't allow accessing lyrics via the api,
|
||||
we first query the api for a url matching our artist & title,
|
||||
then attempt to scrape that url for the lyrics.
|
||||
"""
|
||||
json = self._search(artist, title)
|
||||
if not json:
|
||||
self._log.debug(u'Genius API request returned invalid JSON')
|
||||
return None
|
||||
html = BeautifulSoup(page.text, "html.parser")
|
||||
|
||||
# find a matching artist in the json
|
||||
for hit in json["response"]["hits"]:
|
||||
hit_artist = hit["result"]["primary_artist"]["name"]
|
||||
|
||||
if slug(hit_artist) == slug(artist):
|
||||
return self._scrape_lyrics_from_html(
|
||||
self.fetch_url(hit["result"]["url"]))
|
||||
|
||||
self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
|
||||
artist)
|
||||
|
||||
def _search(self, artist, title):
|
||||
"""Searches the genius api for a given artist and title
|
||||
|
||||
https://docs.genius.com/#search-h2
|
||||
|
||||
:returns: json response
|
||||
"""
|
||||
search_url = self.base_url + "/search"
|
||||
data = {'q': title + " " + artist.lower()}
|
||||
try:
|
||||
response = requests.get(
|
||||
search_url, data=data, headers=self.headers)
|
||||
except requests.RequestException as exc:
|
||||
self._log.debug(u'Genius API request failed: {0}', exc)
|
||||
return None
|
||||
|
||||
try:
|
||||
return response.json()
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
def _scrape_lyrics_from_html(self, html):
|
||||
"""Scrape lyrics from a given genius.com html"""
|
||||
|
||||
html = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Remove script tags that they put in the middle of the lyrics.
|
||||
[h.extract() for h in html('script')]
|
||||
|
|
@ -402,31 +440,6 @@ class Genius(Backend):
|
|||
|
||||
return lyrics_div.get_text()
|
||||
|
||||
def fetch(self, artist, title):
|
||||
search_url = self.base_url + "/search"
|
||||
data = {'q': title + " " + artist.lower()}
|
||||
try:
|
||||
response = requests.get(search_url, data=data,
|
||||
headers=self.headers)
|
||||
except requests.RequestException as exc:
|
||||
self._log.debug(u'Genius API request failed: {0}', exc)
|
||||
return None
|
||||
|
||||
try:
|
||||
json = response.json()
|
||||
except ValueError:
|
||||
self._log.debug(u'Genius API request returned invalid JSON')
|
||||
return None
|
||||
|
||||
for hit in json["response"]["hits"]:
|
||||
hit_artist = hit["result"]["primary_artist"]["name"]
|
||||
|
||||
if slug(hit_artist) == slug(artist):
|
||||
return self.lyrics_from_song_page(hit["result"]["url"])
|
||||
|
||||
self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
|
||||
artist)
|
||||
|
||||
|
||||
class LyricsWiki(SymbolsReplaced):
|
||||
"""Fetch lyrics from LyricsWiki."""
|
||||
|
|
|
|||
2227
test/rsrc/lyrics/geniuscom/Wutangclancreamlyrics.txt
Normal file
2227
test/rsrc/lyrics/geniuscom/Wutangclancreamlyrics.txt
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -18,23 +18,21 @@
|
|||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import itertools
|
||||
from io import open
|
||||
import os
|
||||
import re
|
||||
import six
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from mock import patch
|
||||
from test import _common
|
||||
import confuse
|
||||
from mock import MagicMock, patch
|
||||
|
||||
from beets import logging
|
||||
from beets.library import Item
|
||||
from beets.util import bytestring_path
|
||||
import confuse
|
||||
|
||||
from beetsplug import lyrics
|
||||
|
||||
from mock import MagicMock
|
||||
from test import _common
|
||||
|
||||
|
||||
log = logging.getLogger('beets.test_lyrics')
|
||||
|
|
@ -232,38 +230,11 @@ class MockFetchUrl(object):
|
|||
def __call__(self, url, filename=None):
|
||||
self.fetched = url
|
||||
fn = url_to_filename(url)
|
||||
with open(fn, 'r') as f:
|
||||
with open(fn, 'r', encoding="utf8") as f:
|
||||
content = f.read()
|
||||
return content
|
||||
|
||||
|
||||
class GeniusMockGet(object):
|
||||
|
||||
def __init__(self, pathval='fetched_path'):
|
||||
self.pathval = pathval
|
||||
self.fetched = None
|
||||
|
||||
def __call__(self, url, headers=False):
|
||||
from requests.models import Response
|
||||
# for the first requests.get() return a path
|
||||
if headers:
|
||||
response = Response()
|
||||
response.status_code = 200
|
||||
response._content = b'{"meta":{"status":200},\
|
||||
"response":{"song":{"path":"/lyrics/sample"}}}'
|
||||
return response
|
||||
# for the second requests.get() return the genius page
|
||||
else:
|
||||
from mock import PropertyMock
|
||||
self.fetched = url
|
||||
fn = url_to_filename(url)
|
||||
with open(fn, 'r') as f:
|
||||
content = f.read()
|
||||
response = Response()
|
||||
type(response).text = PropertyMock(return_value=content)
|
||||
return response
|
||||
|
||||
|
||||
def is_lyrics_content_ok(title, text):
|
||||
"""Compare lyrics text to expected lyrics for given title."""
|
||||
if not text:
|
||||
|
|
@ -445,8 +416,9 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
|
|||
google.is_page_candidate(url, url_title, s['title'], u'Sunn O)))')
|
||||
|
||||
|
||||
class LyricsGeniusBaseTest(unittest.TestCase):
|
||||
# test Genius backend
|
||||
|
||||
class GeniusBaseTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Set up configuration."""
|
||||
try:
|
||||
|
|
@ -457,28 +429,91 @@ class LyricsGeniusBaseTest(unittest.TestCase):
|
|||
self.skipTest("Python's built-in HTML parser is not good enough")
|
||||
|
||||
|
||||
class LyricsGeniusScrapeTest(LyricsGeniusBaseTest):
|
||||
|
||||
"""Checks that Genius backend works as intended.
|
||||
"""
|
||||
import requests
|
||||
class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
|
||||
"""tests Genius._scrape_lyrics_from_html()"""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
LyricsGeniusBaseTest.setUp(self)
|
||||
GeniusBaseTest.setUp(self)
|
||||
self.plugin = lyrics.LyricsPlugin()
|
||||
|
||||
@patch.object(requests, 'get', GeniusMockGet())
|
||||
def test_no_lyrics_div(self):
|
||||
"""Ensure that `lyrics_from_song_page` doesn't crash when the html
|
||||
for a Genius page doesn't contain <div class="lyrics"></div>
|
||||
"""Ensure we don't crash when the html scraped for a genius page
|
||||
doesn't contain <div class="lyrics"></div>
|
||||
"""
|
||||
# https://github.com/beetbox/beets/issues/3535
|
||||
# expected return value None
|
||||
song_url = 'https://genius.com/sample'
|
||||
self.assertEqual(genius.lyrics_from_song_page(song_url),
|
||||
None)
|
||||
url = 'https://genius.com/sample'
|
||||
mock = MockFetchUrl()
|
||||
self.assertEqual(genius._scrape_lyrics_from_html(mock(url)), None)
|
||||
|
||||
def test_good_lyrics(self):
|
||||
"""Ensure we are able to scrape a page with lyrics"""
|
||||
url = 'https://genius.com/Wu-tang-clan-cream-lyrics'
|
||||
mock = MockFetchUrl()
|
||||
self.assertIsNotNone(genius._scrape_lyrics_from_html(mock(url)))
|
||||
|
||||
# TODO: find an example of a lyrics page with multiple divs and test it
|
||||
|
||||
|
||||
class GeniusFetchTest(GeniusBaseTest):
|
||||
"""tests Genius.fetch()"""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
GeniusBaseTest.setUp(self)
|
||||
self.plugin = lyrics.LyricsPlugin()
|
||||
|
||||
@patch.object(lyrics.Genius, '_scrape_lyrics_from_html')
|
||||
@patch.object(lyrics.Backend, 'fetch_url', return_value=True)
|
||||
def test_json(self, mock_fetch_url, mock_scrape):
|
||||
"""Ensure we're finding artist matches"""
|
||||
with patch.object(
|
||||
lyrics.Genius, '_search', return_value={
|
||||
"response": {
|
||||
"hits": [
|
||||
{
|
||||
"result": {
|
||||
"primary_artist": {
|
||||
"name": u"\u200Bblackbear",
|
||||
},
|
||||
"url": "blackbear_url"
|
||||
}
|
||||
},
|
||||
{
|
||||
"result": {
|
||||
"primary_artist": {
|
||||
"name": u"El\u002Dp"
|
||||
},
|
||||
"url": "El-p_url"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
) as mock_json:
|
||||
# genius uses zero-width-spaces (\u200B) for lowercase
|
||||
# artists so we make sure we can match those
|
||||
self.assertIsNotNone(genius.fetch('blackbear', 'Idfc'))
|
||||
mock_fetch_url.assert_called_once_with("blackbear_url")
|
||||
mock_scrape.assert_called_once_with(True)
|
||||
|
||||
# genius uses the hyphen-minus (\u002D) as their dash
|
||||
self.assertIsNotNone(genius.fetch('El-p', 'Idfc'))
|
||||
mock_fetch_url.assert_called_with('El-p_url')
|
||||
mock_scrape.assert_called_with(True)
|
||||
|
||||
# test no matching artist
|
||||
self.assertIsNone(genius.fetch('doesntexist', 'none'))
|
||||
|
||||
# test invalid json
|
||||
mock_json.return_value = None
|
||||
self.assertIsNone(genius.fetch('blackbear', 'Idfc'))
|
||||
|
||||
# TODO: add integration test hitting real api
|
||||
|
||||
|
||||
# test utilities
|
||||
|
||||
class SlugTests(unittest.TestCase):
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue