Clean up code and tests for Genius Lyrics plugin backend (#3641)

* clean-up code & add tests for genius lyrics backend

* add genius fetch tests

* organize imports: standard lib -> pip -> local

* check in sample genius lyrics page

* fix mock import

* force utf-8 encoding for opened files

* use io.open to force utf-8 encoding w/ python2.7
This commit is contained in:
jtpavlock 2020-07-06 08:41:27 -05:00 committed by GitHub
parent cb668ccdab
commit 45abc9ed7a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 2356 additions and 82 deletions

1
.gitignore vendored
View file

@ -7,7 +7,6 @@
# Project Specific patterns
man
test/rsrc/lyrics/*
# The rest is from https://www.gitignore.io/api/python

View file

@ -359,16 +359,54 @@ class Genius(Backend):
'User-Agent': USER_AGENT,
}
def lyrics_from_song_page(self, page_url):
# Gotta go regular html scraping... come on Genius.
self._log.debug(u'fetching lyrics from: {0}', page_url)
try:
page = requests.get(page_url)
except requests.RequestException as exc:
self._log.debug(u'Genius page request for {0} failed: {1}',
page_url, exc)
def fetch(self, artist, title):
    """Fetch lyrics from genius.com.

    Because Genius doesn't allow accessing lyrics via the API, we first
    query the API for a URL matching our artist & title, then attempt to
    scrape that URL for the lyrics.

    :param artist: artist name, compared (via ``slug``) with each search
        hit's primary artist
    :param title: song title, forwarded to the search query
    :returns: the result of ``_scrape_lyrics_from_html`` for the first
        hit whose artist matches, or ``None`` when the search yields
        nothing usable or no hit matches
    """
    search_result = self._search(artist, title)
    if not search_result:
        self._log.debug(u'Genius API request returned invalid JSON')
        return None

    # A decoded payload is not guaranteed to have the expected
    # "response" -> "hits" structure; bail out instead of raising.
    try:
        hits = search_result["response"]["hits"]
    except (KeyError, TypeError):
        self._log.debug(u'Genius API response contained no search hits')
        return None

    # find a matching artist in the json
    for hit in hits:
        result = hit["result"]
        if slug(result["primary_artist"]["name"]) == slug(artist):
            return self._scrape_lyrics_from_html(
                self.fetch_url(result["url"]))

    self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
                    artist)
    return None
def _search(self, artist, title):
    """Search the Genius API for a given artist and title.

    https://docs.genius.com/#search-h2

    :param artist: artist name; lower-cased before it is added to the
        query
    :param title: song title
    :returns: the decoded JSON response, or ``None`` if the request
        fails or the body is not valid JSON
    """
    search_url = self.base_url + "/search"
    query = {'q': title + " " + artist.lower()}
    try:
        # ``params`` places the query in the URL's query string;
        # ``data`` would send it as a request body instead.
        response = requests.get(
            search_url, params=query, headers=self.headers)
    except requests.RequestException as exc:
        self._log.debug(u'Genius API request failed: {0}', exc)
        return None

    try:
        return response.json()
    except ValueError:
        return None
def _scrape_lyrics_from_html(self, html):
"""Scrape lyrics from a given genius.com html"""
html = BeautifulSoup(html, "html.parser")
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in html('script')]
@ -402,31 +440,6 @@ class Genius(Backend):
return lyrics_div.get_text()
def fetch(self, artist, title):
    """Search the Genius API for a song and scrape the matching page.

    :param artist: artist name; lower-cased for the query and compared
        (via ``slug``) with each hit's primary artist
    :param title: song title
    :returns: the result of ``lyrics_from_song_page`` for the first hit
        whose artist matches, or ``None`` when the request fails, the
        body is not valid JSON, or no hit matches
    """
    search_url = self.base_url + "/search"
    query = {'q': title + " " + artist.lower()}
    try:
        # ``params`` places the query in the URL's query string;
        # ``data`` would send it as a request body instead.
        response = requests.get(search_url, params=query,
                                headers=self.headers)
    except requests.RequestException as exc:
        self._log.debug(u'Genius API request failed: {0}', exc)
        return None

    try:
        payload = response.json()
    except ValueError:
        self._log.debug(u'Genius API request returned invalid JSON')
        return None

    for hit in payload["response"]["hits"]:
        hit_artist = hit["result"]["primary_artist"]["name"]
        if slug(hit_artist) == slug(artist):
            return self.lyrics_from_song_page(hit["result"]["url"])

    self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
                    artist)
    return None
class LyricsWiki(SymbolsReplaced):
"""Fetch lyrics from LyricsWiki."""

File diff suppressed because one or more lines are too long

View file

@ -18,23 +18,21 @@
from __future__ import absolute_import, division, print_function
import itertools
from io import open
import os
import re
import six
import sys
import unittest
from mock import patch
from test import _common
import confuse
from mock import MagicMock, patch
from beets import logging
from beets.library import Item
from beets.util import bytestring_path
import confuse
from beetsplug import lyrics
from mock import MagicMock
from test import _common
log = logging.getLogger('beets.test_lyrics')
@ -232,38 +230,11 @@ class MockFetchUrl(object):
def __call__(self, url, filename=None):
    """Return the contents of the file that ``url_to_filename`` maps
    ``url`` to, recording the URL on ``self.fetched``.

    ``filename`` is accepted but not used.
    """
    self.fetched = url
    fn = url_to_filename(url)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(fn, 'r', encoding="utf8") as f:
        return f.read()
class GeniusMockGet(object):
    """Callable stand-in for ``requests.get`` in the Genius tests.

    A call that passes ``headers`` is answered with a canned JSON body;
    a call without headers is answered with the contents of the file
    that ``url_to_filename`` maps the URL to.
    """

    def __init__(self, pathval='fetched_path'):
        self.pathval = pathval
        # URL of the most recent page (header-less) request, if any.
        self.fetched = None

    def __call__(self, url, headers=False):
        from requests.models import Response

        response = Response()

        # for the first requests.get() return a path
        if headers:
            response.status_code = 200
            response._content = (
                b'{"meta":{"status":200},'
                b'"response":{"song":{"path":"/lyrics/sample"}}}'
            )
            return response

        # for the second requests.get() return the genius page
        self.fetched = url
        fn = url_to_filename(url)
        with open(fn, 'r', encoding="utf8") as f:
            content = f.read()
        # Give this one response its own body instead of replacing
        # ``text`` on the Response class, which would leak into every
        # Response created afterwards.
        response._content = content.encode('utf8')
        response.encoding = 'utf8'
        return response
def is_lyrics_content_ok(title, text):
"""Compare lyrics text to expected lyrics for given title."""
if not text:
@ -445,8 +416,9 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
google.is_page_candidate(url, url_title, s['title'], u'Sunn O)))')
class LyricsGeniusBaseTest(unittest.TestCase):
# test Genius backend
class GeniusBaseTest(unittest.TestCase):
def setUp(self):
"""Set up configuration."""
try:
@ -457,28 +429,91 @@ class LyricsGeniusBaseTest(unittest.TestCase):
self.skipTest("Python's built-in HTML parser is not good enough")
class LyricsGeniusScrapeTest(LyricsGeniusBaseTest):
"""Checks that Genius backend works as intended.
"""
import requests
class GeniusScrapeLyricsFromHtmlTest(GeniusBaseTest):
    """tests Genius._scrape_lyrics_from_html()"""

    def setUp(self):
        """Set up configuration"""
        GeniusBaseTest.setUp(self)
        self.plugin = lyrics.LyricsPlugin()

    def test_no_lyrics_div(self):
        """Ensure we don't crash when the html for a genius page doesn't
        contain <div class="lyrics"></div>
        """
        # https://github.com/beetbox/beets/issues/3535
        # expected return value None
        url = 'https://genius.com/sample'
        fetch_page = MockFetchUrl()
        self.assertIsNone(genius._scrape_lyrics_from_html(fetch_page(url)))

    def test_good_lyrics(self):
        """Ensure we are able to scrape a page with lyrics"""
        url = 'https://genius.com/Wu-tang-clan-cream-lyrics'
        fetch_page = MockFetchUrl()
        self.assertIsNotNone(
            genius._scrape_lyrics_from_html(fetch_page(url)))
# TODO: find an example of a lyrics page with multiple divs and test it
class GeniusFetchTest(GeniusBaseTest):
    """Exercises Genius.fetch() with searching and scraping patched out."""

    def setUp(self):
        """Run the base Genius setup, then create the lyrics plugin."""
        super(GeniusFetchTest, self).setUp()
        self.plugin = lyrics.LyricsPlugin()

    @patch.object(lyrics.Genius, '_scrape_lyrics_from_html')
    @patch.object(lyrics.Backend, 'fetch_url', return_value=True)
    def test_json(self, mock_fetch_url, mock_scrape):
        """Check that fetch() picks the hit whose artist matches."""
        blackbear_hit = {
            "result": {
                "primary_artist": {"name": u"\u200Bblackbear"},
                "url": "blackbear_url",
            },
        }
        el_p_hit = {
            "result": {
                "primary_artist": {"name": u"El\u002Dp"},
                "url": "El-p_url",
            },
        }
        search_result = {"response": {"hits": [blackbear_hit, el_p_hit]}}

        search_patch = patch.object(
            lyrics.Genius, '_search', return_value=search_result)
        with search_patch as mock_json:
            # Genius prefixes lowercase artist names with a zero-width
            # space (\u200B); those must still match.
            found = genius.fetch('blackbear', 'Idfc')
            self.assertIsNotNone(found)
            mock_fetch_url.assert_called_once_with("blackbear_url")
            mock_scrape.assert_called_once_with(True)

            # Genius uses the hyphen-minus (\u002D) as its dash.
            found = genius.fetch('El-p', 'Idfc')
            self.assertIsNotNone(found)
            mock_fetch_url.assert_called_with('El-p_url')
            mock_scrape.assert_called_with(True)

            # No hit carries this artist.
            self.assertIsNone(genius.fetch('doesntexist', 'none'))

            # The search step yields nothing (invalid JSON).
            mock_json.return_value = None
            self.assertIsNone(genius.fetch('blackbear', 'Idfc'))
# TODO: add integration test hitting real api
# test utilities
class SlugTests(unittest.TestCase):