Merge pull request #2538 from Kraymer/lyrics-test

Tests to track whether lyrics websites are correctly fetched
This commit is contained in:
Fabrice Laporte 2017-05-03 20:02:03 +02:00 committed by GitHub
commit fc6b65d592
3 changed files with 211 additions and 193 deletions

View file

@ -21,6 +21,7 @@ from __future__ import absolute_import, division, print_function
import difflib
import itertools
import json
import struct
import re
import requests
import unicodedata
@ -53,7 +54,6 @@ from beets import plugins
from beets import ui
import beets
DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
@ -77,6 +77,12 @@ USER_AGENT = 'beets/{}'.format(beets.__version__)
# Utilities.
def unichar(i):
    """Return the unicode character for the integer code point ``i``.

    ``six.unichr`` is tried first. If it raises ``ValueError``, the
    character is built instead by packing ``i`` with the ``'i'`` struct
    format and decoding the resulting bytes as UTF-32.
    """
    try:
        char = six.unichr(i)
    except ValueError:
        # NOTE(review): presumably reached on narrow Python 2 builds for
        # code points above 0xFFFF -- confirm.
        packed = struct.pack('i', i)
        char = packed.decode('utf-32')
    return char
def unescape(text):
"""Resolve &#xxx; HTML entities (and some others)."""
@ -86,7 +92,7 @@ def unescape(text):
def replchar(m):
num = m.group(1)
return six.unichr(int(num))
return unichar(int(num))
out = re.sub(u"&#(\d+);", replchar, out)
return out
@ -104,7 +110,6 @@ def extract_text_in(html, starttag):
"""Extract the text from a <DIV> tag in the HTML starting with
``starttag``. Returns None if parsing fails.
"""
# Strip off the leading text before opening tag.
try:
_, html = html.split(starttag, 1)
@ -145,10 +150,10 @@ def search_pairs(item):
and featured artists from the strings and add them as candidates.
The method also tries to split multiple titles separated with `/`.
"""
def generate_alternatives(string, patterns):
"""Generate string alternatives by extracting first matching group for
each given pattern."""
each given pattern.
"""
alternatives = [string]
for pattern in patterns:
match = re.search(pattern, string, re.IGNORECASE)
@ -254,16 +259,18 @@ class MusiXmatch(SymbolsReplaced):
def fetch(self, artist, title):
url = self.build_url(artist, title)
html = self.fetch_url(url)
if not html:
return
lyrics = extract_text_between(html,
'"body":', '"language":')
html_part = html.split('<p class="mxm-lyrics__content')[-1]
lyrics = extract_text_between(html_part, '>', '</p>')
return lyrics.strip(',"').replace('\\n', '\n')
class Genius(Backend):
"""Fetch lyrics from Genius via genius-api."""
def __init__(self, config, log):
super(Genius, self).__init__(config, log)
self.api_key = config['genius_api_key'].as_str()
@ -355,6 +362,7 @@ class Genius(Backend):
class LyricsWiki(SymbolsReplaced):
"""Fetch lyrics from LyricsWiki."""
URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def fetch(self, artist, title):
@ -375,6 +383,7 @@ class LyricsWiki(SymbolsReplaced):
class LyricsCom(Backend):
"""Fetch lyrics from Lyrics.com."""
URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
NOT_FOUND = (
'Sorry, we do not have the lyric',
@ -478,6 +487,7 @@ def scrape_lyrics_from_html(html):
class Google(Backend):
"""Fetch lyrics from Google search results."""
def __init__(self, config, log):
super(Google, self).__init__(config, log)
self.api_key = config['google_API_key'].as_str()
@ -713,7 +723,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
def fetch_item_lyrics(self, lib, item, write, force):
"""Fetch and store lyrics for a single item. If ``write``, then the
lyrics will also be written to the file itself."""
lyrics will also be written to the file itself.
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
self._log.info(u'lyrics already present: {0}', item)

View file

@ -1,45 +1,56 @@
Beets_song:
- geeks
- bouquet
- panacea
# Song used by LyricsGooglePluginMachineryTest
Amsterdam:
- oriflammes
- fortune
- batave
- pissent
Lady_Madonna:
- heaven
- tuesday
- thursday
Jazz_n_blues:
- parkway
- balance
- impatient
- shoes
Hey_it_s_ok:
- swear
- forgive
- drink
- found
City_of_dreams:
- groves
- landmarks
- twilight
- freeways
Black_magic_woman:
- devil
- magic
- spell
- heart
Beets_song: |
beets is the media library management system for obsessive-compulsive music geeks the purpose of
beets is to get your music collection right once and for all it catalogs your collection
automatically improving its metadata as it goes it then provides a bouquet of tools for
manipulating and accessing your music here's an example of beets' brainy tag corrector doing its
because beets is designed as a library it can do almost anything you can imagine for your
music collection via plugins beets becomes a panacea
missing_texts: |
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
as they'll be released by $ARTIST, check back soon!
In case you have the lyrics to $TITLE and want to send them to us, fill out
the following form.
# Song lyrics used to test the different sources present in the Google custom search engine.
# The word order is randomized to avoid copyright infringement.
Amsterdam: |
coup corps coeur invitent mains comme trop morue le hantent mais la dames joli revenir aux
mangent croquer pleine plantent rire de sortent pleins fortune d'amsterdam bruit ruisselants
large poissons braguette leur putains blanches jusque pissent dans soleils dansent et port
bien vertu nez sur chaleur femmes rotant dorment marins boivent bu les que d'un qui je
une cou hambourg plus ils dents ou tournent or berges d'ailleurs tout ciel haubans ce son lueurs
en lune ont mouchent leurs long frottant jusqu'en vous regard montrent langueurs chantent
tordent pleure donnent drames mornes des panse pour un sent encore referment nappes au meurent
geste quand puis alors frites grosses batave expire naissent reboivent oriflammes grave riant a
enfin rance fier y bouffer s'entendre se mieux
Lady_Madonna: |
feed his money tuesday manage didn't head feet see arrives at in madonna rest morning children
wonder how make thursday your to sunday music papers come tie you has was is listen suitcase
ends friday run that needed breast they child baby mending on lady learned a nun like did wednesday
bed think without afternoon night meet the playing lying
Jazz_n_blues: |
all shoes money through follow blow til father to his hit jazz kiss now cool bar cause 50 night
heading i'll says yeah cash forgot blues out what for ways away fingers waiting got ever bold
screen sixty throw wait on about last compton days o pick love wall had within jeans jd next
miss standing from it's two long fight extravagant tell today more buy shopping that didn't
what's but russian up can parkway balance my and gone am it as at in check if bags when cross
machine take you drinks coke june wrong coming fancy's i n' impatient so the main's spend
that's
Hey_it_s_ok: |
and forget be when please it against fighting mama cause ! again what said
things papa hey to much lovers way wet was too do drink and i who forgive
hey fourteen please know not wanted had myself ok friends bed times looked
swear act found the my mean
Black_magic_woman: |
blind heart sticks just don't into back alone see need yes your out devil make that to black got
you might me woman turning spell stop baby with 'round a on stone messin' magic i of
tricks up leave turn bad so pick she's my can't

View file

@ -15,21 +15,25 @@
"""Tests for the 'lyrics' plugin."""
from __future__ import division, absolute_import, print_function
from __future__ import absolute_import, division, print_function
import os
import sys
import re
import six
import sys
import unittest
from mock import patch
from test import _common
from mock import MagicMock
from beets import logging
from beets.library import Item
from beets.util import bytestring_path, confit
from beetsplug import lyrics
from beets.library import Item
from beets.util import confit, bytestring_path
from beets import logging
import six
from mock import MagicMock
log = logging.getLogger('beets.test_lyrics')
raw_backend = lyrics.Backend({}, log)
@ -37,8 +41,9 @@ google = lyrics.Google(MagicMock(), log)
class LyricsPluginTest(unittest.TestCase):
def setUp(self):
"""Set up configuration"""
"""Set up configuration."""
lyrics.LyricsPlugin()
def test_search_artist(self):
@ -194,16 +199,8 @@ def url_to_filename(url):
return fn
def check_lyrics_fetched():
    """Tell whether lyrics pages have been downloaded into the resources.

    Returns True when ``LYRICS_ROOT_DIR`` holds more than one
    sub-directory, which is the case once lyrics_download_samples.py
    has been run.
    """
    subdir_count = 0
    for entry in os.listdir(LYRICS_ROOT_DIR):
        if os.path.isdir(os.path.join(LYRICS_ROOT_DIR, entry)):
            subdir_count += 1
    # example.com is the only lyrics dir added to repo
    return subdir_count > 1
class MockFetchUrl(object):
def __init__(self, pathval='fetched_path'):
self.pathval = pathval
self.fetched = None
@ -217,174 +214,173 @@ class MockFetchUrl(object):
def is_lyrics_content_ok(title, text):
"""Compare lyrics text to expected lyrics for given title"""
keywords = LYRICS_TEXTS[google.slugify(title)]
return all(x in text.lower() for x in keywords)
"""Compare lyrics text to expected lyrics for given title."""
if not text:
return
keywords = set(LYRICS_TEXTS[google.slugify(title)].split())
words = set(x.strip(".?, ") for x in text.lower().split())
return keywords <= words
LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics')
LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml'))
DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
DEFAULT_SOURCES = [
dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
path=u'The_Beatles:Lady_Madonna'),
dict(artist=u'Santana', title=u'Black magic woman',
url='http://www.lyrics.com/',
path=u'black-magic-woman-lyrics-santana.html'),
dict(DEFAULT_SONG, url='https://www.musixmatch.com/',
path=u'lyrics/The-Beatles/Lady-Madonna'),
]
# Every source entered in the default beets Google custom search engine
# must be listed below.
# Use the default query when possible, or override the artist and title
# fields if the website doesn't have lyrics for the default query.
GOOGLE_SOURCES = [
dict(DEFAULT_SONG,
url=u'http://www.absolutelyrics.com',
path=u'/lyrics/view/the_beatles/lady_madonna'),
dict(DEFAULT_SONG,
url=u'http://www.azlyrics.com',
path=u'/lyrics/beatles/ladymadonna.html'),
dict(DEFAULT_SONG,
url=u'http://www.chartlyrics.com',
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
dict(DEFAULT_SONG,
url=u'http://www.elyricsworld.com',
path=u'/lady_madonna_lyrics_beatles.html'),
dict(url=u'http://www.lacoccinelle.net',
artist=u'Jacques Brel', title=u"Amsterdam",
path=u'/paroles-officielles/275679.html'),
dict(DEFAULT_SONG,
url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
dict(DEFAULT_SONG,
url='http://www.lyricsmania.com/',
path='lady_madonna_lyrics_the_beatles.html'),
dict(artist=u'Santana', title=u'Black magic woman',
url='http://www.lyrics.com/',
path=u'black-magic-woman-lyrics-santana.html'),
dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
path=u'The_Beatles:Lady_Madonna'),
dict(DEFAULT_SONG,
url=u'http://www.lyrics.net', path=u'/lyric/19110224'),
dict(DEFAULT_SONG,
url=u'http://www.lyricsmode.com',
path=u'/lyrics/b/beatles/lady_madonna.html'),
dict(url=u'http://www.lyricsontop.com',
artist=u'Amy Winehouse', title=u"Jazz'n'blues",
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
dict(DEFAULT_SONG,
url='http://www.metrolyrics.com/',
path='lady-madonna-lyrics-beatles.html'),
dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
artist=u'Santana', title=u'Black magic woman'),
dict(DEFAULT_SONG,
url=u'http://www.onelyrics.net/',
artist=u'Ben & Ellen Harper', title=u'City of dreams',
path='ben-ellen-harper-city-of-dreams-lyrics'),
dict(url=u'http://www.paroles.net/',
artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
dict(DEFAULT_SONG,
url='http://www.releaselyrics.com',
path=u'/346e/the-beatles-lady-madonna-(love-version)/'),
dict(DEFAULT_SONG,
url=u'http://www.smartlyrics.com',
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(DEFAULT_SONG,
url='http://www.songlyrics.com',
path=u'/the-beatles/lady-madonna-lyrics'),
dict(DEFAULT_SONG,
url=u'http://www.stlyrics.com',
path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
dict(DEFAULT_SONG,
url=u'http://www.sweetslyrics.com',
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
]
class LyricsGooglePluginTest(unittest.TestCase):
"""Test scraping heuristics on a fake html page.
Or run lyrics_download_samples.py first to check that beets google
custom search engine sources are correctly scraped.
"""
source = dict(url=u'http://www.example.com', artist=u'John Doe',
title=u'Beets song', path=u'/lyrics/beetssong')
class LyricsGoogleBaseTest(unittest.TestCase):
def setUp(self):
"""Set up configuration"""
"""Set up configuration."""
try:
__import__('bs4')
except ImportError:
self.skipTest('Beautiful Soup 4 not available')
if sys.version_info[:3] < (2, 7, 3):
self.skipTest("Python's built-in HTML parser is not good enough")
lyrics.LyricsPlugin()
raw_backend.fetch_url = MockFetchUrl()
class LyricsPluginSourcesTest(LyricsGoogleBaseTest):
    """Check that lyrics are correctly fetched from the default backends
    and scraped from the websites listed in ``GOOGLE_SOURCES``.

    Both tests are skipped unless the BEETS_TEST_LYRICS_SOURCES
    environment variable is set to '1'.
    """

    # Query used for every source that does not override artist/title.
    DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')

    # One entry per backend class exercised by test_backend_sources_ok.
    DEFAULT_SOURCES = [
        dict(DEFAULT_SONG, backend=lyrics.LyricsWiki),
        dict(DEFAULT_SONG, backend=lyrics.LyricsCom),
        dict(artist=u'Santana', title=u'Black magic woman',
             backend=lyrics.MusiXmatch),
        dict(DEFAULT_SONG, backend=lyrics.Genius),
    ]

    # One sample lyrics page per website checked by test_google_sources_ok.
    GOOGLE_SOURCES = [
        dict(DEFAULT_SONG,
             url=u'http://www.absolutelyrics.com',
             path=u'/lyrics/view/the_beatles/lady_madonna'),
        dict(DEFAULT_SONG,
             url=u'http://www.azlyrics.com',
             path=u'/lyrics/beatles/ladymadonna.html'),
        dict(DEFAULT_SONG,
             url=u'http://www.chartlyrics.com',
             path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
        dict(DEFAULT_SONG,
             url=u'http://www.elyricsworld.com',
             path=u'/lady_madonna_lyrics_beatles.html'),
        dict(url=u'http://www.lacoccinelle.net',
             artist=u'Jacques Brel', title=u"Amsterdam",
             path=u'/paroles-officielles/275679.html'),
        dict(DEFAULT_SONG,
             url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
        dict(DEFAULT_SONG,
             url='http://www.lyricsmania.com/',
             path='lady_madonna_lyrics_the_beatles.html'),
        dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
             path=u'The_Beatles:Lady_Madonna'),
        dict(DEFAULT_SONG,
             url=u'http://www.lyricsmode.com',
             path=u'/lyrics/b/beatles/lady_madonna.html'),
        dict(url=u'http://www.lyricsontop.com',
             artist=u'Amy Winehouse', title=u"Jazz'n'blues",
             path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
        dict(DEFAULT_SONG,
             url='http://www.metrolyrics.com/',
             path='lady-madonna-lyrics-beatles.html'),
        dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
             artist=u'Santana', title=u'Black magic woman'),
        dict(url=u'http://www.paroles.net/',
             artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
        dict(DEFAULT_SONG,
             url='http://www.songlyrics.com',
             path=u'/the-beatles/lady-madonna-lyrics'),
        dict(DEFAULT_SONG,
             url=u'http://www.sweetslyrics.com',
             path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
    ]

    def setUp(self):
        """Run the base class set-up, then instantiate the lyrics plugin."""
        LyricsGoogleBaseTest.setUp(self)
        self.plugin = lyrics.LyricsPlugin()

    @unittest.skipUnless(
        os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1',
        'lyrics sources testing not enabled')
    def test_backend_sources_ok(self):
        """Test default backends with songs known to exist in their
        respective databases.

        Every backend is queried; the names of the ones whose lyrics do
        not pass ``is_lyrics_content_ok`` are collected, and the test
        fails if that collection is not empty.
        """
        failed_backends = []
        for source in self.DEFAULT_SOURCES:
            backend_cls = source['backend']
            backend = backend_cls(self.plugin.config, self.plugin._log)
            fetched = backend.fetch(source['artist'], source['title'])
            if is_lyrics_content_ok(source['title'], fetched):
                continue
            failed_backends.append(backend_cls.__name__)
        self.assertFalse(failed_backends)

    @unittest.skipUnless(
        os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1',
        'lyrics sources testing not enabled')
    def test_google_sources_ok(self):
        """Test if lyrics present on websites registered in beets google
        custom search engine are correctly scraped.
        """
        for source in self.GOOGLE_SOURCES:
            url = source['url'] + source['path']
            html = raw_backend.fetch_url(url)
            scraped = lyrics.scrape_lyrics_from_html(html)
            # The URL is passed as the failure message to identify the site.
            self.assertTrue(google.is_lyrics(scraped), url)
            content_ok = is_lyrics_content_ok(source['title'], scraped)
            self.assertTrue(content_ok, url)
class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
"""Test scraping heuristics on a fake html page.
"""
source = dict(url=u'http://www.example.com', artist=u'John Doe',
title=u'Beets song', path=u'/lyrics/beetssong')
def setUp(self):
"""Set up configuration"""
LyricsGoogleBaseTest.setUp(self)
self.plugin = lyrics.LyricsPlugin()
@patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
def test_mocked_source_ok(self):
"""Test that lyrics of the mocked page are correctly scraped"""
url = self.source['url'] + self.source['path']
if os.path.isfile(url_to_filename(url)):
res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
self.assertTrue(google.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
url)
def test_google_sources_ok(self):
"""Test if lyrics present on websites registered in beets google custom
search engine are correctly scraped."""
if not check_lyrics_fetched():
self.skipTest("Run lyrics_download_samples.py script first.")
for s in GOOGLE_SOURCES:
url = s['url'] + s['path']
if os.path.isfile(url_to_filename(url)):
res = lyrics.scrape_lyrics_from_html(
raw_backend.fetch_url(url))
self.assertTrue(google.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
def test_default_ok(self):
"""Test default engines with the default query"""
if not check_lyrics_fetched():
self.skipTest("Run lyrics_download_samples.py script first.")
for (source, s) in zip([lyrics.LyricsWiki,
lyrics.LyricsCom,
lyrics.MusiXmatch], DEFAULT_SOURCES):
url = s['url'] + s['path']
if os.path.isfile(url_to_filename(url)):
res = source({}, log).fetch(s['artist'], s['title'])
self.assertTrue(google.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
self.assertTrue(google.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
url)
@patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
def test_is_page_candidate_exact_match(self):
"""Test matching html page title with song infos -- when song infos are
present in the title."""
present in the title.
"""
from bs4 import SoupStrainer, BeautifulSoup
s = self.source
url = six.text_type(s['url'] + s['path'])
html = raw_backend.fetch_url(url)
soup = BeautifulSoup(html, "html.parser",
parse_only=SoupStrainer('title'))
self.assertEqual(google.is_page_candidate(url, soup.title.string,
s['title'], s['artist']),
True, url)
self.assertEqual(
google.is_page_candidate(url, soup.title.string,
s['title'], s['artist']), True, url)
def test_is_page_candidate_fuzzy_match(self):
"""Test matching html page title with song infos -- when song infos are
not present in the title."""
not present in the title.
"""
s = self.source
url = s['url'] + s['path']
url_title = u'example.com | Beats song by John doe'
# very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
s['artist']), True, url)
s['artist']), True, url)
# reject different title
url_title = u'example.com | seets bong lyrics by John doe'
self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
s['artist']), False, url)
s['artist']), False, url)
def test_is_page_candidate_special_chars(self):
"""Ensure that `is_page_candidate` doesn't crash when the artist