diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 6714b2fee..cdaf102e3 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -21,6 +21,7 @@ from __future__ import absolute_import, division, print_function import difflib import itertools import json +import struct import re import requests import unicodedata @@ -53,7 +54,6 @@ from beets import plugins from beets import ui import beets - DIV_RE = re.compile(r'<(/?)div>?', re.I) COMMENT_RE = re.compile(r'', re.S) TAG_RE = re.compile(r'<[^>]*>') @@ -77,6 +77,12 @@ USER_AGENT = 'beets/{}'.format(beets.__version__) # Utilities. +def unichar(i): + try: + return six.unichr(i) + except ValueError: + return struct.pack('i', i).decode('utf-32') + def unescape(text): """Resolve xx; HTML entities (and some others).""" @@ -86,7 +92,7 @@ def unescape(text): def replchar(m): num = m.group(1) - return six.unichr(int(num)) + return unichar(int(num)) out = re.sub(u"(\d+);", replchar, out) return out @@ -104,7 +110,6 @@ def extract_text_in(html, starttag): """Extract the text from a
1 - - class MockFetchUrl(object): + def __init__(self, pathval='fetched_path'): self.pathval = pathval self.fetched = None @@ -217,174 +214,173 @@ class MockFetchUrl(object): def is_lyrics_content_ok(title, text): - """Compare lyrics text to expected lyrics for given title""" - - keywords = LYRICS_TEXTS[google.slugify(title)] - return all(x in text.lower() for x in keywords) + """Compare lyrics text to expected lyrics for given title.""" + if not text: + return + keywords = set(LYRICS_TEXTS[google.slugify(title)].split()) + words = set(x.strip(".?, ") for x in text.lower().split()) + return keywords <= words LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics') LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml')) -DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna') - -DEFAULT_SOURCES = [ - dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', - path=u'The_Beatles:Lady_Madonna'), - dict(artist=u'Santana', title=u'Black magic woman', - url='http://www.lyrics.com/', - path=u'black-magic-woman-lyrics-santana.html'), - dict(DEFAULT_SONG, url='https://www.musixmatch.com/', - path=u'lyrics/The-Beatles/Lady-Madonna'), -] - -# Every source entered in default beets google custom search engine -# must be listed below. -# Use default query when possible, or override artist and title fields -# if website don't have lyrics for default query. -GOOGLE_SOURCES = [ - dict(DEFAULT_SONG, - url=u'http://www.absolutelyrics.com', - path=u'/lyrics/view/the_beatles/lady_madonna'), - dict(DEFAULT_SONG, - url=u'http://www.azlyrics.com', - path=u'/lyrics/beatles/ladymadonna.html'), - dict(DEFAULT_SONG, - url=u'http://www.chartlyrics.com', - path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), - dict(DEFAULT_SONG, - url=u'http://www.elyricsworld.com', - path=u'/lady_madonna_lyrics_beatles.html'), - dict(url=u'http://www.lacoccinelle.net', - artist=u'Jacques Brel', title=u"Amsterdam", - path=u'/paroles-officielles/275679.html'), - dict(DEFAULT_SONG, - url=u'http://letras.mus.br/', path=u'the-beatles/275/'), - dict(DEFAULT_SONG, - url='http://www.lyricsmania.com/', - path='lady_madonna_lyrics_the_beatles.html'), - dict(artist=u'Santana', title=u'Black magic woman', - url='http://www.lyrics.com/', - path=u'black-magic-woman-lyrics-santana.html'), - dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', - path=u'The_Beatles:Lady_Madonna'), - dict(DEFAULT_SONG, - url=u'http://www.lyrics.net', path=u'/lyric/19110224'), - dict(DEFAULT_SONG, - url=u'http://www.lyricsmode.com', - path=u'/lyrics/b/beatles/lady_madonna.html'), - dict(url=u'http://www.lyricsontop.com', - artist=u'Amy Winehouse', title=u"Jazz'n'blues", - path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'), - dict(DEFAULT_SONG, - url='http://www.metrolyrics.com/', - path='lady-madonna-lyrics-beatles.html'), - dict(url='http://www.musica.com/', path='letras.asp?letra=2738', - artist=u'Santana', title=u'Black magic woman'), - dict(DEFAULT_SONG, - url=u'http://www.onelyrics.net/', - artist=u'Ben & Ellen Harper', title=u'City of dreams', - path='ben-ellen-harper-city-of-dreams-lyrics'), - dict(url=u'http://www.paroles.net/', - artist=u'Lilly Wood & the prick', title=u"Hey it's ok", - path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'), - dict(DEFAULT_SONG, - url='http://www.releaselyrics.com', - path=u'/346e/the-beatles-lady-madonna-(love-version)/'), - dict(DEFAULT_SONG, - url=u'http://www.smartlyrics.com', - path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), - dict(DEFAULT_SONG, - url='http://www.songlyrics.com', - path=u'/the-beatles/lady-madonna-lyrics'), - dict(DEFAULT_SONG, - url=u'http://www.stlyrics.com', - path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'), - dict(DEFAULT_SONG, - url=u'http://www.sweetslyrics.com', - path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html') -] -class LyricsGooglePluginTest(unittest.TestCase): - """Test scraping heuristics on a fake html page. - Or run lyrics_download_samples.py first to check that beets google - custom search engine sources are correctly scraped. - """ - source = dict(url=u'http://www.example.com', artist=u'John Doe', - title=u'Beets song', path=u'/lyrics/beetssong') +class LyricsGoogleBaseTest(unittest.TestCase): def setUp(self): - """Set up configuration""" + """Set up configuration.""" try: __import__('bs4') except ImportError: self.skipTest('Beautiful Soup 4 not available') if sys.version_info[:3] < (2, 7, 3): self.skipTest("Python's built-in HTML parser is not good enough") - lyrics.LyricsPlugin() - raw_backend.fetch_url = MockFetchUrl() + +class LyricsPluginSourcesTest(LyricsGoogleBaseTest): + """Check that beets google custom search engine sources are correctly + scraped. + """ + + DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna') + + DEFAULT_SOURCES = [ + dict(DEFAULT_SONG, backend=lyrics.LyricsWiki), + dict(DEFAULT_SONG, backend=lyrics.LyricsCom), + dict(artist=u'Santana', title=u'Black magic woman', + backend=lyrics.MusiXmatch), + dict(DEFAULT_SONG, backend=lyrics.Genius), + ] + + GOOGLE_SOURCES = [ + dict(DEFAULT_SONG, + url=u'http://www.absolutelyrics.com', + path=u'/lyrics/view/the_beatles/lady_madonna'), + dict(DEFAULT_SONG, + url=u'http://www.azlyrics.com', + path=u'/lyrics/beatles/ladymadonna.html'), + dict(DEFAULT_SONG, + url=u'http://www.chartlyrics.com', + path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), + dict(DEFAULT_SONG, + url=u'http://www.elyricsworld.com', + path=u'/lady_madonna_lyrics_beatles.html'), + dict(url=u'http://www.lacoccinelle.net', + artist=u'Jacques Brel', title=u"Amsterdam", + path=u'/paroles-officielles/275679.html'), + dict(DEFAULT_SONG, + url=u'http://letras.mus.br/', path=u'the-beatles/275/'), + dict(DEFAULT_SONG, + url='http://www.lyricsmania.com/', + path='lady_madonna_lyrics_the_beatles.html'), + dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', + path=u'The_Beatles:Lady_Madonna'), + dict(DEFAULT_SONG, + url=u'http://www.lyricsmode.com', + path=u'/lyrics/b/beatles/lady_madonna.html'), + dict(url=u'http://www.lyricsontop.com', + artist=u'Amy Winehouse', title=u"Jazz'n'blues", + path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'), + dict(DEFAULT_SONG, + url='http://www.metrolyrics.com/', + path='lady-madonna-lyrics-beatles.html'), + dict(url='http://www.musica.com/', path='letras.asp?letra=2738', + artist=u'Santana', title=u'Black magic woman'), + dict(url=u'http://www.paroles.net/', + artist=u'Lilly Wood & the prick', title=u"Hey it's ok", + path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'), + dict(DEFAULT_SONG, + url='http://www.songlyrics.com', + path=u'/the-beatles/lady-madonna-lyrics'), + dict(DEFAULT_SONG, + url=u'http://www.sweetslyrics.com', + path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html') + ] + + def setUp(self): + LyricsGoogleBaseTest.setUp(self) + self.plugin = lyrics.LyricsPlugin() + + @unittest.skipUnless(os.environ.get( + 'BEETS_TEST_LYRICS_SOURCES', '0') == '1', + 'lyrics sources testing not enabled') + def test_backend_sources_ok(self): + """Test default backends with songs known to exist in respective databases. + """ + errors = [] + for s in self.DEFAULT_SOURCES: + res = s['backend'](self.plugin.config, self.plugin._log).fetch( + s['artist'], s['title']) + if not is_lyrics_content_ok(s['title'], res): + errors.append(s['backend'].__name__) + self.assertFalse(errors) + + @unittest.skipUnless(os.environ.get( + 'BEETS_TEST_LYRICS_SOURCES', '0') == '1', + 'lyrics sources testing not enabled') + def test_google_sources_ok(self): + """Test if lyrics present on websites registered in beets google custom + search engine are correctly scraped. + """ + for s in self.GOOGLE_SOURCES: + url = s['url'] + s['path'] + res = lyrics.scrape_lyrics_from_html( + raw_backend.fetch_url(url)) + self.assertTrue(google.is_lyrics(res), url) + self.assertTrue(is_lyrics_content_ok(s['title'], res), url) + + +class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): + """Test scraping heuristics on a fake html page. + """ + + source = dict(url=u'http://www.example.com', artist=u'John Doe', + title=u'Beets song', path=u'/lyrics/beetssong') + + def setUp(self): + """Set up configuration""" + LyricsGoogleBaseTest.setUp(self) + self.plugin = lyrics.LyricsPlugin() + + @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) def test_mocked_source_ok(self): """Test that lyrics of the mocked page are correctly scraped""" url = self.source['url'] + self.source['path'] - if os.path.isfile(url_to_filename(url)): - res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url)) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(self.source['title'], res), - url) - - def test_google_sources_ok(self): - """Test if lyrics present on websites registered in beets google custom - search engine are correctly scraped.""" - if not check_lyrics_fetched(): - self.skipTest("Run lyrics_download_samples.py script first.") - for s in GOOGLE_SOURCES: - url = s['url'] + s['path'] - if os.path.isfile(url_to_filename(url)): - res = lyrics.scrape_lyrics_from_html( - raw_backend.fetch_url(url)) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(s['title'], res), url) - - def test_default_ok(self): - """Test default engines with the default query""" - if not check_lyrics_fetched(): - self.skipTest("Run lyrics_download_samples.py script first.") - for (source, s) in zip([lyrics.LyricsWiki, - lyrics.LyricsCom, - lyrics.MusiXmatch], DEFAULT_SOURCES): - url = s['url'] + s['path'] - if os.path.isfile(url_to_filename(url)): - res = source({}, log).fetch(s['artist'], s['title']) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(s['title'], res), url) + res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url)) + self.assertTrue(google.is_lyrics(res), url) + self.assertTrue(is_lyrics_content_ok(self.source['title'], res), + url) + @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) def test_is_page_candidate_exact_match(self): """Test matching html page title with song infos -- when song infos are - present in the title.""" + present in the title. + """ from bs4 import SoupStrainer, BeautifulSoup s = self.source url = six.text_type(s['url'] + s['path']) html = raw_backend.fetch_url(url) soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) - self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), - True, url) + self.assertEqual( + google.is_page_candidate(url, soup.title.string, + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are - not present in the title.""" + not present in the title. + """ s = self.source url = s['url'] + s['path'] url_title = u'example.com | Beats song by John doe' # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist self.assertEqual(google.is_page_candidate(url, url_title, s['title'], - s['artist']), True, url) + s['artist']), True, url) # reject different title url_title = u'example.com | seets bong lyrics by John doe' self.assertEqual(google.is_page_candidate(url, url_title, s['title'], - s['artist']), False, url) + s['artist']), False, url) def test_is_page_candidate_special_chars(self): """Ensure that `is_page_candidate` doesn't crash when the artist