From 2bf58a61c31ebec6b0dfc0ea212fc22f3da3614b Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Sun, 30 Apr 2017 23:14:23 +0200 Subject: [PATCH 01/15] Decode string with Unicode escape --- beetsplug/lyrics.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 6714b2fee..14ca6a4b5 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -21,6 +21,7 @@ from __future__ import absolute_import, division, print_function import difflib import itertools import json +import struct import re import requests import unicodedata @@ -77,6 +78,11 @@ USER_AGENT = 'beets/{}'.format(beets.__version__) # Utilities. +def unichar(i): + try: + return six.unichr(i) + except ValueError: + return struct.pack('i', i).decode('utf-32') def unescape(text): """Resolve &#xxx; HTML entities (and some others).""" @@ -86,7 +92,7 @@ def unescape(text): def replchar(m): num = m.group(1) - return six.unichr(int(num)) + return unichar(int(num)) out = re.sub(u"&#(\d+);", replchar, out) return out From a165d6c00bf022345852e20ccc6624fd3d5da04d Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Mon, 1 May 2017 23:40:09 +0200 Subject: [PATCH 02/15] Fix MusiXmatch text extraction markers --- beetsplug/lyrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 14ca6a4b5..ad2d278b5 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -263,8 +263,8 @@ class MusiXmatch(SymbolsReplaced): html = self.fetch_url(url) if not html: return - lyrics = extract_text_between(html, - '"body":', '"language":') + lyrics = extract_text_between(html, '

', + '

') return lyrics.strip(',"').replace('\\n', '\n') From f8862ac0ea8d3a055a5ebfc0abda0c2b28773947 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 00:52:07 +0200 Subject: [PATCH 03/15] Sort imports --- test/test_lyrics.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 13ba07fdf..eb9d17dec 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -15,21 +15,25 @@ """Tests for the 'lyrics' plugin.""" -from __future__ import division, absolute_import, print_function +from __future__ import absolute_import, division, print_function import os -import sys import re +import six +import sys import unittest +from mock import patch from test import _common -from mock import MagicMock + +from beets import logging +from beets.library import Item +from beets.util import bytestring_path, confit from beetsplug import lyrics -from beets.library import Item -from beets.util import confit, bytestring_path -from beets import logging -import six + +from mock import MagicMock + log = logging.getLogger('beets.test_lyrics') raw_backend = lyrics.Backend({}, log) From 4e0527f07d5cdfa623216caa21904d74fdd289dd Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 00:54:58 +0200 Subject: [PATCH 04/15] Docstrings style --- test/test_lyrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index eb9d17dec..d8b9e672e 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -42,7 +42,7 @@ google = lyrics.Google(MagicMock(), log) class LyricsPluginTest(unittest.TestCase): def setUp(self): - """Set up configuration""" + """Set up configuration.""" lyrics.LyricsPlugin() def test_search_artist(self): @@ -317,7 +317,7 @@ class LyricsGooglePluginTest(unittest.TestCase): title=u'Beets song', path=u'/lyrics/beetssong') def setUp(self): - """Set up configuration""" + """Set up configuration.""" try: __import__('bs4') except ImportError: From a85dcd88c4898c067982e7a87f7234be99aa9f5d Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 00:56:56 +0200 Subject: [PATCH 05/15] Store whole expected lyrics, not just keywords, but randomized --- test/rsrc/lyricstext.yaml | 87 ++++++++++++++++++++++----------------- test/test_lyrics.py | 10 +++-- 2 files changed, 55 insertions(+), 42 deletions(-) diff --git a/test/rsrc/lyricstext.yaml b/test/rsrc/lyricstext.yaml index 814c207df..7ae1a70e7 100644 --- a/test/rsrc/lyricstext.yaml +++ b/test/rsrc/lyricstext.yaml @@ -1,45 +1,56 @@ -Beets_song: - - geeks - - bouquet - - panacea +# Song used by LyricsGooglePluginMachineryTest -Amsterdam: - - oriflammes - - fortune - - batave - - pissent - -Lady_Madonna: - - heaven - - tuesday - - thursday - -Jazz_n_blues: - - parkway - - balance - - impatient - - shoes - -Hey_it_s_ok: - - swear - - forgive - - drink - - found - -City_of_dreams: - - groves - - landmarks - - twilight - - freeways - -Black_magic_woman: - - devil - - magic - - spell - - heart +Beets_song: | + beets is the media library management system for obsessive-compulsive music geeks the purpose of + beets is to get your music collection right once and for all it catalogs your collection + automatically improving its metadata as it goes it then provides a bouquet of tools for + manipulating and accessing your music here's an example of beets' brainy tag corrector doing its + because beets is designed as a library it can do almost anything you can imagine for your + music collection via plugins beets becomes a panacea missing_texts: | Lyricsmania staff is working hard for you to add $TITLE lyrics as soon as they'll be released by $ARTIST, check back soon! In case you have the lyrics to $TITLE and want to send them to us, fill out the following form. + +# Songs lyrics used to test the different sources present in the google custom search engine. +# Text is randomized for copyright infringement reason. + +Amsterdam: | + coup corps coeur invitent mains comme trop morue le hantent mais la dames joli revenir aux + mangent croquer pleine plantent rire de sortent pleins fortune d'amsterdam bruit ruisselants + large poissons braguette leur putains blanches jusque pissent dans soleils dansent et port + bien vertu nez sur chaleur femmes rotant dorment marins boivent bu les que d'un qui je + une cou hambourg plus ils dents ou tournent or berges d'ailleurs tout ciel haubans ce son lueurs + en lune ont mouchent leurs long frottant jusqu'en vous regard montrent langueurs chantent + tordent pleure donnent drames mornes des panse pour un sent encore referment nappes au meurent + geste quand puis alors frites grosses batave expire naissent reboivent oriflammes grave riant a + enfin rance fier y bouffer s'entendre se mieux + +Lady_Madonna: | + feed his money tuesday manage didn't head feet see arrives at in madonna rest morning children + wonder how make thursday your to sunday music papers come tie you has was is listen suitcase + ends friday run that needed breast they child baby mending on lady learned a nun like did wednesday + bed think without afternoon night meet the playing lying + +Jazz_n_blues: | + all shoes money through follow blow til father to his hit jazz kiss now cool bar cause 50 night + heading i'll says yeah cash forgot blues out what for ways away fingers waiting got ever bold + screen sixty throw wait on about last compton days o pick love wall had within jeans jd next + miss standing from it's two long fight extravagant tell today more buy shopping that didn't + what's but russian up can parkway balance my and gone am it as at in check if bags when cross + machine take you drinks coke june wrong coming fancy's i n' impatient so the main's spend + that's + +Hey_it_s_ok: | + and forget be when please it against fighting mama cause ! again what said + things papa hey to much lovers way wet was too do drink and i who forgive + hey fourteen please know not wanted had myself ok friends bed times looked + swear act found the my mean + +Black_magic_woman: | + blind heart sticks just don't into back alone see need yes your out devil make that to black got + you might me woman turning spell stop baby with 'round a on stone messin' magic i of + tricks up leave turn bad so pick she's my can't + diff --git a/test/test_lyrics.py b/test/test_lyrics.py index d8b9e672e..d36499d16 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -221,10 +221,12 @@ class MockFetchUrl(object): def is_lyrics_content_ok(title, text): - """Compare lyrics text to expected lyrics for given title""" - - keywords = LYRICS_TEXTS[google.slugify(title)] - return all(x in text.lower() for x in keywords) + """Compare lyrics text to expected lyrics for given title.""" + if not text: + return + keywords = set(LYRICS_TEXTS[google.slugify(title)].split()) + words = set(x.strip(".?, ") for x in text.lower().split()) + return keywords <= words LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics') LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml')) From d88cabc8464ab338b143e336afcf03ace2303266 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 01:03:26 +0200 Subject: [PATCH 06/15] Divide LyricsGooglePluginTest into two classes. Move existing tests into LyricsGooglePluginMachineryTest. Create LyricsPluginSourcesTest class to check fetching of each source. Some code was supposed to do that until now but was never executed as we exited early at the "if not check_lyrics_fetched():" check. --- test/test_lyrics.py | 227 +++++++++++++++++++------------------------- 1 file changed, 100 insertions(+), 127 deletions(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index d36499d16..b6abf8d9e 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -198,15 +198,6 @@ def url_to_filename(url): return fn -def check_lyrics_fetched(): - """Return True if lyrics_download_samples.py has been runned and lyrics - pages are present in resources directory""" - lyrics_dirs = len([d for d in os.listdir(LYRICS_ROOT_DIR) if - os.path.isdir(os.path.join(LYRICS_ROOT_DIR, d))]) - # example.com is the only lyrics dir added to repo - return lyrics_dirs > 1 - - class MockFetchUrl(object): def __init__(self, pathval='fetched_path'): self.pathval = pathval @@ -230,94 +221,9 @@ def is_lyrics_content_ok(title, text): LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics') LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml')) -DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna') - -DEFAULT_SOURCES = [ - dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', - path=u'The_Beatles:Lady_Madonna'), - dict(artist=u'Santana', title=u'Black magic woman', - url='http://www.lyrics.com/', - path=u'black-magic-woman-lyrics-santana.html'), - dict(DEFAULT_SONG, url='https://www.musixmatch.com/', - path=u'lyrics/The-Beatles/Lady-Madonna'), -] - -# Every source entered in default beets google custom search engine -# must be listed below. -# Use default query when possible, or override artist and title fields -# if website don't have lyrics for default query. -GOOGLE_SOURCES = [ - dict(DEFAULT_SONG, - url=u'http://www.absolutelyrics.com', - path=u'/lyrics/view/the_beatles/lady_madonna'), - dict(DEFAULT_SONG, - url=u'http://www.azlyrics.com', - path=u'/lyrics/beatles/ladymadonna.html'), - dict(DEFAULT_SONG, - url=u'http://www.chartlyrics.com', - path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), - dict(DEFAULT_SONG, - url=u'http://www.elyricsworld.com', - path=u'/lady_madonna_lyrics_beatles.html'), - dict(url=u'http://www.lacoccinelle.net', - artist=u'Jacques Brel', title=u"Amsterdam", - path=u'/paroles-officielles/275679.html'), - dict(DEFAULT_SONG, - url=u'http://letras.mus.br/', path=u'the-beatles/275/'), - dict(DEFAULT_SONG, - url='http://www.lyricsmania.com/', - path='lady_madonna_lyrics_the_beatles.html'), - dict(artist=u'Santana', title=u'Black magic woman', - url='http://www.lyrics.com/', - path=u'black-magic-woman-lyrics-santana.html'), - dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', - path=u'The_Beatles:Lady_Madonna'), - dict(DEFAULT_SONG, - url=u'http://www.lyrics.net', path=u'/lyric/19110224'), - dict(DEFAULT_SONG, - url=u'http://www.lyricsmode.com', - path=u'/lyrics/b/beatles/lady_madonna.html'), - dict(url=u'http://www.lyricsontop.com', - artist=u'Amy Winehouse', title=u"Jazz'n'blues", - path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'), - dict(DEFAULT_SONG, - url='http://www.metrolyrics.com/', - path='lady-madonna-lyrics-beatles.html'), - dict(url='http://www.musica.com/', path='letras.asp?letra=2738', - artist=u'Santana', title=u'Black magic woman'), - dict(DEFAULT_SONG, - url=u'http://www.onelyrics.net/', - artist=u'Ben & Ellen Harper', title=u'City of dreams', - path='ben-ellen-harper-city-of-dreams-lyrics'), - dict(url=u'http://www.paroles.net/', - artist=u'Lilly Wood & the prick', title=u"Hey it's ok", - path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'), - dict(DEFAULT_SONG, - url='http://www.releaselyrics.com', - path=u'/346e/the-beatles-lady-madonna-(love-version)/'), - dict(DEFAULT_SONG, - url=u'http://www.smartlyrics.com', - path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), - dict(DEFAULT_SONG, - url='http://www.songlyrics.com', - path=u'/the-beatles/lady-madonna-lyrics'), - dict(DEFAULT_SONG, - url=u'http://www.stlyrics.com', - path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'), - dict(DEFAULT_SONG, - url=u'http://www.sweetslyrics.com', - path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html') -] -class LyricsGooglePluginTest(unittest.TestCase): - """Test scraping heuristics on a fake html page. - Or run lyrics_download_samples.py first to check that beets google - custom search engine sources are correctly scraped. - """ - source = dict(url=u'http://www.example.com', artist=u'John Doe', - title=u'Beets song', path=u'/lyrics/beetssong') - +class LyricsGoogleBaseTest(unittest.TestCase): def setUp(self): """Set up configuration.""" try: @@ -326,44 +232,112 @@ class LyricsGooglePluginTest(unittest.TestCase): self.skipTest('Beautiful Soup 4 not available') if sys.version_info[:3] < (2, 7, 3): self.skipTest("Python's built-in HTML parser is not good enough") - lyrics.LyricsPlugin() - raw_backend.fetch_url = MockFetchUrl() - def test_mocked_source_ok(self): - """Test that lyrics of the mocked page are correctly scraped""" - url = self.source['url'] + self.source['path'] - if os.path.isfile(url_to_filename(url)): - res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url)) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(self.source['title'], res), - url) + +class LyricsPluginSourcesTest(LyricsGoogleBaseTest): + """Check that beets google custom search engine sources are correctly scraped. + """ + + DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna') + + DEFAULT_SOURCES = [ + dict(DEFAULT_SONG, backend=lyrics.LyricsWiki), + # dict(artist=u'Santana', title=u'Black magic woman', backend=lyrics.MusiXmatch), + # dict(DEFAULT_SONG, backend=lyrics.Genius), + ] + + GOOGLE_SOURCES = [ + dict(DEFAULT_SONG, + url=u'http://www.absolutelyrics.com', + path=u'/lyrics/view/the_beatles/lady_madonna'), + dict(DEFAULT_SONG, + url=u'http://www.azlyrics.com', + path=u'/lyrics/beatles/ladymadonna.html'), + dict(DEFAULT_SONG, + url=u'http://www.chartlyrics.com', + path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), + dict(DEFAULT_SONG, + url=u'http://www.elyricsworld.com', + path=u'/lady_madonna_lyrics_beatles.html'), + dict(url=u'http://www.lacoccinelle.net', + artist=u'Jacques Brel', title=u"Amsterdam", + path=u'/paroles-officielles/275679.html'), + dict(DEFAULT_SONG, + url=u'http://letras.mus.br/', path=u'the-beatles/275/'), + dict(DEFAULT_SONG, + url='http://www.lyricsmania.com/', + path='lady_madonna_lyrics_the_beatles.html'), + dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', + path=u'The_Beatles:Lady_Madonna'), + dict(DEFAULT_SONG, + url=u'http://www.lyricsmode.com', + path=u'/lyrics/b/beatles/lady_madonna.html'), + dict(url=u'http://www.lyricsontop.com', + artist=u'Amy Winehouse', title=u"Jazz'n'blues", + path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'), + dict(DEFAULT_SONG, + url='http://www.metrolyrics.com/', + path='lady-madonna-lyrics-beatles.html'), + dict(url='http://www.musica.com/', path='letras.asp?letra=2738', + artist=u'Santana', title=u'Black magic woman'), + dict(url=u'http://www.paroles.net/', + artist=u'Lilly Wood & the prick', title=u"Hey it's ok", + path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'), + dict(DEFAULT_SONG, + url='http://www.songlyrics.com', + path=u'/the-beatles/lady-madonna-lyrics'), + dict(DEFAULT_SONG, + url=u'http://www.sweetslyrics.com', + path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html') + ] + + def setUp(self): + LyricsGoogleBaseTest.setUp(self) + self.plugin = lyrics.LyricsPlugin() + + def test_backend_sources_ok(self): + """Test default backends with songs known to exist in respective databases. + """ + errors = [] + for s in self.DEFAULT_SOURCES: + res = s['backend'](self.plugin.config, self.plugin._log).fetch(s['artist'], s['title']) + if not is_lyrics_content_ok(s['title'], res): + errors.append(s['backend'].__name__) + self.assertFalse(errors) def test_google_sources_ok(self): """Test if lyrics present on websites registered in beets google custom search engine are correctly scraped.""" - if not check_lyrics_fetched(): - self.skipTest("Run lyrics_download_samples.py script first.") - for s in GOOGLE_SOURCES: + for s in self.GOOGLE_SOURCES: url = s['url'] + s['path'] - if os.path.isfile(url_to_filename(url)): - res = lyrics.scrape_lyrics_from_html( - raw_backend.fetch_url(url)) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(s['title'], res), url) + res = lyrics.scrape_lyrics_from_html( + raw_backend.fetch_url(url)) + self.assertTrue(google.is_lyrics(res), url) + self.assertTrue(is_lyrics_content_ok(s['title'], res), url) - def test_default_ok(self): - """Test default engines with the default query""" - if not check_lyrics_fetched(): - self.skipTest("Run lyrics_download_samples.py script first.") - for (source, s) in zip([lyrics.LyricsWiki, - lyrics.LyricsCom, - lyrics.MusiXmatch], DEFAULT_SOURCES): - url = s['url'] + s['path'] - if os.path.isfile(url_to_filename(url)): - res = source({}, log).fetch(s['artist'], s['title']) - self.assertTrue(google.is_lyrics(res), url) - self.assertTrue(is_lyrics_content_ok(s['title'], res), url) +class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): + """Test scraping heuristics on a fake html page. + """ + source = dict(url=u'http://www.example.com', artist=u'John Doe', + title=u'Beets song', path=u'/lyrics/beetssong') + + def setUp(self): + """Set up configuration""" + LyricsGoogleBaseTest.setUp(self) + self.plugin = lyrics.LyricsPlugin() + + + @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) + def test_mocked_source_ok(self): + """Test that lyrics of the mocked page are correctly scraped""" + url = self.source['url'] + self.source['path'] + res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url)) + self.assertTrue(google.is_lyrics(res), url) + self.assertTrue(is_lyrics_content_ok(self.source['title'], res), + url) + + @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) def test_is_page_candidate_exact_match(self): """Test matching html page title with song infos -- when song infos are present in the title.""" @@ -373,8 +347,7 @@ class LyricsGooglePluginTest(unittest.TestCase): html = raw_backend.fetch_url(url) soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) - self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), + self.assertEqual(google.is_page_candidate(url, soup.title.string, s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): From fa9262d61b65b5023b3be6bd68bd04c434cb3b5c Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 01:05:18 +0200 Subject: [PATCH 07/15] Disable tests that do real requests to lyrics sites by default. Set BEETS_TEST_LYRICS_SOURCES environment variable to '1' to not skip the tests. --- test/test_lyrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index b6abf8d9e..9ed0eb4b4 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -295,6 +295,8 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): LyricsGoogleBaseTest.setUp(self) self.plugin = lyrics.LyricsPlugin() + @unittest.skipUnless(os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1', + 'lyrics sources testing not enabled') def test_backend_sources_ok(self): """Test default backends with songs known to exist in respective databases. """ @@ -305,6 +307,8 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): errors.append(s['backend'].__name__) self.assertFalse(errors) + @unittest.skipUnless(os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1', + 'lyrics sources testing not enabled') def test_google_sources_ok(self): """Test if lyrics present on websites registered in beets google custom search engine are correctly scraped.""" From 3e3ad6974cc58d4b9b2f55f83bd01403339eb65a Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 07:30:40 +0200 Subject: [PATCH 08/15] Fix PEP8 --- beetsplug/lyrics.py | 13 +++++++++---- test/test_lyrics.py | 11 +++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index ad2d278b5..9a60df119 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -84,6 +84,7 @@ def unichar(i): except ValueError: return struct.pack('i', i).decode('utf-32') + def unescape(text): """Resolve &#xxx; HTML entities (and some others).""" if isinstance(text, bytes): @@ -110,7 +111,6 @@ def extract_text_in(html, starttag): """Extract the text from a
tag in the HTML starting with ``starttag``. Returns None if parsing fails. """ - # Strip off the leading text before opening tag. try: _, html = html.split(starttag, 1) @@ -151,10 +151,10 @@ def search_pairs(item): and featured artists from the strings and add them as candidates. The method also tries to split multiple titles separated with `/`. """ - def generate_alternatives(string, patterns): """Generate string alternatives by extracting first matching group for - each given pattern.""" + each given pattern. + """ alternatives = [string] for pattern in patterns: match = re.search(pattern, string, re.IGNORECASE) @@ -270,6 +270,7 @@ class MusiXmatch(SymbolsReplaced): class Genius(Backend): """Fetch lyrics from Genius via genius-api.""" + def __init__(self, config, log): super(Genius, self).__init__(config, log) self.api_key = config['genius_api_key'].as_str() @@ -361,6 +362,7 @@ class Genius(Backend): class LyricsWiki(SymbolsReplaced): """Fetch lyrics from LyricsWiki.""" + URL_PATTERN = 'http://lyrics.wikia.com/%s:%s' def fetch(self, artist, title): @@ -381,6 +383,7 @@ class LyricsWiki(SymbolsReplaced): class LyricsCom(Backend): """Fetch lyrics from Lyrics.com.""" + URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html' NOT_FOUND = ( 'Sorry, we do not have the lyric', @@ -484,6 +487,7 @@ def scrape_lyrics_from_html(html): class Google(Backend): """Fetch lyrics from Google search results.""" + def __init__(self, config, log): super(Google, self).__init__(config, log) self.api_key = config['google_API_key'].as_str() @@ -719,7 +723,8 @@ class LyricsPlugin(plugins.BeetsPlugin): def fetch_item_lyrics(self, lib, item, write, force): """Fetch and store lyrics for a single item. If ``write``, then the - lyrics will also be written to the file itself.""" + lyrics will also be written to the file itself. + """ # Skip if the item already has lyrics. if not force and item.lyrics: self._log.info(u'lyrics already present: {0}', item) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 9ed0eb4b4..0dbf658fe 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -311,7 +311,8 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): 'lyrics sources testing not enabled') def test_google_sources_ok(self): """Test if lyrics present on websites registered in beets google custom - search engine are correctly scraped.""" + search engine are correctly scraped. + """ for s in self.GOOGLE_SOURCES: url = s['url'] + s['path'] res = lyrics.scrape_lyrics_from_html( @@ -323,6 +324,7 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): """Test scraping heuristics on a fake html page. """ + source = dict(url=u'http://www.example.com', artist=u'John Doe', title=u'Beets song', path=u'/lyrics/beetssong') @@ -330,7 +332,6 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): """Set up configuration""" LyricsGoogleBaseTest.setUp(self) self.plugin = lyrics.LyricsPlugin() - @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) def test_mocked_source_ok(self): @@ -344,7 +345,8 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl()) def test_is_page_candidate_exact_match(self): """Test matching html page title with song infos -- when song infos are - present in the title.""" + present in the title. + """ from bs4 import SoupStrainer, BeautifulSoup s = self.source url = six.text_type(s['url'] + s['path']) @@ -356,7 +358,8 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are - not present in the title.""" + not present in the title. + """ s = self.source url = s['url'] + s['path'] url_title = u'example.com | Beats song by John doe' From 11eb90c7588f1d4ef688af1396f01b114f7ba833 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 07:46:36 +0200 Subject: [PATCH 09/15] Fix PEP8 --- beetsplug/lyrics.py | 3 ++- test/test_lyrics.py | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 9a60df119..5cf93471c 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -263,7 +263,8 @@ class MusiXmatch(SymbolsReplaced): html = self.fetch_url(url) if not html: return - lyrics = extract_text_between(html, '

', + lyrics = extract_text_between(html, + '

', '

') return lyrics.strip(',"').replace('\\n', '\n') diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 0dbf658fe..42969e3ea 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -235,14 +235,16 @@ class LyricsGoogleBaseTest(unittest.TestCase): class LyricsPluginSourcesTest(LyricsGoogleBaseTest): - """Check that beets google custom search engine sources are correctly scraped. + """Check that beets google custom search engine sources are correctly + scraped. """ DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna') DEFAULT_SOURCES = [ dict(DEFAULT_SONG, backend=lyrics.LyricsWiki), - # dict(artist=u'Santana', title=u'Black magic woman', backend=lyrics.MusiXmatch), + # dict(artist=u'Santana', title=u'Black magic woman', + # backend=lyrics.MusiXmatch), # dict(DEFAULT_SONG, backend=lyrics.Genius), ] @@ -295,19 +297,22 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): LyricsGoogleBaseTest.setUp(self) self.plugin = lyrics.LyricsPlugin() - @unittest.skipUnless(os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1', + @unittest.skipUnless(os.environ.get( + 'BEETS_TEST_LYRICS_SOURCES', '0') == '1', 'lyrics sources testing not enabled') def test_backend_sources_ok(self): """Test default backends with songs known to exist in respective databases. """ errors = [] for s in self.DEFAULT_SOURCES: - res = s['backend'](self.plugin.config, self.plugin._log).fetch(s['artist'], s['title']) + res = s['backend'](self.plugin.config, self.plugin._log).fetch( + s['artist'], s['title']) if not is_lyrics_content_ok(s['title'], res): errors.append(s['backend'].__name__) self.assertFalse(errors) - @unittest.skipUnless(os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1', + @unittest.skipUnless(os.environ.get( + 'BEETS_TEST_LYRICS_SOURCES', '0') == '1', 'lyrics sources testing not enabled') def test_google_sources_ok(self): """Test if lyrics present on websites registered in beets google custom @@ -353,8 +358,8 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): html = raw_backend.fetch_url(url) soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) - self.assertEqual(google.is_page_candidate(url, soup.title.string, s['title'], s['artist']), - True, url) + self.assertEqual(google.is_page_candidate(url, soup.title.string, + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are From 3e38a33c4a2730d83a5c9d43cdc5c9de6644b552 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 23:37:20 +0200 Subject: [PATCH 10/15] Fix PEP8 --- beetsplug/lyrics.py | 2 +- test/test_lyrics.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 5cf93471c..f75b157c7 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -52,7 +52,6 @@ except ImportError: from beets import plugins from beets import ui -import beets DIV_RE = re.compile(r'<(/?)div>?', re.I) @@ -260,6 +259,7 @@ class MusiXmatch(SymbolsReplaced): def fetch(self, artist, title): url = self.build_url(artist, title) + html = self.fetch_url(url) if not html: return diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 42969e3ea..8dc3e24f0 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -41,6 +41,7 @@ google = lyrics.Google(MagicMock(), log) class LyricsPluginTest(unittest.TestCase): + def setUp(self): """Set up configuration.""" lyrics.LyricsPlugin() @@ -199,6 +200,7 @@ def url_to_filename(url): class MockFetchUrl(object): + def __init__(self, pathval='fetched_path'): self.pathval = pathval self.fetched = None @@ -224,6 +226,7 @@ LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml')) class LyricsGoogleBaseTest(unittest.TestCase): + def setUp(self): """Set up configuration.""" try: @@ -244,7 +247,7 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): DEFAULT_SOURCES = [ dict(DEFAULT_SONG, backend=lyrics.LyricsWiki), # dict(artist=u'Santana', title=u'Black magic woman', - # backend=lyrics.MusiXmatch), + # backend=lyrics.MusiXmatch), # dict(DEFAULT_SONG, backend=lyrics.Genius), ] @@ -359,7 +362,7 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), True, url) + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are @@ -371,11 +374,11 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist self.assertEqual(google.is_page_candidate(url, url_title, s['title'], - s['artist']), True, url) + s['artist']), True, url) # reject different title url_title = u'example.com | seets bong lyrics by John doe' self.assertEqual(google.is_page_candidate(url, url_title, s['title'], - s['artist']), False, url) + s['artist']), False, url) def test_is_page_candidate_special_chars(self): """Ensure that `is_page_candidate` doesn't crash when the artist From 07af27e44b9c923e23673f46aa3de683b34d15ae Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Tue, 2 May 2017 23:40:25 +0200 Subject: [PATCH 11/15] Lyrics are last paragraph with class 'mxm-lyrics__content' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove ‘data-reactid’ from marker. --- beetsplug/lyrics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index f75b157c7..39e1502ed 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -263,9 +263,8 @@ class MusiXmatch(SymbolsReplaced): html = self.fetch_url(url) if not html: return - lyrics = extract_text_between(html, - '

', - '

') + html_part = html.split('

Date: Tue, 2 May 2017 23:48:20 +0200 Subject: [PATCH 12/15] Restore beets module import --- beetsplug/lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 39e1502ed..cdaf102e3 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -52,7 +52,7 @@ except ImportError: from beets import plugins from beets import ui - +import beets DIV_RE = re.compile(r'<(/?)div>?', re.I) COMMENT_RE = re.compile(r'', re.S) From b3fbdbae5a76980f2d022539af88f44b41fc9597 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Wed, 3 May 2017 00:02:09 +0200 Subject: [PATCH 13/15] Fix flake8 --- test/test_lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 8dc3e24f0..3b260c482 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -362,7 +362,7 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), True, url) + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are From f53ab801b83d2d24444e3ae7de1bb11dfbe55dbe Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Wed, 3 May 2017 00:11:26 +0200 Subject: [PATCH 14/15] Add indent --- test/test_lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 3b260c482..e2e5958b1 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -362,7 +362,7 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), True, url) + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are From 8f32bfed82ef7fcf8a3fd88ffe305230bc37d711 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Wed, 3 May 2017 07:42:50 +0200 Subject: [PATCH 15/15] Reactivate test of LyricsCom and MusiXmatch sources --- test/test_lyrics.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index e2e5958b1..a96551e75 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -246,9 +246,10 @@ class LyricsPluginSourcesTest(LyricsGoogleBaseTest): DEFAULT_SOURCES = [ dict(DEFAULT_SONG, backend=lyrics.LyricsWiki), - # dict(artist=u'Santana', title=u'Black magic woman', - # backend=lyrics.MusiXmatch), - # dict(DEFAULT_SONG, backend=lyrics.Genius), + dict(DEFAULT_SONG, backend=lyrics.LyricsCom), + dict(artist=u'Santana', title=u'Black magic woman', + backend=lyrics.MusiXmatch), + dict(DEFAULT_SONG, backend=lyrics.Genius), ] GOOGLE_SOURCES = [ @@ -361,8 +362,9 @@ class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest): html = raw_backend.fetch_url(url) soup = BeautifulSoup(html, "html.parser", parse_only=SoupStrainer('title')) - self.assertEqual(google.is_page_candidate(url, soup.title.string, - s['title'], s['artist']), True, url) + self.assertEqual( + google.is_page_candidate(url, soup.title.string, + s['title'], s['artist']), True, url) def test_is_page_candidate_fuzzy_match(self): """Test matching html page title with song infos -- when song infos are