Merge pull request #2538 from Kraymer/lyrics-test

Tests to track whether lyrics websites are correctly fetched
2026-01-30 03:54:21 +01:00 · 2017-05-03 20:02:03 +02:00 · 2017-05-03 20:02:03 +02:00 · fc6b65d592
commit fc6b65d592
parent 84febb13c1 8f32bfed82
3 changed files with 211 additions and 193 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -21,6 +21,7 @@ from __future__ import absolute_import, division, print_function
 import difflib
 import itertools
 import json
+import struct
 import re
 import requests
 import unicodedata
@ -53,7 +54,6 @@ from beets import plugins
 from beets import ui
 import beets

-
 DIV_RE = re.compile(r'<(/?)div>?', re.I)
 COMMENT_RE = re.compile(r'<!--.*-->', re.S)
 TAG_RE = re.compile(r'<[^>]*>')
@ -77,6 +77,12 @@ USER_AGENT = 'beets/{}'.format(beets.__version__)

 # Utilities.

+def unichar(i):
+    try:
+        return six.unichr(i)
+    except ValueError:
+        return struct.pack('i', i).decode('utf-32')
+

 def unescape(text):
    """Resolve &#xxx; HTML entities (and some others)."""
@ -86,7 +92,7 @@ def unescape(text):

    def replchar(m):
        num = m.group(1)
-        return six.unichr(int(num))
+        return unichar(int(num))
    out = re.sub(u"&#(\d+);", replchar, out)
    return out

@ -104,7 +110,6 @@ def extract_text_in(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails.
    """
-
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
@ -145,10 +150,10 @@ def search_pairs(item):
    and featured artists from the strings and add them as candidates.
    The method also tries to split multiple titles separated with `/`.
    """
-
    def generate_alternatives(string, patterns):
        """Generate string alternatives by extracting first matching group for
-           each given pattern."""
+           each given pattern.
+        """
        alternatives = [string]
        for pattern in patterns:
            match = re.search(pattern, string, re.IGNORECASE)
@ -254,16 +259,18 @@ class MusiXmatch(SymbolsReplaced):

    def fetch(self, artist, title):
        url = self.build_url(artist, title)
+
        html = self.fetch_url(url)
        if not html:
            return
-        lyrics = extract_text_between(html,
-                                      '"body":', '"language":')
+        html_part = html.split('<p class="mxm-lyrics__content')[-1]
+        lyrics = extract_text_between(html_part, '>', '</p>')
        return lyrics.strip(',"').replace('\\n', '\n')


 class Genius(Backend):
    """Fetch lyrics from Genius via genius-api."""
+
    def __init__(self, config, log):
        super(Genius, self).__init__(config, log)
        self.api_key = config['genius_api_key'].as_str()
@ -355,6 +362,7 @@ class Genius(Backend):

 class LyricsWiki(SymbolsReplaced):
    """Fetch lyrics from LyricsWiki."""
+
    URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'

    def fetch(self, artist, title):
@ -375,6 +383,7 @@ class LyricsWiki(SymbolsReplaced):

 class LyricsCom(Backend):
    """Fetch lyrics from Lyrics.com."""
+
    URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
    NOT_FOUND = (
        'Sorry, we do not have the lyric',
@ -478,6 +487,7 @@ def scrape_lyrics_from_html(html):

 class Google(Backend):
    """Fetch lyrics from Google search results."""
+
    def __init__(self, config, log):
        super(Google, self).__init__(config, log)
        self.api_key = config['google_API_key'].as_str()
@ -713,7 +723,8 @@ class LyricsPlugin(plugins.BeetsPlugin):

    def fetch_item_lyrics(self, lib, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then the
-        lyrics will also be written to the file itself."""
+           lyrics will also be written to the file itself.
+        """
        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            self._log.info(u'lyrics already present: {0}', item)
--- a/test/rsrc/lyricstext.yaml
+++ b/test/rsrc/lyricstext.yaml
@ -1,45 +1,56 @@
-Beets_song:
-    - geeks
-    - bouquet
-    - panacea
+# Song used by LyricsGooglePluginMachineryTest

-Amsterdam:
-    - oriflammes
-    - fortune
-    - batave
-    - pissent
-
-Lady_Madonna:
-    - heaven
-    - tuesday
-    - thursday
-
-Jazz_n_blues:
-    - parkway
-    - balance
-    - impatient
-    - shoes
-
-Hey_it_s_ok:
-    - swear
-    - forgive
-    - drink
-    - found
-
-City_of_dreams:
-    - groves
-    - landmarks
-    - twilight
-    - freeways
-
-Black_magic_woman:
-    - devil
-    - magic
-    - spell
-    - heart
+Beets_song: |
+    beets is the media library management system for obsessive-compulsive music geeks the purpose of 
+    beets is to get your music collection right once and for all it catalogs your collection 
+    automatically improving its metadata as it goes it then provides a bouquet of tools for 
+    manipulating and accessing your music here's an example of beets' brainy tag corrector doing its 
+    because beets is designed as a library it can do almost anything you can imagine for your 
+    music collection via plugins beets becomes a panacea

 missing_texts: |
    Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
    as they'll be released by $ARTIST, check back soon!
    In case you have the lyrics to $TITLE and want to send them to us, fill out
    the following form.
+
+# Song lyrics used to test the different sources present in the Google custom search engine.
+# Word order is randomized to avoid copyright infringement.
+
+Amsterdam: |
+    coup corps coeur invitent mains comme trop morue le hantent mais la dames joli revenir aux 
+    mangent croquer pleine plantent rire de sortent pleins fortune d'amsterdam bruit ruisselants 
+    large poissons braguette leur putains blanches jusque pissent dans soleils dansent et port
+    bien vertu nez sur chaleur femmes rotant dorment marins boivent bu les que d'un qui je 
+    une cou hambourg plus ils dents ou tournent or berges d'ailleurs tout ciel haubans ce son lueurs
+    en lune ont mouchent leurs long frottant jusqu'en vous regard montrent langueurs chantent
+    tordent pleure donnent drames mornes des panse pour un sent encore referment nappes au meurent
+    geste quand puis alors frites grosses batave expire naissent reboivent oriflammes grave riant a 
+    enfin rance fier y bouffer s'entendre se mieux
+
+Lady_Madonna: |
+    feed his money tuesday manage didn't head feet see arrives at in madonna rest morning children 
+    wonder how make thursday your to sunday music papers come tie you has was is listen suitcase 
+    ends friday run that needed breast they child baby mending on lady learned a nun like did wednesday 
+    bed think without afternoon night meet the playing lying
+
+Jazz_n_blues: |
+    all shoes money through follow blow til father to his hit jazz kiss now cool bar cause 50 night
+    heading i'll says yeah cash forgot blues out what for ways away fingers waiting got ever bold 
+    screen sixty throw wait on about last compton days o pick love wall had within jeans jd next 
+    miss standing from it's two long fight extravagant tell today more buy shopping that didn't 
+    what's but russian up can parkway balance my and gone am it as at in check if bags when cross 
+    machine take you drinks coke june wrong coming fancy's i n' impatient so the main's spend 
+    that's
+
+Hey_it_s_ok: |
+    and forget be when please it against fighting mama cause ! again what said
+    things papa hey to much lovers way wet was too do drink and i who forgive
+    hey fourteen please know not wanted had myself ok friends bed times looked
+    swear act found the my mean
+
+Black_magic_woman: |
+    blind heart sticks just don't into back alone see need yes your out devil make that to black got
+    you might me woman turning spell stop baby with 'round a on stone messin' magic i of 
+    tricks up leave turn bad so pick she's my can't
+
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -15,21 +15,25 @@

 """Tests for the 'lyrics' plugin."""

-from __future__ import division, absolute_import, print_function
+from __future__ import absolute_import, division, print_function

 import os
-import sys
 import re
+import six
+import sys
 import unittest

+from mock import patch
 from test import _common
-from mock import MagicMock
+
+from beets import logging
+from beets.library import Item
+from beets.util import bytestring_path, confit

 from beetsplug import lyrics
-from beets.library import Item
-from beets.util import confit, bytestring_path
-from beets import logging
-import six
+
+from mock import MagicMock
+

 log = logging.getLogger('beets.test_lyrics')
 raw_backend = lyrics.Backend({}, log)
@ -37,8 +41,9 @@ google = lyrics.Google(MagicMock(), log)


 class LyricsPluginTest(unittest.TestCase):
+
    def setUp(self):
-        """Set up configuration"""
+        """Set up configuration."""
        lyrics.LyricsPlugin()

    def test_search_artist(self):
@ -194,16 +199,8 @@ def url_to_filename(url):
    return fn


-def check_lyrics_fetched():
-    """Return True if lyrics_download_samples.py has been runned and lyrics
-    pages are present in resources directory"""
-    lyrics_dirs = len([d for d in os.listdir(LYRICS_ROOT_DIR) if
-                      os.path.isdir(os.path.join(LYRICS_ROOT_DIR, d))])
-    # example.com is the only lyrics dir added to repo
-    return lyrics_dirs > 1
-
-
 class MockFetchUrl(object):
+
    def __init__(self, pathval='fetched_path'):
        self.pathval = pathval
        self.fetched = None
@ -217,174 +214,173 @@ class MockFetchUrl(object):


 def is_lyrics_content_ok(title, text):
-    """Compare lyrics text to expected lyrics for given title"""
-
-    keywords = LYRICS_TEXTS[google.slugify(title)]
-    return all(x in text.lower() for x in keywords)
+    """Compare lyrics text to expected lyrics for given title."""
+    if not text:
+        return
+    keywords = set(LYRICS_TEXTS[google.slugify(title)].split())
+    words = set(x.strip(".?, ") for x in text.lower().split())
+    return keywords <= words

 LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics')
 LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml'))
-DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
-
-DEFAULT_SOURCES = [
-    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
-         path=u'The_Beatles:Lady_Madonna'),
-    dict(artist=u'Santana', title=u'Black magic woman',
-         url='http://www.lyrics.com/',
-         path=u'black-magic-woman-lyrics-santana.html'),
-    dict(DEFAULT_SONG, url='https://www.musixmatch.com/',
-         path=u'lyrics/The-Beatles/Lady-Madonna'),
-]
-
-# Every source entered in default beets google custom search engine
-# must be listed below.
-# Use default query when possible, or override artist and title fields
-# if website don't have lyrics for default query.
-GOOGLE_SOURCES = [
-    dict(DEFAULT_SONG,
-         url=u'http://www.absolutelyrics.com',
-         path=u'/lyrics/view/the_beatles/lady_madonna'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.azlyrics.com',
-         path=u'/lyrics/beatles/ladymadonna.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.chartlyrics.com',
-         path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.elyricsworld.com',
-         path=u'/lady_madonna_lyrics_beatles.html'),
-    dict(url=u'http://www.lacoccinelle.net',
-         artist=u'Jacques Brel', title=u"Amsterdam",
-         path=u'/paroles-officielles/275679.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
-    dict(DEFAULT_SONG,
-         url='http://www.lyricsmania.com/',
-         path='lady_madonna_lyrics_the_beatles.html'),
-    dict(artist=u'Santana', title=u'Black magic woman',
-         url='http://www.lyrics.com/',
-         path=u'black-magic-woman-lyrics-santana.html'),
-    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
-         path=u'The_Beatles:Lady_Madonna'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.lyrics.net', path=u'/lyric/19110224'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.lyricsmode.com',
-         path=u'/lyrics/b/beatles/lady_madonna.html'),
-    dict(url=u'http://www.lyricsontop.com',
-         artist=u'Amy Winehouse', title=u"Jazz'n'blues",
-         path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
-    dict(DEFAULT_SONG,
-         url='http://www.metrolyrics.com/',
-         path='lady-madonna-lyrics-beatles.html'),
-    dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
-         artist=u'Santana', title=u'Black magic woman'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.onelyrics.net/',
-         artist=u'Ben & Ellen Harper', title=u'City of dreams',
-         path='ben-ellen-harper-city-of-dreams-lyrics'),
-    dict(url=u'http://www.paroles.net/',
-         artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
-         path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
-    dict(DEFAULT_SONG,
-         url='http://www.releaselyrics.com',
-         path=u'/346e/the-beatles-lady-madonna-(love-version)/'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.smartlyrics.com',
-         path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
-    dict(DEFAULT_SONG,
-         url='http://www.songlyrics.com',
-         path=u'/the-beatles/lady-madonna-lyrics'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.stlyrics.com',
-         path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://www.sweetslyrics.com',
-         path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
-]


-class LyricsGooglePluginTest(unittest.TestCase):
-    """Test scraping heuristics on a fake html page.
-    Or run lyrics_download_samples.py first to check that beets google
-    custom search engine sources are correctly scraped.
-    """
-    source = dict(url=u'http://www.example.com', artist=u'John Doe',
-                  title=u'Beets song', path=u'/lyrics/beetssong')
+class LyricsGoogleBaseTest(unittest.TestCase):

    def setUp(self):
-        """Set up configuration"""
+        """Set up configuration."""
        try:
            __import__('bs4')
        except ImportError:
            self.skipTest('Beautiful Soup 4 not available')
        if sys.version_info[:3] < (2, 7, 3):
            self.skipTest("Python's built-in HTML parser is not good enough")
-        lyrics.LyricsPlugin()
-        raw_backend.fetch_url = MockFetchUrl()

+
+class LyricsPluginSourcesTest(LyricsGoogleBaseTest):
+    """Check that beets google custom search engine sources are correctly
+       scraped.
+    """
+
+    DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
+
+    DEFAULT_SOURCES = [
+        dict(DEFAULT_SONG, backend=lyrics.LyricsWiki),
+        dict(DEFAULT_SONG, backend=lyrics.LyricsCom),
+        dict(artist=u'Santana', title=u'Black magic woman',
+             backend=lyrics.MusiXmatch),
+        dict(DEFAULT_SONG, backend=lyrics.Genius),
+    ]
+
+    GOOGLE_SOURCES = [
+        dict(DEFAULT_SONG,
+             url=u'http://www.absolutelyrics.com',
+             path=u'/lyrics/view/the_beatles/lady_madonna'),
+        dict(DEFAULT_SONG,
+             url=u'http://www.azlyrics.com',
+             path=u'/lyrics/beatles/ladymadonna.html'),
+        dict(DEFAULT_SONG,
+             url=u'http://www.chartlyrics.com',
+             path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
+        dict(DEFAULT_SONG,
+             url=u'http://www.elyricsworld.com',
+             path=u'/lady_madonna_lyrics_beatles.html'),
+        dict(url=u'http://www.lacoccinelle.net',
+             artist=u'Jacques Brel', title=u"Amsterdam",
+             path=u'/paroles-officielles/275679.html'),
+        dict(DEFAULT_SONG,
+             url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
+        dict(DEFAULT_SONG,
+             url='http://www.lyricsmania.com/',
+             path='lady_madonna_lyrics_the_beatles.html'),
+        dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
+             path=u'The_Beatles:Lady_Madonna'),
+        dict(DEFAULT_SONG,
+             url=u'http://www.lyricsmode.com',
+             path=u'/lyrics/b/beatles/lady_madonna.html'),
+        dict(url=u'http://www.lyricsontop.com',
+             artist=u'Amy Winehouse', title=u"Jazz'n'blues",
+             path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
+        dict(DEFAULT_SONG,
+             url='http://www.metrolyrics.com/',
+             path='lady-madonna-lyrics-beatles.html'),
+        dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
+             artist=u'Santana', title=u'Black magic woman'),
+        dict(url=u'http://www.paroles.net/',
+             artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
+             path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
+        dict(DEFAULT_SONG,
+             url='http://www.songlyrics.com',
+             path=u'/the-beatles/lady-madonna-lyrics'),
+        dict(DEFAULT_SONG,
+             url=u'http://www.sweetslyrics.com',
+             path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
+    ]
+
+    def setUp(self):
+        LyricsGoogleBaseTest.setUp(self)
+        self.plugin = lyrics.LyricsPlugin()
+
+    @unittest.skipUnless(os.environ.get(
+        'BEETS_TEST_LYRICS_SOURCES', '0') == '1',
+        'lyrics sources testing not enabled')
+    def test_backend_sources_ok(self):
+        """Test default backends with songs known to exist in respective databases.
+        """
+        errors = []
+        for s in self.DEFAULT_SOURCES:
+            res = s['backend'](self.plugin.config, self.plugin._log).fetch(
+                s['artist'], s['title'])
+            if not is_lyrics_content_ok(s['title'], res):
+                errors.append(s['backend'].__name__)
+        self.assertFalse(errors)
+
+    @unittest.skipUnless(os.environ.get(
+        'BEETS_TEST_LYRICS_SOURCES', '0') == '1',
+        'lyrics sources testing not enabled')
+    def test_google_sources_ok(self):
+        """Test if lyrics present on websites registered in beets google custom
+           search engine are correctly scraped.
+        """
+        for s in self.GOOGLE_SOURCES:
+            url = s['url'] + s['path']
+            res = lyrics.scrape_lyrics_from_html(
+                raw_backend.fetch_url(url))
+            self.assertTrue(google.is_lyrics(res), url)
+            self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
+
+
+class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
+    """Test scraping heuristics on a fake html page.
+    """
+
+    source = dict(url=u'http://www.example.com', artist=u'John Doe',
+                  title=u'Beets song', path=u'/lyrics/beetssong')
+
+    def setUp(self):
+        """Set up configuration"""
+        LyricsGoogleBaseTest.setUp(self)
+        self.plugin = lyrics.LyricsPlugin()
+
+    @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
    def test_mocked_source_ok(self):
        """Test that lyrics of the mocked page are correctly scraped"""
        url = self.source['url'] + self.source['path']
-        if os.path.isfile(url_to_filename(url)):
-            res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
-            self.assertTrue(google.is_lyrics(res), url)
-            self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
-                            url)
-
-    def test_google_sources_ok(self):
-        """Test if lyrics present on websites registered in beets google custom
-        search engine are correctly scraped."""
-        if not check_lyrics_fetched():
-            self.skipTest("Run lyrics_download_samples.py script first.")
-        for s in GOOGLE_SOURCES:
-            url = s['url'] + s['path']
-            if os.path.isfile(url_to_filename(url)):
-                res = lyrics.scrape_lyrics_from_html(
-                    raw_backend.fetch_url(url))
-                self.assertTrue(google.is_lyrics(res), url)
-                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
-
-    def test_default_ok(self):
-        """Test default engines with the default query"""
-        if not check_lyrics_fetched():
-            self.skipTest("Run lyrics_download_samples.py script first.")
-        for (source, s) in zip([lyrics.LyricsWiki,
-                                lyrics.LyricsCom,
-                                lyrics.MusiXmatch], DEFAULT_SOURCES):
-            url = s['url'] + s['path']
-            if os.path.isfile(url_to_filename(url)):
-                res = source({}, log).fetch(s['artist'], s['title'])
-                self.assertTrue(google.is_lyrics(res), url)
-                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
+        res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
+        self.assertTrue(google.is_lyrics(res), url)
+        self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
+                        url)

+    @patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
    def test_is_page_candidate_exact_match(self):
        """Test matching html page title with song infos -- when song infos are
-        present in the title."""
+           present in the title.
+        """
        from bs4 import SoupStrainer, BeautifulSoup
        s = self.source
        url = six.text_type(s['url'] + s['path'])
        html = raw_backend.fetch_url(url)
        soup = BeautifulSoup(html, "html.parser",
                             parse_only=SoupStrainer('title'))
-        self.assertEqual(google.is_page_candidate(url, soup.title.string,
-                                                  s['title'], s['artist']),
-                         True, url)
+        self.assertEqual(
+            google.is_page_candidate(url, soup.title.string,
+                                     s['title'], s['artist']), True, url)

    def test_is_page_candidate_fuzzy_match(self):
        """Test matching html page title with song infos -- when song infos are
-        not present in the title."""
+           not present in the title.
+        """
        s = self.source
        url = s['url'] + s['path']
        url_title = u'example.com | Beats song by John doe'

        # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
        self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
-                         s['artist']), True, url)
+                                                  s['artist']), True, url)
        # reject different title
        url_title = u'example.com | seets bong lyrics by John doe'
        self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
-                         s['artist']), False, url)
+                                                  s['artist']), False, url)

    def test_is_page_candidate_special_chars(self):
        """Ensure that `is_page_candidate` doesn't crash when the artist