beets/test/test_lyrics.py

# This file is part of beets.
# Copyright 2014, Fabrice Laporte.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

"""Tests for the 'lyrics' plugin."""

import os
import _common
import sys
from _common import unittest
from beetsplug import lyrics
from beets.library import Item
from beets.util import confit


class LyricsPluginTest(unittest.TestCase):
    """Tests for the search-pair and text-cleaning helpers of the plugin."""

    def setUp(self):
        """Set up configuration by instantiating the plugin."""
        lyrics.LyricsPlugin()

    def _check_pairs(self, item, expected=(), unexpected=()):
        """Check which (artist, titles) pairs ``search_pairs`` gives for item.

        Every pair in ``expected`` must be among the results and every pair
        in ``unexpected`` must be absent. ``search_pairs`` is called anew
        for each assertion, so the checks hold even if it hands back a
        one-shot iterator.
        """
        for pair in expected:
            self.assertIn(pair, lyrics.search_pairs(item))
        for pair in unexpected:
            self.assertNotIn(pair, lyrics.search_pairs(item))

    def test_search_artist(self):
        """Artists joined by a known separator also yield the lead artist."""
        joined_artists = ('Alice ft. Bob', 'Alice feat Bob',
                          'Alice feat. Bob', 'Alice featuring Bob',
                          'Alice & Bob', 'Alice and Bob')
        for artist in joined_artists:
            item = Item(artist=artist, title='song')
            self._check_pairs(item, expected=[(artist, ['song']),
                                              ('Alice', ['song'])])

        # 'feats' is not a separator: the artist must be left unsplit.
        item = Item(artist='Alice feats Bob', title='song')
        self._check_pairs(item,
                          expected=[('Alice feats Bob', ['song'])],
                          unexpected=[('Alice', ['song'])])

    def test_search_pairs_multi_titles(self):
        """A slash in the title yields both the full and the split titles."""
        for title in ('1 / 2', '1/2'):
            item = Item(title=title, artist='A')
            self._check_pairs(item, expected=[('A', [title]),
                                              ('A', ['1', '2'])])

    def test_search_pairs_titles(self):
        """Parenthesised or 'featuring' suffixes yield the bare title too."""
        suffixed_titles = ('Song (live)', 'Song (live) (new)',
                           'Song (live (new))', 'Song ft. B',
                           'Song featuring B')
        for title in suffixed_titles:
            item = Item(title=title, artist='A')
            self._check_pairs(item, expected=[('A', ['Song']),
                                              ('A', [title])])

        # 'and' in a title is not a separator: no bare 'Song' candidate.
        item = Item(title='Song and B', artist='A')
        self._check_pairs(item,
                          expected=[('A', ['Song and B'])],
                          unexpected=[('A', ['Song'])])

    def test_remove_credits(self):
        """Credit lines are dropped; verses mentioning 'lyrics' are kept."""
        self.assertEqual(
            lyrics.remove_credits("""It's close to midnight
                                     Lyrics brought by example.com"""),
            "It's close to midnight"
        )
        self.assertEqual(
            lyrics.remove_credits("""Lyrics brought by example.com"""),
            ""
        )

        # don't remove 2nd verse for the only reason it contains 'lyrics' word
        text = """Look at all the shit that i done bought her
                  See lyrics ain't nothin
                  if the beat aint crackin"""
        self.assertEqual(lyrics.remove_credits(text), text)

    def test_is_lyrics(self):
        """Site boilerplate must not be taken for lyrics."""
        texts = ['LyricsMania.com - Copyright (c) 2013 - All Rights Reserved']
        texts += ["""All material found on this site is property\n
                     of mywickedsongtext brand"""]
        for t in texts:
            self.assertFalse(lyrics.is_lyrics(t))

    def test_slugify(self):
        """Check the slug produced for a URL with an accent and parens."""
        text = u"http://site.com/\xe7afe-au_lait(boisson)"
        self.assertEqual(lyrics.slugify(text), 'http://site.com/cafe_au_lait')

    def test_scrape_strip_cruft(self):
        """Comments, entities, <br> variants and other tags are cleaned."""
        # The backslash is escaped explicitly: a bare "\>" is an invalid
        # escape sequence, although it denotes the very same two characters.
        text = u"""<!--lyrics below-->
                  &nbsp;one
                  <br class='myclass'>
                  two  !
                  <br><br \\>
                  <blink>four</blink>"""
        self.assertEqual(lyrics._scrape_strip_cruft(text, True),
                         "one\ntwo !\n\nfour")

    def test_scrape_strip_scripts(self):
        """A <script> element is removed together with its content."""
        text = u"""foo<script>bar</script>baz"""
        self.assertEqual(lyrics._scrape_strip_cruft(text, True),
                         "foobaz")

    def test_scrape_strip_tag_in_comment(self):
        """A tag nested inside an HTML comment goes away with the comment."""
        text = u"""foo<!--<bar>-->qux"""
        self.assertEqual(lyrics._scrape_strip_cruft(text, True),
                         "fooqux")

    def test_scrape_merge_paragraphs(self):
        """Adjacent <p> blocks are merged into newline-separated lines."""
        text = u"one</p>   <p class='myclass'>two</p><p>three"
        self.assertEqual(lyrics._scrape_merge_paragraphs(text),
                         "one\ntwo\nthree")

    def test_missing_lyrics(self):
        """The 'missing_texts' sample must not be taken for lyrics."""
        self.assertFalse(lyrics.is_lyrics(LYRICS_TEXTS['missing_texts']))


def url_to_filename(url):
    """Map a lyrics page URL to the path of its locally saved copy.

    The ``http://`` and ``www.`` substrings are dropped and every character
    that is neither alphanumeric nor ``/`` is discarded. The first and last
    ``/``-separated components are then joined under ``LYRICS_ROOT_DIR``
    and ``.txt`` is appended.
    """
    stripped = url.replace('http://', '').replace('www.', '')
    kept = [char for char in stripped if char == '/' or char.isalnum()]
    parts = "".join(kept).split('/')
    return os.path.join(LYRICS_ROOT_DIR, parts[0], parts[-1]) + '.txt'


def check_lyrics_fetched():
    """Return True if lyrics_download_samples.py has been run and lyrics
    pages are present in the resources directory."""
    subdir_count = 0
    for entry in os.listdir(LYRICS_ROOT_DIR):
        if os.path.isdir(os.path.join(LYRICS_ROOT_DIR, entry)):
            subdir_count += 1
    # example.com is the only lyrics dir added to repo
    return subdir_count > 1


class MockFetchUrl(object):
    """Callable that serves page contents from local files.

    Calling an instance records the requested URL in ``fetched`` and
    returns the content of the file that ``url_to_filename`` maps the URL
    to. ``pathval`` is only stored on the instance.
    """

    def __init__(self, pathval='fetched_path'):
        self.fetched = None
        self.pathval = pathval

    def __call__(self, url, filename=None):
        """Remember ``url`` and return the saved page text for it.

        ``filename`` is accepted but not used.
        """
        self.fetched = url
        with open(url_to_filename(url), 'r') as page:
            return page.read()


def is_lyrics_content_ok(title, text):
    """Tell whether ``text`` holds the lyrics expected for ``title``.

    The expected entries are looked up in ``LYRICS_TEXTS`` under the
    slugified title; each of them must occur in the lower-cased text.
    """
    for expected in LYRICS_TEXTS[lyrics.slugify(title)]:
        if expected not in text.lower():
            return False
    return True

# Root directory of the locally saved lyrics pages (see url_to_filename).
LYRICS_ROOT_DIR = os.path.join(_common.RSRC, 'lyrics')
# Expected lyrics data loaded from lyricstext.yaml: looked up by slugified
# song title in is_lyrics_content_ok, plus a 'missing_texts' sample.
LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml'))
# Artist and title shared by every source entry that does not override them.
DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')

# Pages for the default engines. test_default_ok zips this list with
# (fetch_lyricswiki, fetch_lyricscom), so the order of the entries matters.
DEFAULT_SOURCES = [
    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
         path=u'The_Beatles:Lady_Madonna'),
    dict(DEFAULT_SONG, url='http://www.lyrics.com/',
         path=u'lady-madonna-lyrics-the-beatles.html')
]

# Every source registered in the default beets Google custom search engine
# must be listed below.
# Use the default query when possible, or override the artist and title
# fields if the website doesn't have lyrics for the default query.
# The tests build each page address as url + path.
GOOGLE_SOURCES = [
    dict(DEFAULT_SONG,
         url=u'http://www.absolutelyrics.com',
         path=u'/lyrics/view/the_beatles/lady_madonna'),
    dict(DEFAULT_SONG,
         url=u'http://www.azlyrics.com',
         path=u'/lyrics/beatles/ladymadonna.html'),
    dict(DEFAULT_SONG,
         url=u'http://www.chartlyrics.com',
         path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
    dict(DEFAULT_SONG,
         url=u'http://www.elyricsworld.com',
         path=u'/lady_madonna_lyrics_beatles.html'),
    dict(url=u'http://www.lacoccinelle.net',
         artist=u'Jacques Brel', title=u"Amsterdam",
         path=u'/paroles-officielles/275679.html'),
    dict(DEFAULT_SONG,
         url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
    dict(DEFAULT_SONG,
         url='http://www.lyricsmania.com/',
         path='lady_madonna_lyrics_the_beatles.html'),
    dict(DEFAULT_SONG,
         url=u'http://www.lyrics.net', path=u'/lyric/17547916'),
    dict(DEFAULT_SONG,
         url=u'http://www.lyricsmode.com',
         path=u'/lyrics/b/beatles/lady_madonna.html'),
    dict(url=u'http://www.lyricsontop.com',
         artist=u'Amy Winehouse', title=u"Jazz'n'blues",
         path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
    dict(DEFAULT_SONG,
         url='http://www.metrolyrics.com/',
         path='lady-madonna-lyrics-beatles.html'),
    dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
         artist=u'Santana', title=u'Black magic woman'),
    # NOTE(review): DEFAULT_SONG is passed here although both artist and
    # title are overridden, so it contributes nothing to this entry.
    dict(DEFAULT_SONG,
         url=u'http://www.onelyrics.net/',
         artist=u'Ben & Ellen Harper', title=u'City of dreams',
         path='ben-ellen-harper-city-of-dreams-lyrics'),
    dict(url=u'http://www.paroles.net/',
         artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
         path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
    dict(DEFAULT_SONG,
         url='http://www.releaselyrics.com',
         path=u'/346e/the-beatles-lady-madonna-(love-version)/'),
    dict(DEFAULT_SONG,
         url=u'http://www.smartlyrics.com',
         path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
    dict(DEFAULT_SONG,
         url='http://www.songlyrics.com',
         path=u'/the-beatles/lady-madonna-lyrics'),
    dict(DEFAULT_SONG,
         url=u'http://www.stlyrics.com',
         path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
    dict(DEFAULT_SONG,
         url=u'http://www.sweetslyrics.com',
         path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
]


class LyricsGooglePluginTest(unittest.TestCase):
    """Test scraping heuristics on a fake html page.
    Or run lyrics_download_samples.py first to check that beets google
    custom search engine sources are correctly scraped.
    """
    # Fake page used by the is_page_candidate tests.
    source = dict(url=u'http://www.example.com', artist=u'John Doe',
                  title=u'Beets song', path=u'/lyrics/beetssong')

    def setUp(self):
        """Set up configuration and serve pages from local files.

        The test is skipped when Beautiful Soup 4 is missing or when the
        interpreter is older than 2.7.3.
        """
        try:
            __import__('bs4')
        except ImportError:
            self.skipTest('Beautiful Soup 4 not available')
        if sys.version_info[:3] < (2, 7, 3):
            self.skipTest("Python's built-in HTML parser is not good enough")
        lyrics.LyricsPlugin()
        # Put the real fetcher back once the test is over, so that the
        # module-level patch does not leak into tests that run afterwards.
        self.addCleanup(setattr, lyrics, 'fetch_url', lyrics.fetch_url)
        lyrics.fetch_url = MockFetchUrl()

    def _skip_unless_samples_fetched(self):
        """Skip the running test when the sample pages were not downloaded."""
        if not check_lyrics_fetched():
            self.skipTest("Run lyrics_download_samples.py script first.")

    def test_google_sources_ok(self):
        """Test if lyrics present on websites registered in beets google custom
        search engine are correctly scraped."""
        self._skip_unless_samples_fetched()
        for s in GOOGLE_SOURCES:
            url = s['url'] + s['path']
            # Sources whose page was not saved locally are passed over.
            if os.path.isfile(url_to_filename(url)):
                res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
                self.assertTrue(lyrics.is_lyrics(res), url)
                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)

    def test_default_ok(self):
        """Test default engines with the default query"""
        self._skip_unless_samples_fetched()
        # Fetchers are paired with DEFAULT_SOURCES entries by position.
        fetchers = (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom)
        for fetch, s in zip(fetchers, DEFAULT_SOURCES):
            url = s['url'] + s['path']
            if os.path.isfile(url_to_filename(url)):
                res = fetch(s['artist'], s['title'])
                self.assertTrue(lyrics.is_lyrics(res), url)
                self.assertTrue(
                    is_lyrics_content_ok(DEFAULT_SONG['title'], res), url)

    def test_is_page_candidate_exact_match(self):
        """Test matching html page title with song infos -- when song infos are
        present in the title."""
        from bs4 import SoupStrainer, BeautifulSoup
        s = self.source
        # Both parts are unicode literals already, so plain concatenation is
        # enough; no Python-2-only unicode() conversion is needed.
        url = s['url'] + s['path']
        html = lyrics.fetch_url(url)
        # Only the <title> element is parsed.
        soup = BeautifulSoup(html, "html.parser",
                             parse_only=SoupStrainer('title'))
        self.assertEqual(lyrics.is_page_candidate(url, soup.title.string,
                                                  s['title'], s['artist']),
                         True, url)

    def test_is_page_candidate_fuzzy_match(self):
        """Test matching html page title with song infos -- when song infos are
        not present in the title."""
        s = self.source
        url = s['url'] + s['path']
        url_title = u'example.com | Beats song by John doe'

        # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
        self.assertEqual(lyrics.is_page_candidate(url, url_title, s['title'],
                         s['artist']), True, url)
        # reject different title
        url_title = u'example.com | seets bong lyrics by John doe'
        self.assertEqual(lyrics.is_page_candidate(url, url_title, s['title'],
                         s['artist']), False, url)


def suite():
    """Return a test suite with every test found in this module."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)

# When executed as a script, run the tests returned by suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')