beets/test/lyrics_sources.py

# This file is part of beets.
# Copyright 2014, Fabrice Laporte.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

"""Tests for the 'lyrics' plugin"""

import os
import logging
import _common
from _common import unittest
from beetsplug import lyrics
from beets import config
from beets.util import confit
from bs4 import BeautifulSoup

# The tests log through the shared 'beets' logger.
log = logging.getLogger('beets')

# Reference lyrics read from the 'lyricstext.yaml' resource file.
LYRICS_TEXTS = confit.load_yaml(
    os.path.join(_common.RSRC, 'lyricstext.yaml')
)

# Google API key taken from the 'lyrics' configuration section; None when
# the lookup raises confit.NotFoundError.
try:
    googlekey = config['lyrics']['google_API_key'].get(unicode)
except confit.NotFoundError:
    googlekey = None

# Artist/title pair used as the default query in the tests.
definfo = {'artist': u'The Beatles', 'title': u'Lady Madonna'}


class MockFetchUrl(object):
    """Callable that serves page content from local sample files.

    Calling an instance with a URL does not touch the network: the URL is
    mapped to ``<_common.RSRC>/lyrics/<site>/<page>.txt`` and the content
    of that file is returned. The last requested URL is kept in
    ``fetched``.
    """

    def __init__(self, pathval='fetched_path'):
        # Stored as given; not read anywhere else in this class.
        self.pathval = pathval
        # URL passed to the most recent call (None before the first call).
        self.fetched = None

    def __call__(self, url, filename=None):
        """Return the content of the sample file that matches ``url``.

        ``filename`` is accepted but not used. Any error raised by
        ``open()`` (e.g. when the sample file does not exist) propagates
        to the caller.
        """
        self.fetched = url
        stripped = url.replace('http://', '').replace('www.', '')
        # Only alphanumerics and '/' survive, so dots, dashes, '%' etc.
        # never end up in the file name.
        cleaned = "".join(c for c in stripped if c.isalnum() or c == '/')
        parts = cleaned.split('/')
        # First component names the site directory, last one the page;
        # intermediate path components are dropped.
        # Anchor on _common.RSRC (like the YAML fixture) rather than on a
        # bare 'rsrc' so the lookup does not depend on the current
        # working directory.
        sample = os.path.join(_common.RSRC, 'lyrics',
                              parts[0], parts[-1]) + '.txt'
        with open(sample, 'r') as f:
            return f.read()


def is_lyrics_content_ok(title, text):
    """Tell whether `text` is close enough to the reference lyrics.

    The reference is the LYRICS_TEXTS entry stored under the slugified
    `title`. Both texts are reduced to their sets of whitespace-separated
    words. With no word in common the result is False; otherwise the size
    of the larger set is divided by the number of shared words and the
    result is True when that ratio lies strictly between 0.5 and 2.
    Shared words can never outnumber either set, so the ratio is at
    least 1 and the deciding bound is the upper one: more than half of
    the larger set must be shared.
    """
    reference_words = set(LYRICS_TEXTS[lyrics.slugify(title)].split())
    candidate_words = set(text.split())
    shared = reference_words & candidate_words
    if not shared:
        return False
    larger = max(len(reference_words), len(candidate_words))
    ratio = 1.0 * larger / len(shared)
    return .5 < ratio < 2


class LyricsPluginTest(unittest.TestCase):
    """Checks on the lyrics engines and on the lyrics validity test."""

    def setUp(self):
        """Instantiate the lyrics plugin before each test."""
        lyrics.LyricsPlugin()

    def test_default_ok(self):
        """Every lyrics engine yields valid lyrics for the default query"""
        # Pages are served from local sample files instead of the network.
        # NOTE(review): the replacement of lyrics.fetch_url is never
        # undone, so it stays in effect for whatever runs afterwards --
        # confirm this is intended.
        lyrics.fetch_url = MockFetchUrl()

        artist, title = definfo['artist'], definfo['title']
        engines = (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom)
        for engine in engines:
            fetched = engine(artist, title)
            self.assertTrue(lyrics.is_lyrics(fetched))
            self.assertTrue(is_lyrics_content_ok(title, fetched))

    def test_missing_lyrics(self):
        """The 'missing_texts' reference entry is rejected by is_lyrics"""
        missing = LYRICS_TEXTS['missing_texts']
        self.assertFalse(lyrics.is_lyrics(missing))


class LyricsScrapingPluginTest(unittest.TestCase):
    """Scraping checks against the lyrics websites listed below."""

    # All sources registered in the default beets Google custom search
    # engine have to appear in this list.
    # Entries rely on the default query whenever the website has lyrics
    # for it; otherwise they override the artist and title fields.
    sourcesOk = [
        dict(definfo, url=u'http://www.smartlyrics.com',
             path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
        dict(definfo, url=u'http://www.elyricsworld.com',
             path=u'/lady_madonna_lyrics_beatles.html'),
        dict(artist=u'Beres Hammond', title=u'I could beat myself',
             url=u'http://www.reggaelyrics.info',
             path=u'/beres-hammond/i-could-beat-myself'),
        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             url=u'http://www.lyricsmania.com',
             path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             url=u'http://www.paroles.net/',
             path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
        dict(definfo, artist=u'Amy Winehouse', title=u"Jazz'n'blues",
             url=u'http://www.lyricsontop.com',
             path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
        dict(definfo, url=u'http://www.sweetslyrics.com',
             path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
        dict(definfo, url=u'http://www.lyrics007.com',
             path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
        dict(definfo, url=u'http://www.absolutelyrics.com',
             path=u'/lyrics/view/the_beatles/lady_madonna'),
        dict(definfo, url=u'http://www.azlyrics.com/',
             path=u'/lyrics/beatles/ladymadonna.html'),
        dict(definfo, url=u'http://www.chartlyrics.com',
             path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
        dict(definfo, url='http://www.releaselyrics.com',
             path=u'/e35f/the-beatles-lady-madonna'),
    ]

    # Websites that cannot be scraped yet: whatever is extracted from
    # them must be flagged as invalid lyrics.
    sourcesFail = [
        dict(definfo, url='http://www.songlyrics.com',
             path=u'/the-beatles/lady-madonna-lyrics'),
        dict(definfo, url='http://www.metrolyrics.com/',
             path='best-for-last-lyrics-adele.html')
    ]

    # Websites whose lyrics come back truncated because of scraping
    # issues, and which therefore should not be added as sources to the
    # Google CSE. They are good candidates for inclusion once the
    # scraping algorithm has been improved.
    sourcesIncomplete = [
        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             url=u'http://www.lacoccinelle.net',
             path=u'/paroles-officielles/550512.html'),
    ]

    def test_sources_ok(self):
        """Each working source gives valid lyrics matching the reference"""
        for src in self.sourcesOk:
            page_url = src['url'] + src['path']
            scraped = lyrics.scrape_lyrics_from_url(page_url)
            self.assertTrue(lyrics.is_lyrics(scraped), page_url)
            self.assertTrue(is_lyrics_content_ok(src['title'], scraped),
                            page_url)

    def test_sources_fail(self):
        """Text scraped from the failing sources is rejected as lyrics"""
        for src in self.sourcesFail:
            page_url = src['url'] + src['path']
            scraped = lyrics.scrape_lyrics_from_url(page_url)
            # These sources are very unlikely to pass unless the scraping
            # algorithm gets tweaked on purpose for them.
            self.assertFalse(lyrics.is_lyrics(scraped),
                             "%s => %s" % (page_url, scraped))

    def test_sources_incomplete(self):
        """Incomplete sources still give text that is_lyrics accepts"""
        for src in self.sourcesIncomplete:
            page_url = src['url'] + src['path']
            scraped = lyrics.scrape_lyrics_from_url(page_url)

            self.assertTrue(lyrics.is_lyrics(scraped))
            # Such a source may start matching the reference when its
            # html changes or after an improvement of the scraping
            # algorithm: log it so that it gets noticed.
            if is_lyrics_content_ok(src['title'], scraped):
                log.debug('Source %s actually return valid lyrics!'
                          % src['url'])

    def test_is_page_candidate(self):
        """Pages of working sources are candidates for their own query"""
        for src in self.sourcesOk:
            page_url = unicode(src['url'] + src['path'])
            soup = BeautifulSoup(lyrics.fetch_url(page_url))
            # Pages without a <title> cannot be checked and are skipped.
            if soup.title:
                candidate = lyrics.is_page_candidate(
                    page_url, soup.title.string, src['title'], src['artist'])
                self.assertEqual(candidate, True, page_url)


def suite():
    """Build a suite with the tests loaded from this module by name."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)


if __name__ == '__main__':
    # Executed as a script: run the tests gathered by suite().
    unittest.main(defaultTest='suite')