Move the lyrics page download script out of test_lyrics.py

By default (as run by CI tools), only the *fake* example.com page is present in
rsrc/lyrics, so tests that check the content of pages coming from *real* sources
are skipped.
Execute lyrics_download_samples.py to download pages from the *real* sources. Once
the *real* pages are present on disk, no tests are skipped.
Fabrice Laporte 2014-11-08 10:55:48 +01:00
parent f5e7bd5d05
commit 84c82cc44b
2 changed files with 49 additions and 29 deletions
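
The workflow this enables looks roughly as follows. This is a minimal sketch for
illustration only; it assumes lyrics_download_samples.py and test_lyrics.py are both
importable from the test directory, which is not spelled out in the diff below.

    import unittest

    import lyrics_download_samples
    import test_lyrics

    # Fetch one sample page per referenced source into rsrc/lyrics/.
    # Pages already present on disk are not downloaded again.
    lyrics_download_samples.main()

    # With the real pages on disk, check_lyrics_fetched() (added below) returns
    # True and the source-content tests run instead of being skipped.
    unittest.main(module=test_lyrics, exit=False)

On CI, where the download step is not performed, only the bundled example.com page
exists and the same tests call skipTest() instead.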

lyrics_download_samples.py

@@ -30,24 +30,25 @@ def mkdir_p(path):
 def safe_open_w(path):
-    ''' Open "path" for writing, creating any parent directories as needed.
-    '''
+    """Open "path" for writing, creating any parent directories as needed.
+    """
     mkdir_p(os.path.dirname(path))
     return open(path, 'w')
 
 
 def main(argv=None):
-    """download"""
+    """Download one lyrics sample page per referenced source.
+    """
     if argv is None:
         argv = sys.argv
-    for s in test_lyrics.SOURCES:
+    for s in test_lyrics.GOOGLE_SOURCES + test_lyrics.DEFAULT_SOURCES:
         url = s['url'] + s['path']
         fn = test_lyrics.url_to_filename(url)
         if not os.path.isfile(fn):
             html = requests.get(url).text
             with safe_open_w(fn) as f:
                 print 'Writing %s' % fn
                 f.write(html.encode('utf8'))
 
 
 if __name__ == "__main__":

test_lyrics.py

@@ -17,12 +17,10 @@
 import os
 import _common
-import sys
-import requests
 from _common import unittest
 from beetsplug import lyrics
 from beets.library import Item
 from beets.util import confit
 from nose.plugins.attrib import attr
 
 
 class LyricsPluginTest(unittest.TestCase):
@@ -168,10 +166,18 @@ def url_to_filename(url):
     url = url.replace('http://', '').replace('www.', '')
     fn = "".join(x for x in url if (x.isalnum() or x == '/'))
     fn = fn.split('/')
-    fn = os.path.join(_common.RSRC, 'lyrics', fn[0], fn[-1]) + '.txt'
+    fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt'
     return fn
 
 
+def check_lyrics_fetched():
+    """Return True if lyrics_download_samples.py has been run and lyrics
+    pages are present in the resources directory."""
+    lyrics_dirs = len([d for d in os.listdir(LYRICS_ROOT_DIR) if
+                       os.path.isdir(os.path.join(LYRICS_ROOT_DIR, d))])
+    # example.com is the only lyrics dir added to the repo
+    return lyrics_dirs > 1
+
+
 class MockFetchUrl(object):
     def __init__(self, pathval='fetched_path'):
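
For illustration, here is a hand-worked example of the path mapping above, using the
absolutelyrics entry from GOOGLE_SOURCES; <RSRC> stands for the test resources
directory that _common.RSRC points at.

    url = 'http://www.absolutelyrics.com/lyrics/view/the_beatles/lady_madonna'
    # After stripping 'http://' and 'www.' and dropping every character that is
    # neither alphanumeric nor '/', the URL becomes:
    #     'absolutelyricscom/lyrics/view/thebeatles/ladymadonna'
    # Only the first and last path components are kept:
    url_to_filename(url)  # -> <RSRC>/lyrics/absolutelyricscom/ladymadonna.txt

This is the file that lyrics_download_samples.py writes and that the tests later check
with os.path.isfile(); check_lyrics_fetched() only verifies that more than one per-site
directory exists under rsrc/lyrics.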
@@ -198,14 +204,22 @@ def is_lyrics_content_ok(title, text):
         return (ratio > .5 and ratio < 2.5)
     return False
 
+LYRICS_ROOT_DIR = os.path.join(_common.RSRC, 'lyrics')
 LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml'))
 DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
+
+DEFAULT_SOURCES = [
+    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
+         path=u'The_Beatles:Lady_Madonna'),
+    dict(DEFAULT_SONG, url='http://www.lyrics.com/',
+         path=u'lady-madonna-lyrics-the-beatles.html')
+]
+
 # Every source entered in the default beets google custom search engine
 # must be listed below.
 # Use the default query when possible, or override the artist and title
 # fields if a website doesn't have lyrics for the default query.
-SOURCES = [
+GOOGLE_SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.absolutelyrics.com',
          path=u'/lyrics/view/the_beatles/lady_madonna'),
@@ -224,9 +238,6 @@ SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.lyrics007.com',
          path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
-    dict(DEFAULT_SONG,
-         url='http://www.lyrics.com/',
-         path=u'lady-madonna-lyrics-the-beatles.html'),
     dict(DEFAULT_SONG,
          url='http://www.lyricsmania.com/',
          path='lady_madonna_lyrics_the_beatles.html'),
@@ -236,9 +247,6 @@ SOURCES = [
     dict(url=u'http://www.lyricsontop.com',
          artist=u'Amy Winehouse', title=u"Jazz'n'blues",
          path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://lyrics.wikia.com/',
-         path=u'The_Beatles:Lady_Madonna'),
     dict(DEFAULT_SONG,
          url='http://www.metrolyrics.com/',
          path='lady-madonna-lyrics-beatles.html'),
@@ -285,25 +293,34 @@ class LyricsGooglePluginTest(unittest.TestCase):
         lyrics.LyricsPlugin()
         lyrics.fetch_url = MockFetchUrl()
 
     @attr('slow')
-    def test_sources_ok(self):
-        for s in SOURCES:
+    def test_google_sources_ok(self):
+        """Test that lyrics pages from the websites registered in the beets
+        google custom search engine are correctly scraped."""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for s in GOOGLE_SOURCES:
             url = s['url'] + s['path']
-            download_source_sample(url)
-            res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
-            self.assertTrue(lyrics.is_lyrics(res), url)
-            self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
+            if os.path.isfile(url_to_filename(url)):
+                res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
+                self.assertTrue(lyrics.is_lyrics(res), url)
+                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
 
     @attr('slow')
     def test_default_ok(self):
-        """Test each lyrics engine with the default query"""
-        for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom):
-            res = f(DEFAULT_SONG['artist'], DEFAULT_SONG['title'])
-            self.assertTrue(lyrics.is_lyrics(res))
-            self.assertTrue(is_lyrics_content_ok(DEFAULT_SONG['title'], res))
+        """Test the default engines with the default query"""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom),
+                            DEFAULT_SOURCES):
+            if os.path.isfile(url_to_filename(
+                    s['url'] + s['path'])):
+                res = fun(s['artist'], s['title'])
+                self.assertTrue(lyrics.is_lyrics(res))
+                self.assertTrue(is_lyrics_content_ok(
+                    DEFAULT_SONG['title'], res))
 
     def test_is_page_candidate_exact_match(self):
         """Test matching html page title with song infos -- when song infos are
         present in the title."""
         from bs4 import SoupStrainer, BeautifulSoup
         s = self.source
         url = unicode(s['url'] + s['path'])
@@ -315,6 +332,8 @@ class LyricsGooglePluginTest(unittest.TestCase):
                          True, url)
 
     def test_is_page_candidate_fuzzy_match(self):
+        """Test matching html page title with song infos -- when song infos are
+        not present in the title."""
         s = self.source
         url = s['url'] + s['path']
         urlTitle = u'example.com | Beats song by John doe'