From 84c82cc44ba95c57a5cd3da03f07493344c1aaa4 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte <kraymer@gmail.com>
Date: Sat, 8 Nov 2014 10:55:48 +0100
Subject: [PATCH] Move script to download pages out of test_lyrics.py

By default (as run by CI tools), only the *fake* example.com page is present
in rsrc/lyrics, so tests that check the content of pages coming from *real*
sources are skipped.
Run lyrics_download_samples.py to download pages from *real* sources. Once
the *real* pages are present on disk, no tests are skipped.
---
 test/lyrics_download_samples.py | 11 +++---
 test/test_lyrics.py             | 67 +++++++++++++++++++++------------
 2 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/test/lyrics_download_samples.py b/test/lyrics_download_samples.py
index dfacd8a54..80e18fbfa 100644
--- a/test/lyrics_download_samples.py
+++ b/test/lyrics_download_samples.py
@@ -30,24 +30,25 @@ def mkdir_p(path):
 
 
 def safe_open_w(path):
-    ''' Open "path" for writing, creating any parent directories as needed.
-    '''
+    """Open "path" for writing, creating any parent directories as needed.
+    """
     mkdir_p(os.path.dirname(path))
     return open(path, 'w')
 
 
 def main(argv=None):
-    """download"""
-
+    """Download one lyrics sample page per referenced source.
+    """
     if argv is None:
         argv = sys.argv
 
-    for s in test_lyrics.SOURCES:
+    for s in test_lyrics.GOOGLE_SOURCES + test_lyrics.DEFAULT_SOURCES:
         url = s['url'] + s['path']
         fn = test_lyrics.url_to_filename(url)
         if not os.path.isfile(fn):
             html = requests.get(url).text
             with safe_open_w(fn) as f:
+                print 'Writing %s' % fn
                 f.write(html.encode('utf8'))
 
 if __name__ == "__main__":
diff --git a/test/test_lyrics.py b/test/test_lyrics.py
index 5ac53c723..1c3b6f6a1 100644
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@@ -17,12 +17,10 @@
 import os
 import _common
 import sys
-import requests
 from _common import unittest
 from beetsplug import lyrics
 from beets.library import Item
 from beets.util import confit
-from nose.plugins.attrib import attr
 
 
 class LyricsPluginTest(unittest.TestCase):
@@ -168,10 +166,18 @@ def url_to_filename(url):
     url = url.replace('http://', '').replace('www.', '')
     fn = "".join(x for x in url if (x.isalnum() or x == '/'))
     fn = fn.split('/')
-    fn = os.path.join(_common.RSRC, 'lyrics', fn[0], fn[-1]) + '.txt'
+    fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt'
     return fn
 
 
+def check_lyrics_fetched():
+    """Return True if lyrics_download_samples.py has been run and lyrics
+    pages are present in the resources directory."""
+    lyrics_dirs = len([d for d in os.listdir(LYRICS_ROOT_DIR) if
+                      os.path.isdir(os.path.join(LYRICS_ROOT_DIR, d))])
+    # example.com is the only lyrics dir added to repo
+    return lyrics_dirs > 1
+
 
 class MockFetchUrl(object):
     def __init__(self, pathval='fetched_path'):
@@ -198,14 +204,22 @@ def is_lyrics_content_ok(title, text):
         return (ratio > .5 and ratio < 2.5)
     return False
 
+LYRICS_ROOT_DIR = os.path.join(_common.RSRC, 'lyrics')
 LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml'))
 DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
 
+DEFAULT_SOURCES = [
+    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
+         path=u'The_Beatles:Lady_Madonna'),
+    dict(DEFAULT_SONG, url='http://www.lyrics.com/',
+         path=u'lady-madonna-lyrics-the-beatles.html')
+
+]
 # Every source entered in default beets google custom search engine
 # must be listed below.
 # Use default query when possible, or override artist and title fields
 # if website don't have lyrics for default query.
-SOURCES = [
+GOOGLE_SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.absolutelyrics.com',
          path=u'/lyrics/view/the_beatles/lady_madonna'),
@@ -224,9 +238,6 @@ SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.lyrics007.com',
          path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
-    dict(DEFAULT_SONG,
-         url='http://www.lyrics.com/',
-         path=u'lady-madonna-lyrics-the-beatles.html'),
     dict(DEFAULT_SONG,
          url='http://www.lyricsmania.com/',
          path='lady_madonna_lyrics_the_beatles.html'),
@@ -236,9 +247,6 @@ SOURCES = [
     dict(url=u'http://www.lyricsontop.com',
          artist=u'Amy Winehouse', title=u"Jazz'n'blues",
          path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://lyrics.wikia.com/',
-         path=u'The_Beatles:Lady_Madonna'),
     dict(DEFAULT_SONG,
          url='http://www.metrolyrics.com/',
          path='lady-madonna-lyrics-beatles.html'),
@@ -285,25 +293,34 @@ class LyricsGooglePluginTest(unittest.TestCase):
         lyrics.LyricsPlugin()
         lyrics.fetch_url = MockFetchUrl()
 
-    @attr('slow')
-    def test_sources_ok(self):
-        for s in SOURCES:
+    def test_google_sources_ok(self):
+        """Test if lyrics present on websites registered in beets google custom
+        search engine are correctly scraped."""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for s in GOOGLE_SOURCES:
             url = s['url'] + s['path']
-            download_source_sample(url)
-            res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
-            self.assertTrue(lyrics.is_lyrics(res), url)
-            self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
+            if os.path.isfile(url_to_filename(url)):
+                res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
+                self.assertTrue(lyrics.is_lyrics(res), url)
+                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
 
-    @attr('slow')
     def test_default_ok(self):
-        """Test each lyrics engine with the default query"""
-
-        for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom):
-            res = f(DEFAULT_SONG['artist'], DEFAULT_SONG['title'])
-            self.assertTrue(lyrics.is_lyrics(res))
-            self.assertTrue(is_lyrics_content_ok(DEFAULT_SONG['title'], res))
+        """Test default engines with the default query"""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom),
+                            DEFAULT_SOURCES):
+            if os.path.isfile(url_to_filename(
+                              s['url'] + s['path'])):
+                res = fun(s['artist'], s['title'])
+                self.assertTrue(lyrics.is_lyrics(res))
+                self.assertTrue(is_lyrics_content_ok(
+                                DEFAULT_SONG['title'], res))
 
     def test_is_page_candidate_exact_match(self):
+        """Test matching html page title with song infos -- when song infos are
+        present in the title."""
         from bs4 import SoupStrainer, BeautifulSoup
         s = self.source
         url = unicode(s['url'] + s['path'])
@@ -315,6 +332,8 @@ class LyricsGooglePluginTest(unittest.TestCase):
                          True, url)
 
     def test_is_page_candidate_fuzzy_match(self):
+        """Test matching html page title with song infos -- when song infos are
+        not present in the title."""
         s = self.source
         url = s['url'] + s['path']
         urlTitle = u'example.com | Beats song by John doe'