Move the lyrics page download script out of test_lyrics.py

By default (as run by CI tools), only the *fake* example.com page is present in
rsrc/lyrics, so tests that check the content of pages coming from *real* sources
are skipped.
Execute lyrics_download_samples.py to download pages from the *real* sources. Once
the *real* pages are present on disk, no tests are skipped.
Fabrice Laporte 2014-11-08 10:55:48 +01:00
parent f5e7bd5d05
commit 84c82cc44b
2 changed files with 49 additions and 29 deletions
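
The workflow this enables looks roughly as follows. This is a minimal sketch for
illustration only; it assumes lyrics_download_samples.py and test_lyrics.py are both
importable from the test directory, which is not spelled out in the diff below.

    import unittest

    import lyrics_download_samples
    import test_lyrics

    # Fetch one sample page per referenced source into rsrc/lyrics/.
    # Pages already present on disk are not downloaded again.
    lyrics_download_samples.main()

    # With the real pages on disk, check_lyrics_fetched() (added below) returns
    # True and the source-content tests run instead of being skipped.
    unittest.main(module=test_lyrics, exit=False)

On CI, where the download step is not performed, only the bundled example.com page
exists and the same tests call skipTest() instead.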

lyrics_download_samples.py

@@ -30,24 +30,25 @@ def mkdir_p(path):
 def safe_open_w(path):
-    ''' Open "path" for writing, creating any parent directories as needed.
-    '''
+    """Open "path" for writing, creating any parent directories as needed.
+    """
     mkdir_p(os.path.dirname(path))
     return open(path, 'w')
 
 
 def main(argv=None):
-    """download"""
+    """Download one lyrics sample page per referenced source.
+    """
     if argv is None:
         argv = sys.argv
-    for s in test_lyrics.SOURCES:
+    for s in test_lyrics.GOOGLE_SOURCES + test_lyrics.DEFAULT_SOURCES:
         url = s['url'] + s['path']
         fn = test_lyrics.url_to_filename(url)
         if not os.path.isfile(fn):
             html = requests.get(url).text
             with safe_open_w(fn) as f:
                 print 'Writing %s' % fn
                 f.write(html.encode('utf8'))
 
 
 if __name__ == "__main__":

test_lyrics.py

@@ -17,12 +17,10 @@
 import os
 import _common
-import sys
-import requests
 from _common import unittest
 from beetsplug import lyrics
 from beets.library import Item
 from beets.util import confit
 from nose.plugins.attrib import attr
 
 
 class LyricsPluginTest(unittest.TestCase):
@@ -168,10 +166,18 @@ def url_to_filename(url):
     url = url.replace('http://', '').replace('www.', '')
     fn = "".join(x for x in url if (x.isalnum() or x == '/'))
     fn = fn.split('/')
-    fn = os.path.join(_common.RSRC, 'lyrics', fn[0], fn[-1]) + '.txt'
+    fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt'
     return fn
 
 
+def check_lyrics_fetched():
+    """Return True if lyrics_download_samples.py has been run and lyrics
+    pages are present in the resources directory."""
+    lyrics_dirs = len([d for d in os.listdir(LYRICS_ROOT_DIR) if
+                       os.path.isdir(os.path.join(LYRICS_ROOT_DIR, d))])
+    # example.com is the only lyrics dir added to the repo
+    return lyrics_dirs > 1
+
+
 class MockFetchUrl(object):
     def __init__(self, pathval='fetched_path'):
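
For illustration, here is a hand-worked example of the path mapping above, using the
absolutelyrics entry from GOOGLE_SOURCES; <RSRC> stands for the test resources
directory that _common.RSRC points at.

    url = 'http://www.absolutelyrics.com/lyrics/view/the_beatles/lady_madonna'
    # After stripping 'http://' and 'www.' and dropping every character that is
    # neither alphanumeric nor '/', the URL becomes:
    #     'absolutelyricscom/lyrics/view/thebeatles/ladymadonna'
    # Only the first and last path components are kept:
    url_to_filename(url)  # -> <RSRC>/lyrics/absolutelyricscom/ladymadonna.txt

This is the file that lyrics_download_samples.py writes and that the tests later check
with os.path.isfile(); check_lyrics_fetched() only verifies that more than one per-site
directory exists under rsrc/lyrics.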
@@ -198,14 +204,22 @@ def is_lyrics_content_ok(title, text):
         return (ratio > .5 and ratio < 2.5)
     return False
 
+LYRICS_ROOT_DIR = os.path.join(_common.RSRC, 'lyrics')
 LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml'))
 DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
+
+DEFAULT_SOURCES = [
+    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
+         path=u'The_Beatles:Lady_Madonna'),
+    dict(DEFAULT_SONG, url='http://www.lyrics.com/',
+         path=u'lady-madonna-lyrics-the-beatles.html')
+]
+
 # Every source entered in the default beets google custom search engine
 # must be listed below.
 # Use the default query when possible, or override the artist and title
 # fields if a website doesn't have lyrics for the default query.
-SOURCES = [
+GOOGLE_SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.absolutelyrics.com',
          path=u'/lyrics/view/the_beatles/lady_madonna'),
@@ -224,9 +238,6 @@ SOURCES = [
     dict(DEFAULT_SONG,
          url=u'http://www.lyrics007.com',
          path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
-    dict(DEFAULT_SONG,
-         url='http://www.lyrics.com/',
-         path=u'lady-madonna-lyrics-the-beatles.html'),
     dict(DEFAULT_SONG,
          url='http://www.lyricsmania.com/',
          path='lady_madonna_lyrics_the_beatles.html'),
@@ -236,9 +247,6 @@ SOURCES = [
     dict(url=u'http://www.lyricsontop.com',
          artist=u'Amy Winehouse', title=u"Jazz'n'blues",
          path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
-    dict(DEFAULT_SONG,
-         url=u'http://lyrics.wikia.com/',
-         path=u'The_Beatles:Lady_Madonna'),
     dict(DEFAULT_SONG,
          url='http://www.metrolyrics.com/',
          path='lady-madonna-lyrics-beatles.html'),
@@ -285,25 +293,34 @@ class LyricsGooglePluginTest(unittest.TestCase):
         lyrics.LyricsPlugin()
         lyrics.fetch_url = MockFetchUrl()
 
     @attr('slow')
-    def test_sources_ok(self):
-        for s in SOURCES:
+    def test_google_sources_ok(self):
+        """Test that lyrics pages from the websites registered in the beets
+        google custom search engine are correctly scraped."""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for s in GOOGLE_SOURCES:
             url = s['url'] + s['path']
-            download_source_sample(url)
-            res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
-            self.assertTrue(lyrics.is_lyrics(res), url)
-            self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
+            if os.path.isfile(url_to_filename(url)):
+                res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
+                self.assertTrue(lyrics.is_lyrics(res), url)
+                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
 
     @attr('slow')
     def test_default_ok(self):
-        """Test each lyrics engine with the default query"""
-        for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom):
-            res = f(DEFAULT_SONG['artist'], DEFAULT_SONG['title'])
-            self.assertTrue(lyrics.is_lyrics(res))
-            self.assertTrue(is_lyrics_content_ok(DEFAULT_SONG['title'], res))
+        """Test the default engines with the default query"""
+        if not check_lyrics_fetched():
+            self.skipTest("Run lyrics_download_samples.py script first.")
+        for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom),
+                            DEFAULT_SOURCES):
+            if os.path.isfile(url_to_filename(
+                    s['url'] + s['path'])):
+                res = fun(s['artist'], s['title'])
+                self.assertTrue(lyrics.is_lyrics(res))
+                self.assertTrue(is_lyrics_content_ok(
+                    DEFAULT_SONG['title'], res))
 
     def test_is_page_candidate_exact_match(self):
         """Test matching html page title with song infos -- when song infos are
         present in the title."""
         from bs4 import SoupStrainer, BeautifulSoup
         s = self.source
         url = unicode(s['url'] + s['path'])
@@ -315,6 +332,8 @@ class LyricsGooglePluginTest(unittest.TestCase):
                          True, url)
 
     def test_is_page_candidate_fuzzy_match(self):
+        """Test matching html page title with song infos -- when song infos are
+        not present in the title."""
         s = self.source
         url = s['url'] + s['path']
         urlTitle = u'example.com | Beats song by John doe'