mirror of
https://github.com/beetbox/beets.git
synced 2025-12-10 02:22:25 +01:00
Merge pull request #2538 from Kraymer/lyrics-test
Tests to track whether lyrics websites are correctly fetched
This commit is contained in:
commit
fc6b65d592
3 changed files with 211 additions and 193 deletions
|
|
@ -21,6 +21,7 @@ from __future__ import absolute_import, division, print_function
|
|||
import difflib
|
||||
import itertools
|
||||
import json
|
||||
import struct
|
||||
import re
|
||||
import requests
|
||||
import unicodedata
|
||||
|
|
@ -53,7 +54,6 @@ from beets import plugins
|
|||
from beets import ui
|
||||
import beets
|
||||
|
||||
|
||||
DIV_RE = re.compile(r'<(/?)div>?', re.I)
|
||||
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
|
||||
TAG_RE = re.compile(r'<[^>]*>')
|
||||
|
|
@ -77,6 +77,12 @@ USER_AGENT = 'beets/{}'.format(beets.__version__)
|
|||
|
||||
# Utilities.
|
||||
|
||||
def unichar(i):
    """Return the unicode character whose code point is ``i``.

    ``six.unichr`` is tried first. When it raises ``ValueError``, the
    integer is packed as a native C ``int`` and the resulting bytes are
    decoded as UTF-32 instead.
    """
    try:
        char = six.unichr(i)
    except ValueError:
        # NOTE(review): presumably reached on narrow Python 2 builds for
        # code points above 0xFFFF -- confirm.
        char = struct.pack('i', i).decode('utf-32')
    return char
|
||||
|
||||
|
||||
def unescape(text):
|
||||
"""Resolve &#xxx; HTML entities (and some others)."""
|
||||
|
|
@ -86,7 +92,7 @@ def unescape(text):
|
|||
|
||||
def replchar(m):
|
||||
num = m.group(1)
|
||||
return six.unichr(int(num))
|
||||
return unichar(int(num))
|
||||
out = re.sub(u"&#(\d+);", replchar, out)
|
||||
return out
|
||||
|
||||
|
|
@ -104,7 +110,6 @@ def extract_text_in(html, starttag):
|
|||
"""Extract the text from a <DIV> tag in the HTML starting with
|
||||
``starttag``. Returns None if parsing fails.
|
||||
"""
|
||||
|
||||
# Strip off the leading text before opening tag.
|
||||
try:
|
||||
_, html = html.split(starttag, 1)
|
||||
|
|
@ -145,10 +150,10 @@ def search_pairs(item):
|
|||
and featured artists from the strings and add them as candidates.
|
||||
The method also tries to split multiple titles separated with `/`.
|
||||
"""
|
||||
|
||||
def generate_alternatives(string, patterns):
|
||||
"""Generate string alternatives by extracting first matching group for
|
||||
each given pattern."""
|
||||
each given pattern.
|
||||
"""
|
||||
alternatives = [string]
|
||||
for pattern in patterns:
|
||||
match = re.search(pattern, string, re.IGNORECASE)
|
||||
|
|
@ -254,16 +259,18 @@ class MusiXmatch(SymbolsReplaced):
|
|||
|
||||
def fetch(self, artist, title):
|
||||
url = self.build_url(artist, title)
|
||||
|
||||
html = self.fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_between(html,
|
||||
'"body":', '"language":')
|
||||
html_part = html.split('<p class="mxm-lyrics__content')[-1]
|
||||
lyrics = extract_text_between(html_part, '>', '</p>')
|
||||
return lyrics.strip(',"').replace('\\n', '\n')
|
||||
|
||||
|
||||
class Genius(Backend):
|
||||
"""Fetch lyrics from Genius via genius-api."""
|
||||
|
||||
def __init__(self, config, log):
|
||||
super(Genius, self).__init__(config, log)
|
||||
self.api_key = config['genius_api_key'].as_str()
|
||||
|
|
@ -355,6 +362,7 @@ class Genius(Backend):
|
|||
|
||||
class LyricsWiki(SymbolsReplaced):
|
||||
"""Fetch lyrics from LyricsWiki."""
|
||||
|
||||
URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
|
||||
|
||||
def fetch(self, artist, title):
|
||||
|
|
@ -375,6 +383,7 @@ class LyricsWiki(SymbolsReplaced):
|
|||
|
||||
class LyricsCom(Backend):
|
||||
"""Fetch lyrics from Lyrics.com."""
|
||||
|
||||
URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
|
||||
NOT_FOUND = (
|
||||
'Sorry, we do not have the lyric',
|
||||
|
|
@ -478,6 +487,7 @@ def scrape_lyrics_from_html(html):
|
|||
|
||||
class Google(Backend):
|
||||
"""Fetch lyrics from Google search results."""
|
||||
|
||||
def __init__(self, config, log):
|
||||
super(Google, self).__init__(config, log)
|
||||
self.api_key = config['google_API_key'].as_str()
|
||||
|
|
@ -713,7 +723,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
|
||||
def fetch_item_lyrics(self, lib, item, write, force):
|
||||
"""Fetch and store lyrics for a single item. If ``write``, then the
|
||||
lyrics will also be written to the file itself."""
|
||||
lyrics will also be written to the file itself.
|
||||
"""
|
||||
# Skip if the item already has lyrics.
|
||||
if not force and item.lyrics:
|
||||
self._log.info(u'lyrics already present: {0}', item)
|
||||
|
|
|
|||
|
|
@ -1,45 +1,56 @@
|
|||
Beets_song:
|
||||
- geeks
|
||||
- bouquet
|
||||
- panacea
|
||||
# Song used by LyricsGooglePluginMachineryTest
|
||||
|
||||
Amsterdam:
|
||||
- oriflammes
|
||||
- fortune
|
||||
- batave
|
||||
- pissent
|
||||
|
||||
Lady_Madonna:
|
||||
- heaven
|
||||
- tuesday
|
||||
- thursday
|
||||
|
||||
Jazz_n_blues:
|
||||
- parkway
|
||||
- balance
|
||||
- impatient
|
||||
- shoes
|
||||
|
||||
Hey_it_s_ok:
|
||||
- swear
|
||||
- forgive
|
||||
- drink
|
||||
- found
|
||||
|
||||
City_of_dreams:
|
||||
- groves
|
||||
- landmarks
|
||||
- twilight
|
||||
- freeways
|
||||
|
||||
Black_magic_woman:
|
||||
- devil
|
||||
- magic
|
||||
- spell
|
||||
- heart
|
||||
Beets_song: |
|
||||
beets is the media library management system for obsessive-compulsive music geeks the purpose of
|
||||
beets is to get your music collection right once and for all it catalogs your collection
|
||||
automatically improving its metadata as it goes it then provides a bouquet of tools for
|
||||
manipulating and accessing your music here's an example of beets' brainy tag corrector doing its
|
||||
because beets is designed as a library it can do almost anything you can imagine for your
|
||||
music collection via plugins beets becomes a panacea
|
||||
|
||||
missing_texts: |
|
||||
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
|
||||
as they'll be released by $ARTIST, check back soon!
|
||||
In case you have the lyrics to $TITLE and want to send them to us, fill out
|
||||
the following form.
|
||||
|
||||
# Song lyrics used to test the different sources present in the Google custom search engine.
|
||||
# Text is randomized to avoid copyright infringement.
|
||||
|
||||
Amsterdam: |
|
||||
coup corps coeur invitent mains comme trop morue le hantent mais la dames joli revenir aux
|
||||
mangent croquer pleine plantent rire de sortent pleins fortune d'amsterdam bruit ruisselants
|
||||
large poissons braguette leur putains blanches jusque pissent dans soleils dansent et port
|
||||
bien vertu nez sur chaleur femmes rotant dorment marins boivent bu les que d'un qui je
|
||||
une cou hambourg plus ils dents ou tournent or berges d'ailleurs tout ciel haubans ce son lueurs
|
||||
en lune ont mouchent leurs long frottant jusqu'en vous regard montrent langueurs chantent
|
||||
tordent pleure donnent drames mornes des panse pour un sent encore referment nappes au meurent
|
||||
geste quand puis alors frites grosses batave expire naissent reboivent oriflammes grave riant a
|
||||
enfin rance fier y bouffer s'entendre se mieux
|
||||
|
||||
Lady_Madonna: |
|
||||
feed his money tuesday manage didn't head feet see arrives at in madonna rest morning children
|
||||
wonder how make thursday your to sunday music papers come tie you has was is listen suitcase
|
||||
ends friday run that needed breast they child baby mending on lady learned a nun like did wednesday
|
||||
bed think without afternoon night meet the playing lying
|
||||
|
||||
Jazz_n_blues: |
|
||||
all shoes money through follow blow til father to his hit jazz kiss now cool bar cause 50 night
|
||||
heading i'll says yeah cash forgot blues out what for ways away fingers waiting got ever bold
|
||||
screen sixty throw wait on about last compton days o pick love wall had within jeans jd next
|
||||
miss standing from it's two long fight extravagant tell today more buy shopping that didn't
|
||||
what's but russian up can parkway balance my and gone am it as at in check if bags when cross
|
||||
machine take you drinks coke june wrong coming fancy's i n' impatient so the main's spend
|
||||
that's
|
||||
|
||||
Hey_it_s_ok: |
|
||||
and forget be when please it against fighting mama cause ! again what said
|
||||
things papa hey to much lovers way wet was too do drink and i who forgive
|
||||
hey fourteen please know not wanted had myself ok friends bed times looked
|
||||
swear act found the my mean
|
||||
|
||||
Black_magic_woman: |
|
||||
blind heart sticks just don't into back alone see need yes your out devil make that to black got
|
||||
you might me woman turning spell stop baby with 'round a on stone messin' magic i of
|
||||
tricks up leave turn bad so pick she's my can't
|
||||
|
||||
|
|
|
|||
|
|
@ -15,21 +15,25 @@
|
|||
|
||||
"""Tests for the 'lyrics' plugin."""
|
||||
|
||||
from __future__ import division, absolute_import, print_function
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import six
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from mock import patch
|
||||
from test import _common
|
||||
from mock import MagicMock
|
||||
|
||||
from beets import logging
|
||||
from beets.library import Item
|
||||
from beets.util import bytestring_path, confit
|
||||
|
||||
from beetsplug import lyrics
|
||||
from beets.library import Item
|
||||
from beets.util import confit, bytestring_path
|
||||
from beets import logging
|
||||
import six
|
||||
|
||||
from mock import MagicMock
|
||||
|
||||
|
||||
log = logging.getLogger('beets.test_lyrics')
|
||||
raw_backend = lyrics.Backend({}, log)
|
||||
|
|
@ -37,8 +41,9 @@ google = lyrics.Google(MagicMock(), log)
|
|||
|
||||
|
||||
class LyricsPluginTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
"""Set up configuration."""
|
||||
lyrics.LyricsPlugin()
|
||||
|
||||
def test_search_artist(self):
|
||||
|
|
@ -194,16 +199,8 @@ def url_to_filename(url):
|
|||
return fn
|
||||
|
||||
|
||||
def check_lyrics_fetched():
    """Return True if lyrics_download_samples.py has been run and lyrics
    pages are present in the resources directory.
    """
    subdirs = [
        entry for entry in os.listdir(LYRICS_ROOT_DIR)
        if os.path.isdir(os.path.join(LYRICS_ROOT_DIR, entry))
    ]
    # example.com is the only lyrics dir added to repo
    return len(subdirs) > 1
|
||||
|
||||
|
||||
class MockFetchUrl(object):
|
||||
|
||||
def __init__(self, pathval='fetched_path'):
|
||||
self.pathval = pathval
|
||||
self.fetched = None
|
||||
|
|
@ -217,174 +214,173 @@ class MockFetchUrl(object):
|
|||
|
||||
|
||||
def is_lyrics_content_ok(title, text):
|
||||
"""Compare lyrics text to expected lyrics for given title"""
|
||||
|
||||
keywords = LYRICS_TEXTS[google.slugify(title)]
|
||||
return all(x in text.lower() for x in keywords)
|
||||
"""Compare lyrics text to expected lyrics for given title."""
|
||||
if not text:
|
||||
return
|
||||
keywords = set(LYRICS_TEXTS[google.slugify(title)].split())
|
||||
words = set(x.strip(".?, ") for x in text.lower().split())
|
||||
return keywords <= words
|
||||
|
||||
LYRICS_ROOT_DIR = os.path.join(_common.RSRC, b'lyrics')
|
||||
LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, b'lyricstext.yaml'))
|
||||
DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')
|
||||
|
||||
DEFAULT_SOURCES = [
|
||||
dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
|
||||
path=u'The_Beatles:Lady_Madonna'),
|
||||
dict(artist=u'Santana', title=u'Black magic woman',
|
||||
url='http://www.lyrics.com/',
|
||||
path=u'black-magic-woman-lyrics-santana.html'),
|
||||
dict(DEFAULT_SONG, url='https://www.musixmatch.com/',
|
||||
path=u'lyrics/The-Beatles/Lady-Madonna'),
|
||||
]
|
||||
|
||||
# Every source entered in default beets google custom search engine
|
||||
# must be listed below.
|
||||
# Use default query when possible, or override artist and title fields
|
||||
# if the website doesn't have lyrics for the default query.
|
||||
GOOGLE_SOURCES = [
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.absolutelyrics.com',
|
||||
path=u'/lyrics/view/the_beatles/lady_madonna'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.azlyrics.com',
|
||||
path=u'/lyrics/beatles/ladymadonna.html'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.chartlyrics.com',
|
||||
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.elyricsworld.com',
|
||||
path=u'/lady_madonna_lyrics_beatles.html'),
|
||||
dict(url=u'http://www.lacoccinelle.net',
|
||||
artist=u'Jacques Brel', title=u"Amsterdam",
|
||||
path=u'/paroles-officielles/275679.html'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://letras.mus.br/', path=u'the-beatles/275/'),
|
||||
dict(DEFAULT_SONG,
|
||||
url='http://www.lyricsmania.com/',
|
||||
path='lady_madonna_lyrics_the_beatles.html'),
|
||||
dict(artist=u'Santana', title=u'Black magic woman',
|
||||
url='http://www.lyrics.com/',
|
||||
path=u'black-magic-woman-lyrics-santana.html'),
|
||||
dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
|
||||
path=u'The_Beatles:Lady_Madonna'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.lyrics.net', path=u'/lyric/19110224'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.lyricsmode.com',
|
||||
path=u'/lyrics/b/beatles/lady_madonna.html'),
|
||||
dict(url=u'http://www.lyricsontop.com',
|
||||
artist=u'Amy Winehouse', title=u"Jazz'n'blues",
|
||||
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
|
||||
dict(DEFAULT_SONG,
|
||||
url='http://www.metrolyrics.com/',
|
||||
path='lady-madonna-lyrics-beatles.html'),
|
||||
dict(url='http://www.musica.com/', path='letras.asp?letra=2738',
|
||||
artist=u'Santana', title=u'Black magic woman'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.onelyrics.net/',
|
||||
artist=u'Ben & Ellen Harper', title=u'City of dreams',
|
||||
path='ben-ellen-harper-city-of-dreams-lyrics'),
|
||||
dict(url=u'http://www.paroles.net/',
|
||||
artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
|
||||
path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
|
||||
dict(DEFAULT_SONG,
|
||||
url='http://www.releaselyrics.com',
|
||||
path=u'/346e/the-beatles-lady-madonna-(love-version)/'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.smartlyrics.com',
|
||||
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
|
||||
dict(DEFAULT_SONG,
|
||||
url='http://www.songlyrics.com',
|
||||
path=u'/the-beatles/lady-madonna-lyrics'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.stlyrics.com',
|
||||
path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
|
||||
dict(DEFAULT_SONG,
|
||||
url=u'http://www.sweetslyrics.com',
|
||||
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
|
||||
]
|
||||
|
||||
|
||||
class LyricsGooglePluginTest(unittest.TestCase):
|
||||
"""Test scraping heuristics on a fake html page.
|
||||
Or run lyrics_download_samples.py first to check that beets google
|
||||
custom search engine sources are correctly scraped.
|
||||
"""
|
||||
source = dict(url=u'http://www.example.com', artist=u'John Doe',
|
||||
title=u'Beets song', path=u'/lyrics/beetssong')
|
||||
class LyricsGoogleBaseTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
"""Set up configuration."""
|
||||
try:
|
||||
__import__('bs4')
|
||||
except ImportError:
|
||||
self.skipTest('Beautiful Soup 4 not available')
|
||||
if sys.version_info[:3] < (2, 7, 3):
|
||||
self.skipTest("Python's built-in HTML parser is not good enough")
|
||||
lyrics.LyricsPlugin()
|
||||
raw_backend.fetch_url = MockFetchUrl()
|
||||
|
||||
|
||||
class LyricsPluginSourcesTest(LyricsGoogleBaseTest):
    """Check that the lyrics backends and the websites registered in the
    beets google custom search engine return the expected lyrics.

    Both tests are skipped unless the ``BEETS_TEST_LYRICS_SOURCES``
    environment variable is set to ``'1'``.
    """

    # Query used for every source that is not given its own artist/title.
    DEFAULT_SONG = dict(artist=u'The Beatles', title=u'Lady Madonna')

    # One entry per backend class: the song to request from that backend.
    DEFAULT_SOURCES = [
        dict(DEFAULT_SONG, backend=lyrics.LyricsWiki),
        dict(DEFAULT_SONG, backend=lyrics.LyricsCom),
        dict(artist=u'Santana', title=u'Black magic woman',
             backend=lyrics.MusiXmatch),
        dict(DEFAULT_SONG, backend=lyrics.Genius),
    ]

    # One entry per website: ``url`` + ``path`` is the page to scrape and
    # ``title`` selects the expected lyrics text.
    GOOGLE_SOURCES = [
        dict(DEFAULT_SONG,
             url=u'http://www.absolutelyrics.com',
             path=u'/lyrics/view/the_beatles/lady_madonna'),
        dict(DEFAULT_SONG,
             url=u'http://www.azlyrics.com',
             path=u'/lyrics/beatles/ladymadonna.html'),
        dict(DEFAULT_SONG,
             url=u'http://www.chartlyrics.com',
             path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
        dict(DEFAULT_SONG,
             url=u'http://www.elyricsworld.com',
             path=u'/lady_madonna_lyrics_beatles.html'),
        dict(url=u'http://www.lacoccinelle.net',
             artist=u'Jacques Brel', title=u"Amsterdam",
             path=u'/paroles-officielles/275679.html'),
        dict(DEFAULT_SONG,
             url=u'http://letras.mus.br/',
             path=u'the-beatles/275/'),
        dict(DEFAULT_SONG,
             url='http://www.lyricsmania.com/',
             path='lady_madonna_lyrics_the_beatles.html'),
        dict(DEFAULT_SONG,
             url=u'http://lyrics.wikia.com/',
             path=u'The_Beatles:Lady_Madonna'),
        dict(DEFAULT_SONG,
             url=u'http://www.lyricsmode.com',
             path=u'/lyrics/b/beatles/lady_madonna.html'),
        dict(url=u'http://www.lyricsontop.com',
             artist=u'Amy Winehouse', title=u"Jazz'n'blues",
             path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
        dict(DEFAULT_SONG,
             url='http://www.metrolyrics.com/',
             path='lady-madonna-lyrics-beatles.html'),
        dict(url='http://www.musica.com/',
             path='letras.asp?letra=2738',
             artist=u'Santana', title=u'Black magic woman'),
        dict(url=u'http://www.paroles.net/',
             artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
        dict(DEFAULT_SONG,
             url='http://www.songlyrics.com',
             path=u'/the-beatles/lady-madonna-lyrics'),
        dict(DEFAULT_SONG,
             url=u'http://www.sweetslyrics.com',
             path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html')
    ]

    def setUp(self):
        """Run the base class set-up, then instantiate the plugin."""
        super(LyricsPluginSourcesTest, self).setUp()
        self.plugin = lyrics.LyricsPlugin()

    @unittest.skipUnless(
        os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1',
        'lyrics sources testing not enabled')
    def test_backend_sources_ok(self):
        """Test default backends with songs known to exist in their
        respective databases.

        Every backend is tried before asserting, so that the failure
        message lists the names of all backends that returned
        unexpected lyrics.
        """
        failing = []
        for source in self.DEFAULT_SOURCES:
            backend_cls = source['backend']
            backend = backend_cls(self.plugin.config, self.plugin._log)
            fetched = backend.fetch(source['artist'], source['title'])
            if is_lyrics_content_ok(source['title'], fetched):
                continue
            failing.append(backend_cls.__name__)
        self.assertFalse(failing)

    @unittest.skipUnless(
        os.environ.get('BEETS_TEST_LYRICS_SOURCES', '0') == '1',
        'lyrics sources testing not enabled')
    def test_google_sources_ok(self):
        """Test that lyrics present on the websites registered in the
        beets google custom search engine are correctly scraped.
        """
        for source in self.GOOGLE_SOURCES:
            url = source['url'] + source['path']
            page = raw_backend.fetch_url(url)
            scraped = lyrics.scrape_lyrics_from_html(page)
            # The url is passed as the assertion message to identify
            # the failing website.
            self.assertTrue(google.is_lyrics(scraped), url)
            self.assertTrue(
                is_lyrics_content_ok(source['title'], scraped), url)
|
||||
|
||||
|
||||
class LyricsGooglePluginMachineryTest(LyricsGoogleBaseTest):
|
||||
"""Test scraping heuristics on a fake html page.
|
||||
"""
|
||||
|
||||
source = dict(url=u'http://www.example.com', artist=u'John Doe',
|
||||
title=u'Beets song', path=u'/lyrics/beetssong')
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
LyricsGoogleBaseTest.setUp(self)
|
||||
self.plugin = lyrics.LyricsPlugin()
|
||||
|
||||
@patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
|
||||
def test_mocked_source_ok(self):
|
||||
"""Test that lyrics of the mocked page are correctly scraped"""
|
||||
url = self.source['url'] + self.source['path']
|
||||
if os.path.isfile(url_to_filename(url)):
|
||||
res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
|
||||
self.assertTrue(google.is_lyrics(res), url)
|
||||
self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
|
||||
url)
|
||||
|
||||
def test_google_sources_ok(self):
|
||||
"""Test if lyrics present on websites registered in beets google custom
|
||||
search engine are correctly scraped."""
|
||||
if not check_lyrics_fetched():
|
||||
self.skipTest("Run lyrics_download_samples.py script first.")
|
||||
for s in GOOGLE_SOURCES:
|
||||
url = s['url'] + s['path']
|
||||
if os.path.isfile(url_to_filename(url)):
|
||||
res = lyrics.scrape_lyrics_from_html(
|
||||
raw_backend.fetch_url(url))
|
||||
self.assertTrue(google.is_lyrics(res), url)
|
||||
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
|
||||
|
||||
def test_default_ok(self):
|
||||
"""Test default engines with the default query"""
|
||||
if not check_lyrics_fetched():
|
||||
self.skipTest("Run lyrics_download_samples.py script first.")
|
||||
for (source, s) in zip([lyrics.LyricsWiki,
|
||||
lyrics.LyricsCom,
|
||||
lyrics.MusiXmatch], DEFAULT_SOURCES):
|
||||
url = s['url'] + s['path']
|
||||
if os.path.isfile(url_to_filename(url)):
|
||||
res = source({}, log).fetch(s['artist'], s['title'])
|
||||
self.assertTrue(google.is_lyrics(res), url)
|
||||
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
|
||||
res = lyrics.scrape_lyrics_from_html(raw_backend.fetch_url(url))
|
||||
self.assertTrue(google.is_lyrics(res), url)
|
||||
self.assertTrue(is_lyrics_content_ok(self.source['title'], res),
|
||||
url)
|
||||
|
||||
@patch.object(lyrics.Backend, 'fetch_url', MockFetchUrl())
|
||||
def test_is_page_candidate_exact_match(self):
|
||||
"""Test matching html page title with song infos -- when song infos are
|
||||
present in the title."""
|
||||
present in the title.
|
||||
"""
|
||||
from bs4 import SoupStrainer, BeautifulSoup
|
||||
s = self.source
|
||||
url = six.text_type(s['url'] + s['path'])
|
||||
html = raw_backend.fetch_url(url)
|
||||
soup = BeautifulSoup(html, "html.parser",
|
||||
parse_only=SoupStrainer('title'))
|
||||
self.assertEqual(google.is_page_candidate(url, soup.title.string,
|
||||
s['title'], s['artist']),
|
||||
True, url)
|
||||
self.assertEqual(
|
||||
google.is_page_candidate(url, soup.title.string,
|
||||
s['title'], s['artist']), True, url)
|
||||
|
||||
def test_is_page_candidate_fuzzy_match(self):
|
||||
"""Test matching html page title with song infos -- when song infos are
|
||||
not present in the title."""
|
||||
not present in the title.
|
||||
"""
|
||||
s = self.source
|
||||
url = s['url'] + s['path']
|
||||
url_title = u'example.com | Beats song by John doe'
|
||||
|
||||
# very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
|
||||
self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
|
||||
s['artist']), True, url)
|
||||
s['artist']), True, url)
|
||||
# reject different title
|
||||
url_title = u'example.com | seets bong lyrics by John doe'
|
||||
self.assertEqual(google.is_page_candidate(url, url_title, s['title'],
|
||||
s['artist']), False, url)
|
||||
s['artist']), False, url)
|
||||
|
||||
def test_is_page_candidate_special_chars(self):
|
||||
"""Ensure that `is_page_candidate` doesn't crash when the artist
|
||||
|
|
|
|||
Loading…
Reference in a new issue