Merge pull request #977 from KraYmer/lyrics_test

Lyrics test
This commit is contained in:
Adrian Sampson 2014-09-22 09:38:11 -07:00
commit ea89cf32eb
3 changed files with 39 additions and 47 deletions

View file

@ -33,10 +33,10 @@ from beets import config
# Logger named 'beets'.
log = logging.getLogger('beets')
# Opening or closing <div> tag; group 1 captures the optional '/', and the
# closing '>' is itself optional.
DIV_RE = re.compile(r'<(/?)div>?')
# Rebinds DIV_RE with re.I: this later, case-insensitive pattern is the one
# in effect, so upper-case <DIV> tags match too.
DIV_RE = re.compile(r'<(/?)div>?', re.I)
# HTML comment. re.S lets '.' cross newlines and '.*' is greedy, so a match
# runs from the first '<!--' to the last '-->' in the text.
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
# Any single tag: '<', then everything up to the next '>'.
TAG_RE = re.compile(r'<[^>]*>')
# Line-break tag: <br>, <br/> or <br />.
BREAK_RE = re.compile(r'<br\s*/?>')
# Rebinds BREAK_RE with re.I so that <BR> is matched as well.
BREAK_RE = re.compile(r'<br\s*/?>', re.I)
URL_CHARACTERS = {
u'\u2018': u"'",
u'\u2019': u"'",
@ -122,6 +122,7 @@ def strip_cruft(lyrics, wscollapse=True):
lyrics = unescape(lyrics)
if wscollapse:
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
lyrics = re.sub(r'<(script).*?</\1>(?s)', '', lyrics) # Strip script tags.
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
lyrics = re.sub(r'\n +', '\n', lyrics)
@ -294,36 +295,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
def insert_line_feeds(text):
    """Insert a newline between a lower-case letter and the upper-case
    letter that immediately follows it.

    Only ASCII ``[a-z][A-Z]`` pairs are split; an upper-case letter that
    is preceded by anything else (space, digit, another capital, start of
    string) is left untouched.  For example ``'helloWorld'`` becomes
    ``'hello\\nWorld'``.

    Returns the resulting string.
    """
    # One substitution pass replaces the former split-and-rejoin loop; the
    # matches are the same non-overlapping [a-z][A-Z] pairs.
    return re.sub(r'([a-z])([A-Z])', r'\1\n\2', text)
def sanitize_lyrics(text):
    """Clean scraped text and normalise its line breaks.

    The text is first passed through `strip_cruft` with whitespace
    collapsing disabled.  If the result contains no newline at all, line
    breaks are re-created with `insert_line_feeds`.  Runs of newlines are
    then thinned out and finally capped at two in a row.

    Returns the cleaned string.
    """
    text = strip_cruft(text, False)

    # Restore \n in input text
    if '\n' not in text:
        text = insert_line_feeds(text)

    # While '\n\n' pairs outnumber a quarter of all newlines, shorten every
    # run of consecutive newlines by one.
    while text.count('\n\n') > text.count('\n') // 4:
        # Remove first occurrence of \n for each sequence of \n.  The
        # replacement is a raw string so '\g' is not an invalid escape.
        text = re.sub(r'\n(\n+)', r'\g<1>', text)

    text = re.sub(r'\n\n+', '\n\n', text)  # keep at most two \n in a row
    return text
def remove_credits(text):
"""Remove first/last line of text if it contains the word 'lyrics'
eg 'Lyrics by songsdatabase.com'
@ -343,7 +314,6 @@ def is_lyrics(text, artist=None):
"""
if not text:
return
badTriggersOcc = []
nbLines = text.count('\n')
if nbLines <= 1:
@ -356,7 +326,7 @@ def is_lyrics(text, artist=None):
# down
text = remove_credits(text)
badTriggers = ['lyrics', 'copyright', 'property']
badTriggers = ['lyrics', 'copyright', 'property', 'links']
if artist:
badTriggersOcc += [artist]
@ -450,7 +420,7 @@ def fetch_google(artist, title):
if not lyrics:
continue
lyrics = sanitize_lyrics(lyrics)
lyrics = strip_cruft(lyrics, False)
if is_lyrics(lyrics, artist):
log.debug(u'got lyrics from {0}'.format(item['displayLink']))

View file

@ -90,16 +90,13 @@ class LyricsScrapingPluginTest(unittest.TestCase):
# Use the default query when possible, or override the artist and title
# fields if the website doesn't have lyrics for the default query.
sourcesOk = [
dict(definfo, url=u'http://www.smartlyrics.com',
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(definfo, url='http://www.songlyrics.com',
path=u'/the-beatles/lady-madonna-lyrics'),
dict(definfo, url=u'http://www.elyricsworld.com',
path=u'/lady_madonna_lyrics_beatles.html'),
dict(artist=u'Beres Hammond', title=u'I could beat myself',
url=u'http://www.reggaelyrics.info',
path=u'/beres-hammond/i-could-beat-myself'),
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
url=u'http://www.lyricsmania.com',
path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
url=u'http://www.paroles.net/',
path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
@ -108,25 +105,28 @@ class LyricsScrapingPluginTest(unittest.TestCase):
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
dict(definfo, url=u'http://www.sweetslyrics.com',
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
dict(definfo, url=u'http://www.lyrics007.com',
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
dict(definfo, url=u'http://www.absolutelyrics.com',
path=u'/lyrics/view/the_beatles/lady_madonna'),
dict(definfo, url=u'http://www.azlyrics.com/',
path=u'/lyrics/beatles/ladymadonna.html'),
dict(definfo, url=u'http://www.chartlyrics.com',
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
dict(definfo, url='http://www.releaselyrics.com',
path=u'/e35f/the-beatles-lady-madonna'),
]
# Websites that can't be scraped yet and whose results must be
# flagged as invalid lyrics.
sourcesFail = [
dict(definfo, url='http://www.songlyrics.com',
path=u'/the-beatles/lady-madonna-lyrics'),
dict(definfo, url=u'http://www.smartlyrics.com',
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(definfo, url='http://www.metrolyrics.com/',
path='best-for-last-lyrics-adele.html')
path='best-for-last-lyrics-adele.html'),
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
url=u'http://www.lyricsmania.com',
path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
dict(definfo, url=u'http://www.lyrics007.com',
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
dict(definfo, url='http://www.releaselyrics.com',
path=u'/e35f/the-beatles-lady-madonna'),
]
# Websites that return truncated lyrics because of scraping issues, and

View file

@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# This file is part of beets.
# Copyright 2014, Fabrice Laporte.
#
@ -111,11 +112,32 @@ class LyricsPluginTest(unittest.TestCase):
lyrics.remove_credits("""Lyrics brought by example.com"""),
""
)
# Don't remove the 2nd verse merely because it contains the word 'lyrics'.
text = """Look at all the shit that i done bought her
See lyrics ain't nothin
if the beat aint crackin"""
self.assertEqual(lyrics.remove_credits(text), text)
def test_strip_cruft(self):
    """HTML containing a comment, a script tag, an entity, a <BR> and an
    unknown tag should reduce to the two lyric lines."""
    markup = """<!--lyrics below-->
<script type="javascript">
&nbsp; One<BR>\r\n
<blink>Two</blink>
"""
    expected = u"One\nTwo"
    self.assertEqual(lyrics.strip_cruft(markup), expected)
def test_is_lyrics(self):
    """Site boilerplate (copyright and ownership notices) must not be
    accepted as lyrics."""
    texts = [
        'LyricsMania.com - Copyright (c) 2013 - All Rights Reserved',
        """All material found on this site is property\n
of mywickedsongtext brand""",
    ]
    for t in texts:
        # Pass the sample as the failure message so that a failing run
        # reports which text was wrongly accepted.
        self.assertFalse(lyrics.is_lyrics(t), t)
def test_slugify(self):
    """Check slugify's output for a URL containing an accented
    character, a hyphen and a parenthesised suffix."""
    url = u"http://site.com/çafe-au_lait(boisson)"
    expected = 'http://site.com/cafe_au_lait'
    self.assertEqual(lyrics.slugify(url), expected)
def suite():
    """Return a test suite loaded from this module by name."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)