diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 019faa4c5..b805ffeef 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -33,10 +33,10 @@ from beets import config log = logging.getLogger('beets') -DIV_RE = re.compile(r'<(/?)div>?') +DIV_RE = re.compile(r'<(/?)div>?', re.I) COMMENT_RE = re.compile(r'', re.S) TAG_RE = re.compile(r'<[^>]*>') -BREAK_RE = re.compile(r'') +BREAK_RE = re.compile(r'', re.I) URL_CHARACTERS = { u'\u2018': u"'", u'\u2019': u"'", @@ -122,6 +122,7 @@ def strip_cruft(lyrics, wscollapse=True): lyrics = unescape(lyrics) if wscollapse: lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse. + lyrics = re.sub(r'<(script).*?(?s)', '', lyrics) # Strip script tags. lyrics = BREAK_RE.sub('\n', lyrics) #
newlines. lyrics = re.sub(r'\n +', '\n', lyrics) @@ -294,36 +295,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist): return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio -def insert_line_feeds(text): - """Insert newlines before upper-case characters. - """ - tokensStr = re.split("([a-z][A-Z])", text) - for idx in range(1, len(tokensStr), 2): - ltoken = list(tokensStr[idx]) - tokensStr[idx] = ltoken[0] + '\n' + ltoken[1] - return ''.join(tokensStr) - - -def sanitize_lyrics(text): - """Clean text, returning raw lyrics as output or None if it happens - that input text is actually not lyrics content. Clean (x)html tags - in text, correct layout and syntax... - """ - text = strip_cruft(text, False) - - # Restore \n in input text - if '\n' not in text: - text = insert_line_feeds(text) - - while text.count('\n\n') > text.count('\n') // 4: - # Remove first occurrence of \n for each sequence of \n - text = re.sub(r'\n(\n+)', '\g<1>', text) - - text = re.sub(r'\n\n+', '\n\n', text) # keep at most two \n in a row - - return text - - def remove_credits(text): """Remove first/last line of text if it contains the word 'lyrics' eg 'Lyrics by songsdatabase.com' @@ -343,7 +314,6 @@ def is_lyrics(text, artist=None): """ if not text: return - badTriggersOcc = [] nbLines = text.count('\n') if nbLines <= 1: @@ -356,7 +326,7 @@ def is_lyrics(text, artist=None): # down text = remove_credits(text) - badTriggers = ['lyrics', 'copyright', 'property'] + badTriggers = ['lyrics', 'copyright', 'property', 'links'] if artist: badTriggersOcc += [artist] @@ -450,7 +420,7 @@ def fetch_google(artist, title): if not lyrics: continue - lyrics = sanitize_lyrics(lyrics) + lyrics = strip_cruft(lyrics, False) if is_lyrics(lyrics, artist): log.debug(u'got lyrics from {0}'.format(item['displayLink'])) diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py index 73f473be5..c9e89df3d 100644 --- a/test/lyrics_sources.py +++ b/test/lyrics_sources.py @@ -90,16 +90,13 @@ class LyricsScrapingPluginTest(unittest.TestCase): # Use default query when possible, or override artist and title field # if website don't have lyrics for default query. sourcesOk = [ - dict(definfo, url=u'http://www.smartlyrics.com', - path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), + dict(definfo, url='http://www.songlyrics.com', + path=u'/the-beatles/lady-madonna-lyrics'), dict(definfo, url=u'http://www.elyricsworld.com', path=u'/lady_madonna_lyrics_beatles.html'), dict(artist=u'Beres Hammond', title=u'I could beat myself', url=u'http://www.reggaelyrics.info', path=u'/beres-hammond/i-could-beat-myself'), - dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", - url=u'http://www.lyricsmania.com', - path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'), dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", url=u'http://www.paroles.net/', path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'), @@ -108,25 +105,28 @@ class LyricsScrapingPluginTest(unittest.TestCase): path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'), dict(definfo, url=u'http://www.sweetslyrics.com', path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'), - dict(definfo, url=u'http://www.lyrics007.com', - path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'), dict(definfo, url=u'http://www.absolutelyrics.com', path=u'/lyrics/view/the_beatles/lady_madonna'), dict(definfo, url=u'http://www.azlyrics.com/', path=u'/lyrics/beatles/ladymadonna.html'), dict(definfo, url=u'http://www.chartlyrics.com', path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), - dict(definfo, url='http://www.releaselyrics.com', - path=u'/e35f/the-beatles-lady-madonna'), ] # Websites that can't be scraped yet and whose results must be # flagged as invalid lyrics. sourcesFail = [ - dict(definfo, url='http://www.songlyrics.com', - path=u'/the-beatles/lady-madonna-lyrics'), + dict(definfo, url=u'http://www.smartlyrics.com', + path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), dict(definfo, url='http://www.metrolyrics.com/', - path='best-for-last-lyrics-adele.html') + path='best-for-last-lyrics-adele.html'), + dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", + url=u'http://www.lyricsmania.com', + path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'), + dict(definfo, url=u'http://www.lyrics007.com', + path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'), + dict(definfo, url='http://www.releaselyrics.com', + path=u'/e35f/the-beatles-lady-madonna'), ] # Websites that return truncated lyrics because of scraping issues, and diff --git a/test/test_lyrics.py b/test/test_lyrics.py index e6e140c81..d240a3370 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # This file is part of beets. # Copyright 2014, Fabrice Laporte. # @@ -111,11 +112,32 @@ class LyricsPluginTest(unittest.TestCase): lyrics.remove_credits("""Lyrics brought by example.com"""), "" ) + + # don't remove 2nd verse for the only reason it contains 'lyrics' word text = """Look at all the shit that i done bought her See lyrics ain't nothin if the beat aint crackin""" self.assertEqual(lyrics.remove_credits(text), text) + def test_strip_cruft(self): + text = """ +