diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 019faa4c5..b805ffeef 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -33,10 +33,10 @@ from beets import config
log = logging.getLogger('beets')
-DIV_RE = re.compile(r'<(/?)div>?')
+DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'', re.S)
TAG_RE = re.compile(r'<[^>]*>')
-BREAK_RE = re.compile(r'
')
+BREAK_RE = re.compile(r'
', re.I)
URL_CHARACTERS = {
u'\u2018': u"'",
u'\u2019': u"'",
@@ -122,6 +122,7 @@ def strip_cruft(lyrics, wscollapse=True):
lyrics = unescape(lyrics)
if wscollapse:
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
+
lyrics = re.sub(r'<(script).*?\1>(?s)', '', lyrics) # Strip script tags.
lyrics = BREAK_RE.sub('\n', lyrics) #
newlines.
lyrics = re.sub(r'\n +', '\n', lyrics)
@@ -294,36 +295,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
-def insert_line_feeds(text):
- """Insert newlines before upper-case characters.
- """
- tokensStr = re.split("([a-z][A-Z])", text)
- for idx in range(1, len(tokensStr), 2):
- ltoken = list(tokensStr[idx])
- tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
- return ''.join(tokensStr)
-
-
-def sanitize_lyrics(text):
- """Clean text, returning raw lyrics as output or None if it happens
- that input text is actually not lyrics content. Clean (x)html tags
- in text, correct layout and syntax...
- """
- text = strip_cruft(text, False)
-
- # Restore \n in input text
- if '\n' not in text:
- text = insert_line_feeds(text)
-
- while text.count('\n\n') > text.count('\n') // 4:
- # Remove first occurrence of \n for each sequence of \n
- text = re.sub(r'\n(\n+)', '\g<1>', text)
-
- text = re.sub(r'\n\n+', '\n\n', text) # keep at most two \n in a row
-
- return text
-
-
def remove_credits(text):
"""Remove first/last line of text if it contains the word 'lyrics'
eg 'Lyrics by songsdatabase.com'
@@ -343,7 +314,6 @@ def is_lyrics(text, artist=None):
"""
if not text:
return
-
badTriggersOcc = []
nbLines = text.count('\n')
if nbLines <= 1:
@@ -356,7 +326,7 @@ def is_lyrics(text, artist=None):
# down
text = remove_credits(text)
- badTriggers = ['lyrics', 'copyright', 'property']
+ badTriggers = ['lyrics', 'copyright', 'property', 'links']
if artist:
badTriggersOcc += [artist]
@@ -450,7 +420,7 @@ def fetch_google(artist, title):
if not lyrics:
continue
- lyrics = sanitize_lyrics(lyrics)
+ lyrics = strip_cruft(lyrics, False)
if is_lyrics(lyrics, artist):
log.debug(u'got lyrics from {0}'.format(item['displayLink']))
diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py
index 73f473be5..c9e89df3d 100644
--- a/test/lyrics_sources.py
+++ b/test/lyrics_sources.py
@@ -90,16 +90,13 @@ class LyricsScrapingPluginTest(unittest.TestCase):
# Use default query when possible, or override artist and title field
# if website don't have lyrics for default query.
sourcesOk = [
- dict(definfo, url=u'http://www.smartlyrics.com',
- path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
+ dict(definfo, url='http://www.songlyrics.com',
+ path=u'/the-beatles/lady-madonna-lyrics'),
dict(definfo, url=u'http://www.elyricsworld.com',
path=u'/lady_madonna_lyrics_beatles.html'),
dict(artist=u'Beres Hammond', title=u'I could beat myself',
url=u'http://www.reggaelyrics.info',
path=u'/beres-hammond/i-could-beat-myself'),
- dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
- url=u'http://www.lyricsmania.com',
- path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
url=u'http://www.paroles.net/',
path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
@@ -108,25 +105,28 @@ class LyricsScrapingPluginTest(unittest.TestCase):
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
dict(definfo, url=u'http://www.sweetslyrics.com',
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
- dict(definfo, url=u'http://www.lyrics007.com',
- path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
dict(definfo, url=u'http://www.absolutelyrics.com',
path=u'/lyrics/view/the_beatles/lady_madonna'),
dict(definfo, url=u'http://www.azlyrics.com/',
path=u'/lyrics/beatles/ladymadonna.html'),
dict(definfo, url=u'http://www.chartlyrics.com',
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
- dict(definfo, url='http://www.releaselyrics.com',
- path=u'/e35f/the-beatles-lady-madonna'),
]
# Websites that can't be scraped yet and whose results must be
# flagged as invalid lyrics.
sourcesFail = [
- dict(definfo, url='http://www.songlyrics.com',
- path=u'/the-beatles/lady-madonna-lyrics'),
+ dict(definfo, url=u'http://www.smartlyrics.com',
+ path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(definfo, url='http://www.metrolyrics.com/',
- path='best-for-last-lyrics-adele.html')
+ path='best-for-last-lyrics-adele.html'),
+ dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
+ url=u'http://www.lyricsmania.com',
+ path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
+ dict(definfo, url=u'http://www.lyrics007.com',
+ path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
+ dict(definfo, url='http://www.releaselyrics.com',
+ path=u'/e35f/the-beatles-lady-madonna'),
]
# Websites that return truncated lyrics because of scraping issues, and
diff --git a/test/test_lyrics.py b/test/test_lyrics.py
index e6e140c81..d240a3370 100644
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
# This file is part of beets.
# Copyright 2014, Fabrice Laporte.
#
@@ -111,11 +112,32 @@ class LyricsPluginTest(unittest.TestCase):
lyrics.remove_credits("""Lyrics brought by example.com"""),
""
)
+
+ # Don't remove the 2nd verse just because it contains the word 'lyrics'
text = """Look at all the shit that i done bought her
See lyrics ain't nothin
if the beat aint crackin"""
self.assertEqual(lyrics.remove_credits(text), text)
+ def test_strip_cruft(self):
+ text = """
+