Merge pull request #977 from KraYmer/lyrics_test

Lyrics test
2026-01-28 19:16:10 +01:00 · 2014-09-22 09:38:11 -07:00 · 2014-09-22 09:38:11 -07:00 · ea89cf32eb
commit ea89cf32eb
parent 4f3a52a26f 151ee87d8d
3 changed files with 39 additions and 47 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -33,10 +33,10 @@ from beets import config

 log = logging.getLogger('beets')

-DIV_RE = re.compile(r'<(/?)div>?')
+DIV_RE = re.compile(r'<(/?)div>?', re.I)
 COMMENT_RE = re.compile(r'<!--.*-->', re.S)
 TAG_RE = re.compile(r'<[^>]*>')
-BREAK_RE = re.compile(r'<br\s*/?>')
+BREAK_RE = re.compile(r'<br\s*/?>', re.I)
 URL_CHARACTERS = {
    u'\u2018': u"'",
    u'\u2019': u"'",
@ -122,6 +122,7 @@ def strip_cruft(lyrics, wscollapse=True):
    lyrics = unescape(lyrics)
    if wscollapse:
        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
+
    lyrics = re.sub(r'<(script).*?</\1>(?s)', '', lyrics)  # Strip script tags.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
@ -294,36 +295,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
    return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio


-def insert_line_feeds(text):
-    """Insert newlines before upper-case characters.
-    """
-    tokensStr = re.split("([a-z][A-Z])", text)
-    for idx in range(1, len(tokensStr), 2):
-        ltoken = list(tokensStr[idx])
-        tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
-    return ''.join(tokensStr)
-
-
-def sanitize_lyrics(text):
-    """Clean text, returning raw lyrics as output or None if it happens
-    that input text is actually not lyrics content.  Clean (x)html tags
-    in text, correct layout and syntax...
-    """
-    text = strip_cruft(text, False)
-
-    # Restore \n in input text
-    if '\n' not in text:
-        text = insert_line_feeds(text)
-
-    while text.count('\n\n') > text.count('\n') // 4:
-        # Remove first occurrence of \n for each sequence of \n
-        text = re.sub(r'\n(\n+)', '\g<1>', text)
-
-    text = re.sub(r'\n\n+', '\n\n', text)   # keep at most two \n in a row
-
-    return text
-
-
 def remove_credits(text):
    """Remove first/last line of text if it contains the word 'lyrics'
    eg 'Lyrics by songsdatabase.com'
@ -343,7 +314,6 @@ def is_lyrics(text, artist=None):
    """
    if not text:
        return
-
    badTriggersOcc = []
    nbLines = text.count('\n')
    if nbLines <= 1:
@ -356,7 +326,7 @@ def is_lyrics(text, artist=None):
        # down
        text = remove_credits(text)

-    badTriggers = ['lyrics', 'copyright', 'property']
+    badTriggers = ['lyrics', 'copyright', 'property', 'links']
    if artist:
        badTriggersOcc += [artist]

@ -450,7 +420,7 @@ def fetch_google(artist, title):
            if not lyrics:
                continue

-            lyrics = sanitize_lyrics(lyrics)
+            lyrics = strip_cruft(lyrics, False)

            if is_lyrics(lyrics, artist):
                log.debug(u'got lyrics from {0}'.format(item['displayLink']))
--- a/test/lyrics_sources.py
+++ b/test/lyrics_sources.py
@ -90,16 +90,13 @@ class LyricsScrapingPluginTest(unittest.TestCase):
    # Use default query when possible, or override artist and title field
    # if website don't have lyrics for default query.
    sourcesOk = [
-        dict(definfo, url=u'http://www.smartlyrics.com',
-             path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
+        dict(definfo, url='http://www.songlyrics.com',
+             path=u'/the-beatles/lady-madonna-lyrics'),
        dict(definfo, url=u'http://www.elyricsworld.com',
             path=u'/lady_madonna_lyrics_beatles.html'),
        dict(artist=u'Beres Hammond', title=u'I could beat myself',
             url=u'http://www.reggaelyrics.info',
             path=u'/beres-hammond/i-could-beat-myself'),
-        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
-             url=u'http://www.lyricsmania.com',
-             path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
             url=u'http://www.paroles.net/',
             path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
@ -108,25 +105,28 @@ class LyricsScrapingPluginTest(unittest.TestCase):
             path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
        dict(definfo, url=u'http://www.sweetslyrics.com',
             path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
-        dict(definfo, url=u'http://www.lyrics007.com',
-             path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
        dict(definfo, url=u'http://www.absolutelyrics.com',
             path=u'/lyrics/view/the_beatles/lady_madonna'),
        dict(definfo, url=u'http://www.azlyrics.com/',
             path=u'/lyrics/beatles/ladymadonna.html'),
        dict(definfo, url=u'http://www.chartlyrics.com',
             path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
-        dict(definfo, url='http://www.releaselyrics.com',
-             path=u'/e35f/the-beatles-lady-madonna'),
    ]

    # Websites that can't be scraped yet and whose results must be
    # flagged as invalid lyrics.
    sourcesFail = [
-        dict(definfo, url='http://www.songlyrics.com',
-             path=u'/the-beatles/lady-madonna-lyrics'),
+        dict(definfo, url=u'http://www.smartlyrics.com',
+             path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
        dict(definfo, url='http://www.metrolyrics.com/',
-             path='best-for-last-lyrics-adele.html')
+             path='best-for-last-lyrics-adele.html'),
+        dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
+             url=u'http://www.lyricsmania.com',
+             path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
+        dict(definfo, url=u'http://www.lyrics007.com',
+             path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
+        dict(definfo, url='http://www.releaselyrics.com',
+             path=u'/e35f/the-beatles-lady-madonna'),
    ]

    # Websites that return truncated lyrics because of scraping issues, and
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # This file is part of beets.
 # Copyright 2014, Fabrice Laporte.
 #
@ -111,11 +112,32 @@ class LyricsPluginTest(unittest.TestCase):
            lyrics.remove_credits("""Lyrics brought by example.com"""),
            ""
        )
+
+        # Don't strip the 2nd verse just because it contains the word 'lyrics'.
        text = """Look at all the shit that i done bought her
                  See lyrics ain't nothin
                  if the beat aint crackin"""
        self.assertEqual(lyrics.remove_credits(text), text)

+    def test_strip_cruft(self):
+        text = """<!--lyrics below-->
+                  <script type="javascript">
+                  &nbsp;  One<BR>\r\n
+                  <blink>Two</blink>
+               """
+        self.assertEqual(lyrics.strip_cruft(text), u"One\nTwo")
+
+    def test_is_lyrics(self):
+        texts = ['LyricsMania.com - Copyright (c) 2013 - All Rights Reserved']
+        texts += ["""All material found on this site is property\n
+                     of mywickedsongtext brand"""]
+        for t in texts:
+            self.assertFalse(lyrics.is_lyrics(t))
+
+    def test_slugify(self):
+        text = u"http://site.com/çafe-au_lait(boisson)"
+        self.assertEqual(lyrics.slugify(text), 'http://site.com/cafe_au_lait')
+

 def suite():
    return unittest.TestLoader().loadTestsFromName(__name__)