Some tweaking to yield better results by not rejecting valid lyrics.
2026-01-25 17:45:44 +01:00 · 2013-06-29 14:23:53 +02:00 · 2013-06-29 14:23:53 +02:00 · 9780be270c
commit 9780be270c
parent c6f935ac4c
1 changed file with 10 additions and 10 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -237,7 +237,7 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
                   difflib.SequenceMatcher(None, songTitle, title).ratio()))

    typoRatio = .8
-    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
+    return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio

 def insert_line_feeds(text):
    """Insert newlines before upper-case characters.
@@ -255,15 +255,6 @@ def sanitize_lyrics(text):
    """
    text = strip_cruft(text, False)

-    # Suppress advertisements.
-    # Match lines with an opening bracket but no ending one, ie lines that
-    # contained html link that has been wiped out when scraping.
-    LINK1_RE = re.compile(r'(\(|\[).*[^\)\]]$')
-    # Match lines containing url between brackets
-    LINK2_RE = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
-    text = LINK1_RE.sub('', text)
-    text = LINK2_RE.sub('', text)
-
    # Restore \n in input text
    if '\n' not in text:
        text = insert_line_feeds(text)
@@ -286,6 +277,15 @@ def is_lyrics(text, artist):
        return 0
    elif nbLines < 5:
        badTriggers.append('too_short')
+    else:
+        # Don't penalize long text because of lyrics keyword in credits
+        textlines = text.split('\n')
+        popped = False
+        for i in [len(textlines)-1, 0]:
+            if 'lyrics' in textlines[i].lower():
+                popped = textlines.pop(i)
+        if popped:
+            text = '\n'.join(textlines)

    for item in artist, 'lyrics', 'copyright', 'property':
        badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))