From c6f935ac4c6840db030b5c4ca93c3b871a457b52 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte <kraymer@gmail.com>
Date: Sat, 29 Jun 2013 14:21:55 +0200
Subject: [PATCH 1/3] Don't consider text between parentheses when matching URL
 title with song title.

---
 beetsplug/lyrics.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 707d84cf0..31a215cc1 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -198,6 +198,10 @@ def slugify(text):
     """
     # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-
     # filename-in-python
+
+    # Remove content within parentheses
+    pat = "([^,\(]*)\((.*?)\)"
+    text = re.sub(pat,'\g<1>', text).strip()
     try:
         text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
         text = unicode(re.sub('[-\s]+', ' ', text))

From 9780be270cdcdafc065cfbd3b7b8633f29f21df9 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte <kraymer@gmail.com>
Date: Sat, 29 Jun 2013 14:23:53 +0200
Subject: [PATCH 2/3] Some tweaking to yield better results by not rejecting
 valid lyrics.

---
 beetsplug/lyrics.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 31a215cc1..e9d6aa0d9 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -237,7 +237,7 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
                    difflib.SequenceMatcher(None, songTitle, title).ratio()))
 
     typoRatio = .8
-    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
+    return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
 
 def insert_line_feeds(text):
     """Insert newlines before upper-case characters.
@@ -255,15 +255,6 @@ def sanitize_lyrics(text):
     """
     text = strip_cruft(text, False)
 
-    # Suppress advertisements.
-    # Match lines with an opening bracket but no ending one, ie lines that
-    # contained html link that has been wiped out when scraping.
-    LINK1_RE = re.compile(r'(\(|\[).*[^\)\]]$')
-    # Match lines containing url between brackets
-    LINK2_RE = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
-    text = LINK1_RE.sub('', text)
-    text = LINK2_RE.sub('', text)
-
     # Restore \n in input text
     if '\n' not in text:
         text = insert_line_feeds(text)
@@ -286,6 +277,15 @@ def is_lyrics(text, artist):
         return 0
     elif nbLines < 5:
         badTriggers.append('too_short')
+    else:
+        # Don't penalize long text because of lyrics keyword in credits
+        textlines = text.split('\n')
+        popped = False
+        for i in [len(textlines)-1, 0]:
+            if 'lyrics' in textlines[i].lower():
+                popped = textlines.pop(i)
+        if popped:
+            text = '\n'.join(textlines)
 
     for item in artist, 'lyrics', 'copyright', 'property':
         badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))

From 995d75f3f39d1b76e368e467ef1a787f3b487a07 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte <kraymer@gmail.com>
Date: Sat, 29 Jun 2013 14:24:41 +0200
Subject: [PATCH 3/3] Logging: remove match ratio, add source website name

---
 beetsplug/lyrics.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index e9d6aa0d9..bc71a5a89 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -231,11 +231,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
              [artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS
     songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20')
 
-    if songTitle:
-        log.debug("Match ratio of '%s' with title: %s" %
-                  (songTitle,
-                   difflib.SequenceMatcher(None, songTitle, title).ratio()))
-
     typoRatio = .8
     return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
 
@@ -374,6 +369,7 @@ def fetch_google(artist, title):
             lyrics = sanitize_lyrics(lyrics)
 
             if is_lyrics(lyrics, artist):
+                log.debug(u'got lyrics from %s' % item['displayLink'])
                 return lyrics