From c6f935ac4c6840db030b5c4ca93c3b871a457b52 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Sat, 29 Jun 2013 14:21:55 +0200 Subject: [PATCH 1/3] Don't consider text between parentheses when matching url title with song title. --- beetsplug/lyrics.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 707d84cf0..31a215cc1 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -198,6 +198,10 @@ def slugify(text): """ # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid- # filename-in-python + + # Remove content within parentheses + pat = "([^,\(]*)\((.*?)\)" + text = re.sub(pat,'\g<1>', text).strip() try: text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore') text = unicode(re.sub('[-\s]+', ' ', text)) From 9780be270cdcdafc065cfbd3b7b8633f29f21df9 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Sat, 29 Jun 2013 14:23:53 +0200 Subject: [PATCH 2/3] Some tweaking to yield better results by not rejecting valid lyrics. --- beetsplug/lyrics.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 31a215cc1..e9d6aa0d9 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -237,7 +237,7 @@ def is_page_candidate(urlLink, urlTitle, title, artist): difflib.SequenceMatcher(None, songTitle, title).ratio())) typoRatio = .8 - return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio + return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio def insert_line_feeds(text): """Insert newlines before upper-case characters. @@ -255,15 +255,6 @@ def sanitize_lyrics(text): """ text = strip_cruft(text, False) - # Suppress advertisements. - # Match lines with an opening bracket but no ending one, ie lines that - # contained html link that has been wiped out when scraping. - LINK1_RE = re.compile(r'(\(|\[).*[^\)\]]$') - # Match lines containing url between brackets - LINK2_RE = re.compile(r'(\(|\[).*[http|www].*(\]|\))') - text = LINK1_RE.sub('', text) - text = LINK2_RE.sub('', text) - # Restore \n in input text if '\n' not in text: text = insert_line_feeds(text) @@ -286,6 +277,15 @@ def is_lyrics(text, artist): return 0 elif nbLines < 5: badTriggers.append('too_short') + else: + # Don't penalize long text because of lyrics keyword in credits + textlines = text.split('\n') + popped = False + for i in [len(textlines)-1, 0]: + if 'lyrics' in textlines[i].lower(): + popped = textlines.pop(i) + if popped: + text = '\n'.join(textlines) for item in artist, 'lyrics', 'copyright', 'property': badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I)) From 995d75f3f39d1b76e368e467ef1a787f3b487a07 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Sat, 29 Jun 2013 14:24:41 +0200 Subject: [PATCH 3/3] Logging: remove match ratio, add source website name --- beetsplug/lyrics.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index e9d6aa0d9..bc71a5a89 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -231,11 +231,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist): [artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20') - if songTitle: - log.debug("Match ratio of '%s' with title: %s" % - (songTitle, - difflib.SequenceMatcher(None, songTitle, title).ratio())) - typoRatio = .8 return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio @@ -374,6 +369,7 @@ def fetch_google(artist, title): lyrics = sanitize_lyrics(lyrics) if is_lyrics(lyrics, artist): + log.debug(u'got lyrics from %s' % item['displayLink']) return lyrics