lyrics: strip title excerpt before matching

Improve the extraction of the lyrics title from the URL title and
increase the matching threshold as a consequence.
This commit is contained in:
Fabrice Laporte 2014-10-08 14:49:09 +02:00
parent 9992d65366
commit c0c474b20f

View file

@ -271,8 +271,9 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
tokens = [by + '_' + artist for by in BY_TRANS] + \
[artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
songTitle = songTitle.strip('_|')
typoRatio = .9
typoRatio = .8
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio