diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 50d1529e1..51072e1ea 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -64,17 +64,20 @@ def fetch_url(url): log.debug(u'failed to fetch: {0} ({1})'.format(url, unicode(exc))) return None + def unescape(text): """Resolves &#xxx; HTML entities (and some others).""" if isinstance(text, str): text = text.decode('utf8', 'ignore') out = text.replace(u' ', u' ') + def replchar(m): num = m.group(1) return unichr(int(num)) out = re.sub(u"&#(\d+);", replchar, out) return out + def extract_text(html, starttag): """Extract the text from a
tag in the HTML starting with ``starttag``. Returns None if parsing fails. @@ -90,11 +93,11 @@ def extract_text(html, starttag): parts = [] pos = 0 for match in DIV_RE.finditer(html): - if match.group(1): # Closing tag. + if match.group(1): # Closing tag. level -= 1 if level == 0: pos = match.end() - else: # Opening tag. + else: # Opening tag. if level == 0: parts.append(html[pos:match.start()]) @@ -109,6 +112,7 @@ def extract_text(html, starttag): lyrics = ''.join(parts) return strip_cruft(lyrics) + def strip_cruft(lyrics, wscollapse=True): """Clean up HTML from an extracted lyrics string. For example,
tags are replaced with newlines. @@ -116,15 +120,46 @@ def strip_cruft(lyrics, wscollapse=True): lyrics = COMMENT_RE.sub('', lyrics) lyrics = unescape(lyrics) if wscollapse: - lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse. - lyrics = BREAK_RE.sub('\n', lyrics) #
newlines. + lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse. + lyrics = BREAK_RE.sub('\n', lyrics) #
newlines. lyrics = re.sub(r'\n +', '\n', lyrics) lyrics = re.sub(r' +\n', '\n', lyrics) - lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags. - lyrics = lyrics.replace('\r','\n') + lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags. + lyrics = lyrics.replace('\r', '\n') lyrics = lyrics.strip() return lyrics + +def split_multi_titles(s): + """Check for a dual song (e.g. Pink Floyd - Speak to Me / Breathe) + and returns titles as a list or None if song is not dual.""" + if '/' not in s: + return None + return [x.strip() for x in s.split('/')] + + +def remove_ft_artist_suffix(s): + """Remove featuring artist from string""" + + # Remove "featuring" suffixes + pattern = r"(.*?) (&|\b(and|feat(uring)?\b))" + match = re.search(pattern, s, re.IGNORECASE) + if match: + s = match.group(1) + return s + + +def remove_parenthesized_suffix(s): + """Remove parenthesized suffix from string common examples are (live), + (remix), (acoustic)""" + + pattern = r"(.+?)\s+[(].*[)]$" + match = re.search(pattern, s, re.IGNORECASE) + if match: + s = match.group(1) + return s + + def _encode(s): """Encode the string for inclusion in a URL (common to both LyricsWiki and Lyrics.com). @@ -139,6 +174,7 @@ def _encode(s): # LyricsWiki. 
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s' + def _lw_encode(s): s = re.sub(r'\s+', '_', s) s = s.replace("<", "Less_Than") @@ -148,6 +184,7 @@ def _lw_encode(s): s = re.sub(r'[\]\}]', ')', s) return _encode(s) + def fetch_lyricswiki(artist, title): """Fetch lyrics from LyricsWiki.""" url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title)) @@ -167,11 +204,13 @@ LYRICSCOM_NOT_FOUND = ( 'Sorry, we do not have the lyric', 'Submit Lyrics', ) + def _lc_encode(s): s = re.sub(r'[^\w\s-]', '', s) s = re.sub(r'\s+', '-', s) return _encode(s).lower() + def fetch_lyricscom(artist, title): """Fetch lyrics from Lyrics.com.""" url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist)) @@ -227,13 +266,14 @@ def is_page_candidate(urlLink, urlTitle, title, artist): return True # or try extracting song title from URL title and check if # they are close enough - tokens = [by+'%20'+artist for by in BY_TRANS] + \ + tokens = [by + '%20' + artist for by in BY_TRANS] + \ [artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS - songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20') + songTitle = re.sub(u'(%s)' % u'|'.join(tokens) , u'', urlTitle).strip('%20') typoRatio = .8 return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio + def insert_line_feeds(text): """Insert newlines before upper-case characters. """ @@ -243,6 +283,7 @@ def insert_line_feeds(text): tokensStr[idx] = ltoken[0] + '\n' + ltoken[1] return ''.join(tokensStr) + def sanitize_lyrics(text): """Clean text, returning raw lyrics as output or None if it happens that input text is actually not lyrics content. 
Clean (x)html tags @@ -254,7 +295,7 @@ def sanitize_lyrics(text): if '\n' not in text: text = insert_line_feeds(text) - while text.count('\n\n') > text.count('\n')/4: + while text.count('\n\n') > text.count('\n') / 4: # Remove first occurrence of \n for each sequence of \n text = re.sub(r'\n(\n+)', '\g<1>', text) @@ -262,6 +303,7 @@ def sanitize_lyrics(text): return text + def is_lyrics(text, artist): """Determine whether the text seems to be valid lyrics. """ @@ -290,6 +332,7 @@ def is_lyrics(text, artist): return len(badTriggers) < 2 + def scrape_lyrics_from_url(url): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. @@ -306,7 +349,7 @@ def scrape_lyrics_from_url(url): # Remove non relevant html parts [s.extract() for s in soup(['head', 'script'])] - comments = soup.findAll(text=lambda text:isinstance(text, Comment)) + comments = soup.findAll(text=lambda text: isinstance(text, Comment)) [s.extract() for s in comments] try: @@ -315,7 +358,7 @@ def scrape_lyrics_from_url(url): except Exception, e: log.debug('Error %s when replacing containing marker by p marker' % e, - exc_info=True) + exc_info=True) # Make better soup from current soup! The previous unclosed

sections # are now closed. Use str() rather than prettify() as it's more @@ -343,6 +386,7 @@ def scrape_lyrics_from_url(url): soup = BeautifulSoup(tagTokens[0]) return unescape(tagTokens[0].strip("\n\r: ")) + def fetch_google(artist, title): """Fetch lyrics from Google search results. """ @@ -378,6 +422,7 @@ def fetch_google(artist, title): # Plugin logic. + class LyricsPlugin(BeetsPlugin): def __init__(self): super(LyricsPlugin, self).__init__() @@ -394,6 +439,7 @@ class LyricsPlugin(BeetsPlugin): if self.config['google_API_key'].get(): self.backends.insert(0, fetch_google) + def commands(self): cmd = ui.Subcommand('lyrics', help='fetch song lyrics') cmd.parser.add_option('-p', '--print', dest='printlyr', @@ -414,11 +460,14 @@ class LyricsPlugin(BeetsPlugin): cmd.func = func return [cmd] - # Auto-fetch lyrics on import. + def imported(self, session, task): + """Auto-fetch lyrics on import""" if self.config['auto']: for item in task.imported_items(): - self.fetch_item_lyrics(session.lib, logging.DEBUG, item, False, False) + self.fetch_item_lyrics(session.lib, logging.DEBUG, item, \ + False, False) + def fetch_item_lyrics(self, lib, loglevel, item, write, force): """Fetch and store lyrics for a single item. If ``write``, then the @@ -434,18 +483,35 @@ class LyricsPlugin(BeetsPlugin): (item.artist, item.title)) return + artist = remove_ft_artist_suffix(item.artist) + title = remove_parenthesized_suffix(\ + remove_ft_artist_suffix(item.title)) + # Fetch lyrics. - lyrics = self.get_lyrics(item.artist, item.title) + lyrics = self.get_lyrics(artist, title) + + if not lyrics: + # Check for a songs combinations + # (e.g. 
Pink Floyd - Speak to Me / Breathe) + titles = split_multi_titles(title) + for t in titles: + lyrics_title = self.get_lyrics(artist, t) + if lyrics_title: + if lyrics : + lyrics += u"\n\n---\n\n%s" % lyrics_title + else: + lyrics = lyrics_title + if not lyrics: log.log(loglevel, u'lyrics not found: %s - %s' % - (item.artist, item.title)) + (artist, title)) if fallback: lyrics = fallback else: return else: - log.log(loglevel, u'fetched lyrics: %s - %s' % - (item.artist, item.title)) + log.log(loglevel, u'fetched lyrics : %s - %s' % + (artist, title)) item.lyrics = lyrics @@ -453,6 +519,7 @@ class LyricsPlugin(BeetsPlugin): item.try_write() item.store() + def get_lyrics(self, artist, title): """Fetch lyrics, trying each source in turn. Return a string or None if no lyrics were found. @@ -471,4 +538,4 @@ class LyricsPlugin(BeetsPlugin): log.debug(u'got lyrics from backend: {0}'.format( backend.__name__ )) - return lyrics + return lyrics.strip() diff --git a/docs/changelog.rst b/docs/changelog.rst index d0a74ca85..b3180846f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,6 +13,9 @@ New stuff: * :doc:`/plugins/replaygain`: Added support for calculating ReplayGain values with GStreamer as well the mp3gain programs. This enables ReplayGain calculation for any audio format. +* :doc:`/plugins/lyrics`: Better handling of songs whose titles contain a + featured artist. Song combinations are now resolved (all lyrics are + appended). Thanks to KraYmer and paulp. * Add support for `initial_key` as field in the library and tag for media files. When the user sets this field with ``beet modify initial_key=Am`` the media files will reflect this in their tags. 
The diff --git a/test/test_lyrics.py b/test/test_lyrics.py new file mode 100644 index 000000000..70bced2e1 --- /dev/null +++ b/test/test_lyrics.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +# -*- coding: UTF-8 -*- + +"""Tests for the 'lyrics' plugin""" + +import _common +from _common import unittest +from beetsplug import lyrics +from beets import config +from beets.util import confit + + +class LyricsPluginTest(unittest.TestCase): + def setUp(self): + """Set up configuration""" + lyrics.LyricsPlugin() + + def test_split_multi_titles(self): + self.assertEqual(lyrics.split_multi_titles('song1 / song2 / song3'), + ['song1', 'song2', 'song3']) + self.assertEqual(lyrics.split_multi_titles('song1/song2 song3'), + ['song1', 'song2 song3']) + self.assertEqual(lyrics.split_multi_titles('song1 song2'), + None) + + def test_remove_ft_artist_suffix(self): + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob featuring Marcia'), 'Bob') + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob feat Marcia'), 'Bob') + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob and Marcia'), 'Bob') + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob feat. Marcia'), 'Bob') + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob & Marcia'), 'Bob') + self.assertEqual(lyrics.remove_ft_artist_suffix('Bob feats Marcia'), 'Bob feats Marcia') + + def test_remove_parenthesized_suffix(self): + self.assertEqual(lyrics.remove_parenthesized_suffix('Song (live)'), 'Song') + self.assertEqual(lyrics.remove_parenthesized_suffix('Song (live) (new)'), 'Song') + self.assertEqual(lyrics.remove_parenthesized_suffix('Song (live (new))'), 'Song') + + +def suite(): + return unittest.TestLoader().loadTestsFromName(__name__) + +if __name__ == '__main__': + unittest.main(defaultTest='suite') +