Merge branch 'KraYmer-lyrics-enh-tests'

This commit is contained in:
Adrian Sampson 2014-04-12 12:58:20 -07:00
commit 2dafaa2f0c
3 changed files with 133 additions and 18 deletions

View file

@ -64,17 +64,20 @@ def fetch_url(url):
log.debug(u'failed to fetch: {0} ({1})'.format(url, unicode(exc)))
return None
def unescape(text):
    """Resolves &#xxx; HTML entities (and some others).

    Byte strings are first decoded as UTF-8, silently dropping bytes
    that cannot be decoded. Every decimal numeric character reference
    (``&#NNN;``) in the text is then replaced by the character with that
    code point, and the resulting text is returned.
    """
    if isinstance(text, str):
        text = text.decode('utf8', 'ignore')
    # NOTE(review): as rendered here both literals are a plain space, which
    # would make this replace a no-op; presumably the first literal was an
    # HTML entity or a non-breaking space -- confirm against the real file.
    out = text.replace(u' ', u' ')

    def replchar(m):
        # Turn the captured decimal code point into its character.
        num = m.group(1)
        return unichr(int(num))
    out = re.sub(u"&#(\d+);", replchar, out)
    return out
def extract_text(html, starttag):
"""Extract the text from a <DIV> tag in the HTML starting with
``starttag``. Returns None if parsing fails.
@ -90,11 +93,11 @@ def extract_text(html, starttag):
parts = []
pos = 0
for match in DIV_RE.finditer(html):
if match.group(1): # Closing tag.
if match.group(1): # Closing tag.
level -= 1
if level == 0:
pos = match.end()
else: # Opening tag.
else: # Opening tag.
if level == 0:
parts.append(html[pos:match.start()])
@ -109,6 +112,7 @@ def extract_text(html, starttag):
lyrics = ''.join(parts)
return strip_cruft(lyrics)
def strip_cruft(lyrics, wscollapse=True):
"""Clean up HTML from an extracted lyrics string. For example, <BR>
tags are replaced with newlines.
@ -116,15 +120,46 @@ def strip_cruft(lyrics, wscollapse=True):
lyrics = COMMENT_RE.sub('', lyrics)
lyrics = unescape(lyrics)
if wscollapse:
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
lyrics = re.sub(r'\n +', '\n', lyrics)
lyrics = re.sub(r' +\n', '\n', lyrics)
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
lyrics = lyrics.replace('\r','\n')
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
lyrics = lyrics.replace('\r', '\n')
lyrics = lyrics.strip()
return lyrics
def split_multi_titles(s):
    """Split a combined song title into its individual titles.

    A title such as ``Speak to Me / Breathe`` names several songs
    separated by ``/``.

    Returns the list of stripped, non-empty titles, or None when ``s``
    contains no ``/`` at all (i.e. it is not a combined title). Empty
    segments, as produced by a trailing or doubled slash, are dropped so
    that callers never look up a song with an empty title.
    """
    if '/' not in s:
        return None
    stripped = (part.strip() for part in s.split('/'))
    return [title for title in stripped if title]
def remove_ft_artist_suffix(s):
    """Remove a featured-artist suffix from a string.

    Everything from the first `` &``, `` and``, `` feat`` or
    `` featuring`` (case-insensitive) onwards is dropped. The keywords
    must be whole words: ``Bob feats Marcia`` and ``Bob Anderson`` are
    returned unchanged.

    Returns the text preceding the keyword, or ``s`` itself when no
    keyword is found.
    """
    # The trailing \b applies to both "and" and "feat(uring)", so that a
    # word merely starting with "and" (e.g. "Anderson") is not mistaken
    # for the conjunction.
    pattern = r"(.*?) (&|\b(?:and|feat(?:uring)?)\b)"
    match = re.search(pattern, s, re.IGNORECASE)
    if match:
        return match.group(1)
    return s
def remove_parenthesized_suffix(s):
    """Strip a trailing parenthesized group from a string.

    Typical suffixes are ``(live)``, ``(remix)`` or ``(acoustic)``. The
    group must be preceded by whitespace and end the string; otherwise
    ``s`` is returned unchanged.
    """
    found = re.search(r"(.+?)\s+[(].*[)]$", s, re.IGNORECASE)
    return found.group(1) if found else s
def _encode(s):
"""Encode the string for inclusion in a URL (common to both
LyricsWiki and Lyrics.com).
@ -139,6 +174,7 @@ def _encode(s):
# LyricsWiki.
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
s = re.sub(r'\s+', '_', s)
s = s.replace("<", "Less_Than")
@ -148,6 +184,7 @@ def _lw_encode(s):
s = re.sub(r'[\]\}]', ')', s)
return _encode(s)
def fetch_lyricswiki(artist, title):
"""Fetch lyrics from LyricsWiki."""
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@ -167,11 +204,13 @@ LYRICSCOM_NOT_FOUND = (
'Sorry, we do not have the lyric',
'Submit Lyrics',
)
def _lc_encode(s):
    """Encode a string for use in a Lyrics.com URL.

    Characters other than word characters, whitespace and hyphens are
    removed, each whitespace run becomes a single hyphen, and the result
    is passed through ``_encode`` and lower-cased.
    """
    without_punctuation = re.sub(r'[^\w\s-]', '', s)
    hyphenated = re.sub(r'\s+', '-', without_punctuation)
    encoded = _encode(hyphenated)
    return encoded.lower()
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@ -227,13 +266,14 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
return True
# or try extracting song title from URL title and check if
# they are close enough
tokens = [by+'%20'+artist for by in BY_TRANS] + \
tokens = [by + '%20' + artist for by in BY_TRANS] + \
[artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS
songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20')
songTitle = re.sub(u'(%s)' % u'|'.join(tokens) , u'', urlTitle).strip('%20')
typoRatio = .8
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
def insert_line_feeds(text):
"""Insert newlines before upper-case characters.
"""
@ -243,6 +283,7 @@ def insert_line_feeds(text):
tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
return ''.join(tokensStr)
def sanitize_lyrics(text):
"""Clean text, returning raw lyrics as output or None if it happens
that input text is actually not lyrics content. Clean (x)html tags
@ -254,7 +295,7 @@ def sanitize_lyrics(text):
if '\n' not in text:
text = insert_line_feeds(text)
while text.count('\n\n') > text.count('\n')/4:
while text.count('\n\n') > text.count('\n') / 4:
# Remove first occurrence of \n for each sequence of \n
text = re.sub(r'\n(\n+)', '\g<1>', text)
@ -262,6 +303,7 @@ def sanitize_lyrics(text):
return text
def is_lyrics(text, artist):
"""Determine whether the text seems to be valid lyrics.
"""
@ -290,6 +332,7 @@ def is_lyrics(text, artist):
return len(badTriggers) < 2
def scrape_lyrics_from_url(url):
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
@ -306,7 +349,7 @@ def scrape_lyrics_from_url(url):
# Remove non relevant html parts
[s.extract() for s in soup(['head', 'script'])]
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
[s.extract() for s in comments]
try:
@ -315,7 +358,7 @@ def scrape_lyrics_from_url(url):
except Exception, e:
log.debug('Error %s when replacing containing marker by p marker' % e,
exc_info=True)
exc_info=True)
# Make better soup from current soup! The previous unclosed <p> sections
# are now closed. Use str() rather than prettify() as it's more
@ -343,6 +386,7 @@ def scrape_lyrics_from_url(url):
soup = BeautifulSoup(tagTokens[0])
return unescape(tagTokens[0].strip("\n\r: "))
def fetch_google(artist, title):
"""Fetch lyrics from Google search results.
"""
@ -378,6 +422,7 @@ def fetch_google(artist, title):
# Plugin logic.
class LyricsPlugin(BeetsPlugin):
def __init__(self):
super(LyricsPlugin, self).__init__()
@ -394,6 +439,7 @@ class LyricsPlugin(BeetsPlugin):
if self.config['google_API_key'].get():
self.backends.insert(0, fetch_google)
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
cmd.parser.add_option('-p', '--print', dest='printlyr',
@ -414,11 +460,14 @@ class LyricsPlugin(BeetsPlugin):
cmd.func = func
return [cmd]
# Auto-fetch lyrics on import.
def imported(self, session, task):
    """Auto-fetch lyrics on import.

    When the ``auto`` configuration option is set, lyrics are fetched
    for every item imported by ``task`` using the session's library.
    Progress is logged at debug level, and ``False`` is passed for both
    the ``write`` and ``force`` arguments of ``fetch_item_lyrics``.
    """
    if self.config['auto']:
        for item in task.imported_items():
            # Exactly one fetch per item: the call used to appear twice,
            # which doubled the lookups for every imported track.
            self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                   False, False)
def fetch_item_lyrics(self, lib, loglevel, item, write, force):
"""Fetch and store lyrics for a single item. If ``write``, then the
@ -434,18 +483,35 @@ class LyricsPlugin(BeetsPlugin):
(item.artist, item.title))
return
artist = remove_ft_artist_suffix(item.artist)
title = remove_parenthesized_suffix(\
remove_ft_artist_suffix(item.title))
# Fetch lyrics.
lyrics = self.get_lyrics(item.artist, item.title)
lyrics = self.get_lyrics(artist, title)
if not lyrics:
# Check for a songs combinations
# (e.g. Pink Floyd - Speak to Me / Breathe)
titles = split_multi_titles(title)
for t in titles:
lyrics_title = self.get_lyrics(artist, t)
if lyrics_title:
if lyrics :
lyrics += u"\n\n---\n\n%s" % lyrics_title
else:
lyrics = lyrics_title
if not lyrics:
log.log(loglevel, u'lyrics not found: %s - %s' %
(item.artist, item.title))
(artist, title))
if fallback:
lyrics = fallback
else:
return
else:
log.log(loglevel, u'fetched lyrics: %s - %s' %
(item.artist, item.title))
log.log(loglevel, u'fetched lyrics : %s - %s' %
(artist, title))
item.lyrics = lyrics
@ -453,6 +519,7 @@ class LyricsPlugin(BeetsPlugin):
item.try_write()
item.store()
def get_lyrics(self, artist, title):
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
@ -471,4 +538,4 @@ class LyricsPlugin(BeetsPlugin):
log.debug(u'got lyrics from backend: {0}'.format(
backend.__name__
))
return lyrics
return lyrics.strip()

View file

@ -13,6 +13,9 @@ New stuff:
* :doc:`/plugins/replaygain`: Added support for calculating ReplayGain values
with GStreamer as well as the mp3gain programs. This enables ReplayGain
calculation for any audio format.
* :doc:`/plugins/lyrics`: Better handling of songs whose title contains a
featured artist. Song combinations are now resolved (all lyrics are
appended). Thanks to KraYmer and paulp.
* Add support for `initial_key` as field in the library and tag for
media files. When the user sets this field with ``beet modify
initial_key=Am`` the media files will reflect this in their tags. The

45
test/test_lyrics.py Normal file
View file

@ -0,0 +1,45 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Tests for the 'lyrics' plugin"""
import _common
from _common import unittest
from beetsplug import lyrics
from beets import config
from beets.util import confit
class LyricsPluginTest(unittest.TestCase):
    """Unit tests for the string helpers of the lyrics plugin."""

    def setUp(self):
        """Instantiate the plugin before each test (instance is discarded)."""
        lyrics.LyricsPlugin()

    def test_split_multi_titles(self):
        """Combined titles are split on slashes; plain titles give None."""
        cases = [
            ('song1 / song2 / song3', ['song1', 'song2', 'song3']),
            ('song1/song2 song3', ['song1', 'song2 song3']),
            ('song1 song2', None),
        ]
        for raw, expected in cases:
            self.assertEqual(lyrics.split_multi_titles(raw), expected)

    def test_remove_ft_artist_suffix(self):
        """Featuring suffixes are removed; look-alike words are kept."""
        cases = [
            ('Bob featuring Marcia', 'Bob'),
            ('Bob feat Marcia', 'Bob'),
            ('Bob and Marcia', 'Bob'),
            ('Bob feat. Marcia', 'Bob'),
            ('Bob & Marcia', 'Bob'),
            ('Bob feats Marcia', 'Bob feats Marcia'),
        ]
        for raw, expected in cases:
            self.assertEqual(lyrics.remove_ft_artist_suffix(raw), expected)

    def test_remove_parenthesized_suffix(self):
        """Single, repeated and nested parenthesized suffixes are removed."""
        for raw in ('Song (live)', 'Song (live) (new)', 'Song (live (new))'):
            self.assertEqual(lyrics.remove_parenthesized_suffix(raw), 'Song')
def suite():
    """Return a test suite loaded by name from this module."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)
# When executed directly as a script, run the tests returned by suite().
if __name__ == '__main__':
    unittest.main(defaultTest='suite')