mirror of
https://github.com/beetbox/beets.git
synced 2025-12-15 21:14:19 +01:00
Merge branch 'KraYmer-lyrics-enh-tests'
This commit is contained in:
commit
2dafaa2f0c
3 changed files with 133 additions and 18 deletions
|
|
@ -64,17 +64,20 @@ def fetch_url(url):
|
|||
log.debug(u'failed to fetch: {0} ({1})'.format(url, unicode(exc)))
|
||||
return None
|
||||
|
||||
|
||||
def unescape(text):
    """Resolve ``&#xxx;`` numeric HTML entities and ``&nbsp;`` in ``text``.

    Byte strings are decoded as UTF-8 first (undecodable bytes are
    dropped), so the result is always a Unicode string. Numeric
    references that do not map to a valid code point are left untouched
    instead of raising.
    """
    if isinstance(text, bytes):  # ``bytes`` is ``str`` on Python 2.
        text = text.decode('utf8', 'ignore')

    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # Non-breaking spaces, written as an entity or as the literal
    # character, become plain spaces.
    out = text.replace(u'&nbsp;', u' ').replace(u'\xa0', u' ')

    def replchar(m):
        try:
            return to_char(int(m.group(1)))
        except (ValueError, OverflowError):
            # Out-of-range code point: keep the reference verbatim.
            return m.group(0)

    return re.sub(u'&#(\\d+);', replchar, out)
|
||||
|
||||
|
||||
def extract_text(html, starttag):
|
||||
"""Extract the text from a <DIV> tag in the HTML starting with
|
||||
``starttag``. Returns None if parsing fails.
|
||||
|
|
@ -90,11 +93,11 @@ def extract_text(html, starttag):
|
|||
parts = []
|
||||
pos = 0
|
||||
for match in DIV_RE.finditer(html):
|
||||
if match.group(1): # Closing tag.
|
||||
if match.group(1): # Closing tag.
|
||||
level -= 1
|
||||
if level == 0:
|
||||
pos = match.end()
|
||||
else: # Opening tag.
|
||||
else: # Opening tag.
|
||||
if level == 0:
|
||||
parts.append(html[pos:match.start()])
|
||||
|
||||
|
|
@ -109,6 +112,7 @@ def extract_text(html, starttag):
|
|||
lyrics = ''.join(parts)
|
||||
return strip_cruft(lyrics)
|
||||
|
||||
|
||||
def strip_cruft(lyrics, wscollapse=True):
|
||||
"""Clean up HTML from an extracted lyrics string. For example, <BR>
|
||||
tags are replaced with newlines.
|
||||
|
|
@ -116,15 +120,46 @@ def strip_cruft(lyrics, wscollapse=True):
|
|||
lyrics = COMMENT_RE.sub('', lyrics)
|
||||
lyrics = unescape(lyrics)
|
||||
if wscollapse:
|
||||
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
|
||||
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
|
||||
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
|
||||
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
|
||||
lyrics = re.sub(r'\n +', '\n', lyrics)
|
||||
lyrics = re.sub(r' +\n', '\n', lyrics)
|
||||
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
|
||||
lyrics = lyrics.replace('\r','\n')
|
||||
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
|
||||
lyrics = lyrics.replace('\r', '\n')
|
||||
lyrics = lyrics.strip()
|
||||
return lyrics
|
||||
|
||||
|
||||
def split_multi_titles(s):
    """Split a combined title such as "Speak to Me / Breathe" on slashes.

    Returns the list of individual titles with surrounding whitespace
    stripped, or None when ``s`` contains no slash.
    """
    pieces = s.split('/')
    if len(pieces) == 1:
        return None
    return [piece.strip() for piece in pieces]
|
||||
|
||||
|
||||
def remove_ft_artist_suffix(s):
    """Remove a featured-artist suffix from ``s``.

    Everything from the first " &", " and", " feat" or " featuring"
    onwards is dropped (case-insensitively), e.g. 'Bob feat. Marcia'
    becomes 'Bob'. Strings without such a separator are returned
    unchanged.
    """
    # The trailing \b applies to every keyword, so words that merely
    # start with one (e.g. "Andersen", "feats") are left alone.
    pattern = r"(.*?) (&|\b(and|feat(uring)?)\b)"
    match = re.search(pattern, s, re.IGNORECASE)
    if match:
        s = match.group(1)
    return s
|
||||
|
||||
|
||||
def remove_parenthesized_suffix(s):
    """Strip a trailing parenthesized part from ``s``.

    Typical suffixes are "(live)", "(remix)" or "(acoustic)". The string
    is returned unchanged unless it ends with whitespace followed by a
    parenthesized part.
    """
    found = re.search(r"(.+?)\s+[(].*[)]$", s, re.IGNORECASE)
    return found.group(1) if found else s
|
||||
|
||||
|
||||
def _encode(s):
|
||||
"""Encode the string for inclusion in a URL (common to both
|
||||
LyricsWiki and Lyrics.com).
|
||||
|
|
@ -139,6 +174,7 @@ def _encode(s):
|
|||
# LyricsWiki.
|
||||
|
||||
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
|
||||
|
||||
def _lw_encode(s):
|
||||
s = re.sub(r'\s+', '_', s)
|
||||
s = s.replace("<", "Less_Than")
|
||||
|
|
@ -148,6 +184,7 @@ def _lw_encode(s):
|
|||
s = re.sub(r'[\]\}]', ')', s)
|
||||
return _encode(s)
|
||||
|
||||
|
||||
def fetch_lyricswiki(artist, title):
|
||||
"""Fetch lyrics from LyricsWiki."""
|
||||
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
|
||||
|
|
@ -167,11 +204,13 @@ LYRICSCOM_NOT_FOUND = (
|
|||
'Sorry, we do not have the lyric',
|
||||
'Submit Lyrics',
|
||||
)
|
||||
|
||||
def _lc_encode(s):
    """Encode ``s`` for use in a Lyrics.com URL."""
    # Drop everything except word characters, whitespace and hyphens,
    # then turn each whitespace run into a single hyphen.
    cleaned = re.sub(r'[^\w\s-]', '', s)
    hyphenated = re.sub(r'\s+', '-', cleaned)
    return _encode(hyphenated).lower()
|
||||
|
||||
|
||||
def fetch_lyricscom(artist, title):
|
||||
"""Fetch lyrics from Lyrics.com."""
|
||||
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
|
||||
|
|
@ -227,13 +266,14 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
|
|||
return True
|
||||
# or try extracting song title from URL title and check if
|
||||
# they are close enough
|
||||
tokens = [by+'%20'+artist for by in BY_TRANS] + \
|
||||
tokens = [by + '%20' + artist for by in BY_TRANS] + \
|
||||
[artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS
|
||||
songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20')
|
||||
songTitle = re.sub(u'(%s)' % u'|'.join(tokens) , u'', urlTitle).strip('%20')
|
||||
|
||||
typoRatio = .8
|
||||
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
|
||||
|
||||
|
||||
def insert_line_feeds(text):
|
||||
"""Insert newlines before upper-case characters.
|
||||
"""
|
||||
|
|
@ -243,6 +283,7 @@ def insert_line_feeds(text):
|
|||
tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
|
||||
return ''.join(tokensStr)
|
||||
|
||||
|
||||
def sanitize_lyrics(text):
|
||||
"""Clean text, returning raw lyrics as output or None if it happens
|
||||
that input text is actually not lyrics content. Clean (x)html tags
|
||||
|
|
@ -254,7 +295,7 @@ def sanitize_lyrics(text):
|
|||
if '\n' not in text:
|
||||
text = insert_line_feeds(text)
|
||||
|
||||
while text.count('\n\n') > text.count('\n')/4:
|
||||
while text.count('\n\n') > text.count('\n') / 4:
|
||||
# Remove first occurrence of \n for each sequence of \n
|
||||
text = re.sub(r'\n(\n+)', '\g<1>', text)
|
||||
|
||||
|
|
@ -262,6 +303,7 @@ def sanitize_lyrics(text):
|
|||
|
||||
return text
|
||||
|
||||
|
||||
def is_lyrics(text, artist):
|
||||
"""Determine whether the text seems to be valid lyrics.
|
||||
"""
|
||||
|
|
@ -290,6 +332,7 @@ def is_lyrics(text, artist):
|
|||
|
||||
return len(badTriggers) < 2
|
||||
|
||||
|
||||
def scrape_lyrics_from_url(url):
|
||||
"""Scrape lyrics from a URL. If no lyrics can be found, return None
|
||||
instead.
|
||||
|
|
@ -306,7 +349,7 @@ def scrape_lyrics_from_url(url):
|
|||
|
||||
# Remove non relevant html parts
|
||||
[s.extract() for s in soup(['head', 'script'])]
|
||||
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
|
||||
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
|
||||
[s.extract() for s in comments]
|
||||
|
||||
try:
|
||||
|
|
@ -315,7 +358,7 @@ def scrape_lyrics_from_url(url):
|
|||
|
||||
except Exception, e:
|
||||
log.debug('Error %s when replacing containing marker by p marker' % e,
|
||||
exc_info=True)
|
||||
exc_info=True)
|
||||
|
||||
# Make better soup from current soup! The previous unclosed <p> sections
|
||||
# are now closed. Use str() rather than prettify() as it's more
|
||||
|
|
@ -343,6 +386,7 @@ def scrape_lyrics_from_url(url):
|
|||
soup = BeautifulSoup(tagTokens[0])
|
||||
return unescape(tagTokens[0].strip("\n\r: "))
|
||||
|
||||
|
||||
def fetch_google(artist, title):
|
||||
"""Fetch lyrics from Google search results.
|
||||
"""
|
||||
|
|
@ -378,6 +422,7 @@ def fetch_google(artist, title):
|
|||
|
||||
# Plugin logic.
|
||||
|
||||
|
||||
class LyricsPlugin(BeetsPlugin):
|
||||
def __init__(self):
|
||||
super(LyricsPlugin, self).__init__()
|
||||
|
|
@ -394,6 +439,7 @@ class LyricsPlugin(BeetsPlugin):
|
|||
if self.config['google_API_key'].get():
|
||||
self.backends.insert(0, fetch_google)
|
||||
|
||||
|
||||
def commands(self):
|
||||
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
|
||||
cmd.parser.add_option('-p', '--print', dest='printlyr',
|
||||
|
|
@ -414,11 +460,14 @@ class LyricsPlugin(BeetsPlugin):
|
|||
cmd.func = func
|
||||
return [cmd]
|
||||
|
||||
# Auto-fetch lyrics on import.
|
||||
|
||||
def imported(self, session, task):
|
||||
"""Auto-fetch lyrics on import"""
|
||||
if self.config['auto']:
|
||||
for item in task.imported_items():
|
||||
self.fetch_item_lyrics(session.lib, logging.DEBUG, item, False, False)
|
||||
self.fetch_item_lyrics(session.lib, logging.DEBUG, item, \
|
||||
False, False)
|
||||
|
||||
|
||||
def fetch_item_lyrics(self, lib, loglevel, item, write, force):
|
||||
"""Fetch and store lyrics for a single item. If ``write``, then the
|
||||
|
|
@ -434,18 +483,35 @@ class LyricsPlugin(BeetsPlugin):
|
|||
(item.artist, item.title))
|
||||
return
|
||||
|
||||
artist = remove_ft_artist_suffix(item.artist)
|
||||
title = remove_parenthesized_suffix(\
|
||||
remove_ft_artist_suffix(item.title))
|
||||
|
||||
# Fetch lyrics.
|
||||
lyrics = self.get_lyrics(item.artist, item.title)
|
||||
lyrics = self.get_lyrics(artist, title)
|
||||
|
||||
if not lyrics:
|
||||
# Check for a songs combinations
|
||||
# (e.g. Pink Floyd - Speak to Me / Breathe)
|
||||
titles = split_multi_titles(title)
|
||||
for t in titles:
|
||||
lyrics_title = self.get_lyrics(artist, t)
|
||||
if lyrics_title:
|
||||
if lyrics :
|
||||
lyrics += u"\n\n---\n\n%s" % lyrics_title
|
||||
else:
|
||||
lyrics = lyrics_title
|
||||
|
||||
if not lyrics:
|
||||
log.log(loglevel, u'lyrics not found: %s - %s' %
|
||||
(item.artist, item.title))
|
||||
(artist, title))
|
||||
if fallback:
|
||||
lyrics = fallback
|
||||
else:
|
||||
return
|
||||
else:
|
||||
log.log(loglevel, u'fetched lyrics: %s - %s' %
|
||||
(item.artist, item.title))
|
||||
log.log(loglevel, u'fetched lyrics : %s - %s' %
|
||||
(artist, title))
|
||||
|
||||
item.lyrics = lyrics
|
||||
|
||||
|
|
@ -453,6 +519,7 @@ class LyricsPlugin(BeetsPlugin):
|
|||
item.try_write()
|
||||
item.store()
|
||||
|
||||
|
||||
def get_lyrics(self, artist, title):
|
||||
"""Fetch lyrics, trying each source in turn. Return a string or
|
||||
None if no lyrics were found.
|
||||
|
|
@ -471,4 +538,4 @@ class LyricsPlugin(BeetsPlugin):
|
|||
log.debug(u'got lyrics from backend: {0}'.format(
|
||||
backend.__name__
|
||||
))
|
||||
return lyrics
|
||||
return lyrics.strip()
|
||||
|
|
|
|||
|
|
@ -13,6 +13,9 @@ New stuff:
|
|||
* :doc:`/plugins/replaygain`: Added support for calculating ReplayGain values
|
||||
with GStreamer as well as the mp3gain programs. This enables ReplayGain
|
||||
calculation for any audio format.
|
||||
* :doc:`/plugins/lyrics`: Better handling of songs whose title contains a
|
||||
featured artist. Song combinations are now resolved (all lyrics are
|
||||
appended). Thanks to KraYmer and paulp.
|
||||
* Add support for `initial_key` as field in the library and tag for
|
||||
media files. When the user sets this field with ``beet modify
|
||||
initial_key=Am`` the media files will reflect this in their tags. The
|
||||
|
|
|
|||
45
test/test_lyrics.py
Normal file
45
test/test_lyrics.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
#!/usr/bin/python
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
"""Tests for the 'lyrics' plugin"""
|
||||
|
||||
import _common
|
||||
from _common import unittest
|
||||
from beetsplug import lyrics
|
||||
from beets import config
|
||||
from beets.util import confit
|
||||
|
||||
|
||||
class LyricsPluginTest(unittest.TestCase):
    """Unit tests for the string helpers of the lyrics plugin."""

    def setUp(self):
        """Instantiate the plugin before each test."""
        lyrics.LyricsPlugin()

    def _assert_all(self, func, expectations):
        """Assert that ``func(arg)`` equals ``expected`` for every pair."""
        for arg, expected in expectations:
            self.assertEqual(func(arg), expected)

    def test_split_multi_titles(self):
        self._assert_all(lyrics.split_multi_titles, [
            ('song1 / song2 / song3', ['song1', 'song2', 'song3']),
            ('song1/song2 song3', ['song1', 'song2 song3']),
            ('song1 song2', None),
        ])

    def test_remove_ft_artist_suffix(self):
        self._assert_all(lyrics.remove_ft_artist_suffix, [
            ('Bob featuring Marcia', 'Bob'),
            ('Bob feat Marcia', 'Bob'),
            ('Bob and Marcia', 'Bob'),
            ('Bob feat. Marcia', 'Bob'),
            ('Bob & Marcia', 'Bob'),
            ('Bob feats Marcia', 'Bob feats Marcia'),
        ])

    def test_remove_parenthesized_suffix(self):
        self._assert_all(lyrics.remove_parenthesized_suffix, [
            ('Song (live)', 'Song'),
            ('Song (live) (new)', 'Song'),
            ('Song (live (new))', 'Song'),
        ])
|
||||
|
||||
|
||||
def suite():
    """Build a test suite from the tests found in this module."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main(defaultTest='suite')
|
||||
|
||||
Loading…
Reference in a new issue