Merge pull request #1148 from Kraymer/lyrics_musixmatch

lyrics: add 'musixmatch' source

Conflicts:
	beetsplug/lyrics.py
This commit is contained in:
Adrian Sampson 2014-12-18 10:36:16 +00:00
commit ac3f0824b0
11 changed files with 118 additions and 76 deletions

View file

@ -16,8 +16,10 @@
import logging
import traceback
from collections import defaultdict
import inspect
import re
from collections import defaultdict
import beets
from beets import mediafile
@ -402,3 +404,32 @@ def send(event, **arguments):
argspec = inspect.getargspec(handler).args
args = dict((k, v) for k, v in arguments.items() if k in argspec)
handler(**args)
def feat_tokens(for_artist=True):
    """Return a regular expression that matches phrases like "featuring"
    that separate a main artist or a song title from secondary artists.
    The `for_artist` option determines whether the regex should be
    suitable for matching artist fields (the default) or title fields.

    The result is an uncompiled pattern string. A token only matches
    when it has a whitespace character immediately before and after it.
    """
    feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.']
    if for_artist:
        # Artist fields additionally use generic joining words.
        feat_words += ['with', 'vs', 'and', 'con', '&']
    # Raw string: ``\s`` is a regex character class, not a string escape.
    return r'(?<=\s)(?:{0})(?=\s)'.format(
        '|'.join(re.escape(x) for x in feat_words)
    )
def sanitize_choices(choices, choices_all):
    """Clean up a stringlist configuration attribute: keep only choices
    elements present in choices_all, remove duplicate elements, expand '*'
    wildcard while keeping original stringlist order.

    `choices` is the user-supplied list; `choices_all` is the iterable
    of valid values. A '*' entry is replaced by every element of
    `choices_all` that does not appear in `choices`, in `choices_all`
    order. Only the first occurrence of any entry (including '*') is
    honoured. Returns a new list.
    """
    # Materialize once: the valid set is needed for both the wildcard
    # expansion and the per-element membership test.
    known = list(choices_all)
    others = [x for x in known if x not in choices]
    seen = set()
    res = []
    for s in choices:
        if s != '*' and s not in known:
            # Unknown choice: silently dropped.
            continue
        if s in seen:
            # Duplicate: only the first occurrence counts.
            continue
        seen.add(s)
        if s == '*':
            res.extend(others)
        else:
            res.append(s)
    return res

View file

@ -678,17 +678,3 @@ def max_filename_length(path, limit=MAX_FILENAME_LENGTH):
return min(res[9], limit)
else:
return limit
def feat_tokens(for_artist=True):
    """Return a regular expression that matches phrases like "featuring"
    that separate a main artist or a song title from secondary artists.
    The `for_artist` option determines whether the regex should be
    suitable for matching artist fields (the default) or title fields.

    The returned value is a pattern string (not a compiled regex) whose
    matches are always delimited by whitespace on both sides.
    """
    feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.']
    if for_artist:
        # Joining words that only make sense between artist names.
        feat_words += ['with', 'vs', 'and', 'con', '&']
    alternatives = '|'.join(re.escape(x) for x in feat_words)
    # Raw string so that ``\s`` reaches the regex engine untouched.
    return r'(?<=\s)(?:{0})(?=\s)'.format(alternatives)

View file

@ -22,12 +22,12 @@ from tempfile import NamedTemporaryFile
import requests
from beets.plugins import BeetsPlugin
from beets.util.artresizer import ArtResizer
from beets import plugins
from beets import importer
from beets import ui
from beets import util
from beets import config
from beets.util.artresizer import ArtResizer
try:
import itunes
@ -319,23 +319,7 @@ def batch_fetch_art(lib, albums, force, maxwidth=None):
message))
def sanitize_sources(sources):
    """Clean up the user's configured source list. Remove unknown or
    duplicate sources while keeping original order.

    A '*' entry expands to every source in ``SOURCES_ALL`` that is not
    listed explicitly, in ``SOURCES_ALL`` order. The 'itunes' source is
    dropped from the result when ``HAVE_ITUNES`` is false.
    """
    # Use a list rather than a set difference so that the wildcard
    # expansion has a deterministic order.
    others = [s for s in SOURCES_ALL if s not in sources]
    seen = set()
    res = []
    for s in sources:
        if s != '*' and s not in SOURCES_ALL:
            # Unknown source name: ignore it.
            continue
        if s in seen:
            # Only the first occurrence of a source is kept.
            continue
        seen.add(s)
        if s == '*':
            res.extend(others)
        else:
            res.append(s)
    if not HAVE_ITUNES and 'itunes' in res:
        res.remove('itunes')
    return res
class FetchArtPlugin(BeetsPlugin):
class FetchArtPlugin(plugins.BeetsPlugin):
def __init__(self):
super(FetchArtPlugin, self).__init__()
@ -359,8 +343,10 @@ class FetchArtPlugin(BeetsPlugin):
self.import_stages = [self.fetch_art]
self.register_listener('import_task_files', self.assign_art)
self.config['sources'] = sanitize_sources(
self.config['sources'].as_str_seq())
if not HAVE_ITUNES and u'itunes' in SOURCES_ALL:
SOURCES_ALL.remove(u'itunes')
self.config['sources'] = plugins.sanitize_choices(
self.config['sources'].as_str_seq(), SOURCES_ALL)
# Asynchronous; after music is added to the library.
def fetch_art(self, session, task):

View file

@ -14,9 +14,9 @@
"""Moves "featured" artists to the title from the artist field.
"""
from beets.plugins import BeetsPlugin
from beets import plugins
from beets import ui
from beets.util import displayable_path, feat_tokens
from beets.util import displayable_path
from beets import config
import logging
import re
@ -31,7 +31,7 @@ def split_on_feat(artist):
may be a string or None if none is present.
"""
# split on the first "feat".
regex = re.compile(feat_tokens(), re.IGNORECASE)
regex = re.compile(plugins.feat_tokens(), re.IGNORECASE)
parts = [s.strip() for s in regex.split(artist, 1)]
if len(parts) == 1:
return parts[0], None
@ -42,7 +42,7 @@ def split_on_feat(artist):
def contains_feat(title):
"""Determine whether the title contains a "featured" marker.
"""
return bool(re.search(feat_tokens(), title, flags=re.IGNORECASE))
return bool(re.search(plugins.feat_tokens(), title, flags=re.IGNORECASE))
def update_metadata(item, feat_part, drop_feat):
@ -110,7 +110,7 @@ def ft_in_title(item, drop_feat):
ui.print_()
class FtInTitlePlugin(BeetsPlugin):
class FtInTitlePlugin(plugins.BeetsPlugin):
def __init__(self):
super(FtInTitlePlugin, self).__init__()

View file

@ -26,10 +26,8 @@ import difflib
import itertools
from HTMLParser import HTMLParseError
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
from beets.util import feat_tokens
from beets import plugins
from beets import config, ui
# Global logger.
@ -86,10 +84,17 @@ def unescape(text):
return out
def extract_text(html, starttag):
def extract_text_between(html, start_marker, end_marker):
    """Return the cleaned-up text found between the first occurrence of
    ``start_marker`` and the next occurrence of ``end_marker`` in
    ``html``.

    Raises ValueError (from the tuple unpacking) when either marker is
    absent. The extracted fragment is passed through
    ``_scrape_strip_cruft`` before being returned.
    """
    _before, remainder = html.split(start_marker, 1)
    inner, _after = remainder.split(end_marker, 1)
    return _scrape_strip_cruft(inner, True)
def extract_text_in(html, starttag):
"""Extract the text from a <DIV> tag in the HTML starting with
``starttag``. Returns None if parsing fails.
"""
# Strip off the leading text before opening tag.
try:
_, html = html.split(starttag, 1)
@ -138,7 +143,7 @@ def search_pairs(item):
artists = [artist]
# Remove any featuring artists from the artists name
pattern = r"(.*?) {0}".format(feat_tokens())
pattern = r"(.*?) {0}".format(plugins.feat_tokens())
match = re.search(pattern, artist, re.IGNORECASE)
if match:
artists.append(match.group(1))
@ -151,7 +156,7 @@ def search_pairs(item):
titles.append(match.group(1))
# Remove any featuring artists from the title
pattern = r"(.*?) {0}".format(feat_tokens(for_artist=False))
pattern = r"(.*?) {0}".format(plugins.feat_tokens(for_artist=False))
for title in titles[:]:
match = re.search(pattern, title, re.IGNORECASE)
if match:
@ -178,6 +183,19 @@ def _encode(s):
s = s.encode('utf8', 'ignore')
return urllib.quote(s)
# Musixmatch
MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
def fetch_musixmatch(artist, title):
    """Fetch lyrics for ``title`` by ``artist`` from musixmatch.com.

    The page URL is built from the title-cased, encoded artist and
    title. The lyrics are taken from the text between the
    ``"lyrics_body":`` and ``"lyrics_language":`` markers in the page.
    Returns the lyrics string, or None when the page cannot be fetched
    or does not contain the expected markers.
    """
    url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()),
                                    _lw_encode(title.title()))
    html = fetch_url(url)
    if not html:
        return
    try:
        lyrics = extract_text_between(html, '"lyrics_body":',
                                      '"lyrics_language":')
    except ValueError:
        # One of the markers is missing (e.g. a "not found" page):
        # treat this as "no lyrics" instead of crashing the plugin.
        return
    if not lyrics:
        return
    # Drop the surrounding JSON punctuation and turn the escaped
    # newlines into real ones.
    return lyrics.strip(',"').replace('\\n', '\n')
# LyricsWiki.
@ -201,7 +219,7 @@ def fetch_lyricswiki(artist, title):
if not html:
return
lyrics = extract_text(html, "<div class='lyricbox'>")
lyrics = extract_text_in(html, "<div class='lyricbox'>")
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
@ -228,7 +246,7 @@ def fetch_lyricscom(artist, title):
if not html:
return
lyrics = extract_text(html, '<div id="lyric_space">')
lyrics = extract_text_in(html, '<div id="lyric_space">')
if not lyrics:
return
for not_found_str in LYRICSCOM_NOT_FOUND:
@ -411,8 +429,14 @@ def fetch_google(artist, title):
# Plugin logic.
# Names of the available lyrics sources. This list is the default value
# of the plugin's ``sources`` configuration option.
SOURCES_KEYS = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
# Maps each source name to the function that fetches lyrics from it.
SOURCES_ALL = {'google': fetch_google,
'lyricwiki': fetch_lyricswiki,
'lyrics.com': fetch_lyricscom,
'musixmatch': fetch_musixmatch}
class LyricsPlugin(BeetsPlugin):
class LyricsPlugin(plugins.BeetsPlugin):
def __init__(self):
super(LyricsPlugin, self).__init__()
self.import_stages = [self.imported]
@ -422,12 +446,17 @@ class LyricsPlugin(BeetsPlugin):
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
'fallback': None,
'force': False,
'sources': SOURCES_KEYS,
})
self.backends = [fetch_lyricswiki, fetch_lyricscom]
if self.config['google_API_key'].get():
self.backends.insert(0, fetch_google)
if not self.config['google_API_key'].get() and \
'google' in SOURCES_KEYS:
SOURCES_KEYS.remove('google')
self.config['sources'] = plugins.sanitize_choices(
self.config['sources'].as_str_seq(), SOURCES_KEYS)
self.backends = []
for key in self.config['sources'].as_str_seq():
self.backends.append(SOURCES_ALL[key])
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')

View file

@ -10,6 +10,9 @@ library by typing ``pip install requests`` or the equivalent for your OS.
New:
* :doc:`/plugins/lyrics`: Add `musixmatch`_ source and introduce a new ``sources``
config option that lets you choose exactly where to look for lyrics and in
which order.
* :doc:`/plugins/lyrics`: Add Brazilian and Hispanic sources to the Google
custom search engine.
* A new :doc:`/plugins/permissions` makes it easy to fix permissions on music
@ -48,7 +51,7 @@ Fixed:
.. _API changes: http://developer.echonest.com/forums/thread/3650
.. _Plex: https://plex.tv/
.. _musixmatch: https://www.musixmatch.com/
1.3.9 (November 17, 2014)
-------------------------

View file

@ -46,8 +46,13 @@ configuration file. The available options are:
backend).
Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The beets custom search engine, which gathers a list of sources
Default: The `beets custom search engine`_, which gathers a list of sources
known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk `*` expands
to all available sources.
Default: ``google lyricwiki lyrics.com musixmatch``, i.e., all sources.
The *google* source will be automatically deactivated if no
``google_API_key`` is set up.
Here's an example of ``config.yaml``::
@ -56,6 +61,7 @@ Here's an example of ``config.yaml``::
google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
google_engine_ID: 009217259823014548361:lndtuqkycfu
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
Fetching Lyrics Manually
------------------------
@ -96,7 +102,7 @@ default, beets use a list of sources known to be scrapeable.
.. _define a custom search engine: http://www.google.com/cse/all
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on its other data sources.
After that, the lyrics plugin will fall back on other declared data sources.
.. _pip: http://www.pip-installer.org/
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

View file

@ -47,7 +47,7 @@ def main(argv=None):
url = s['url'] + s['path']
fn = test_lyrics.url_to_filename(url)
if not os.path.isfile(fn):
html = requests.get(url).text
html = requests.get(url, verify=False).text
with safe_open_w(fn) as f:
f.write(html.encode('utf8'))

View file

@ -12,8 +12,6 @@
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
from beetsplug import fetchart
import os.path
from _common import unittest
from helper import TestHelper
@ -43,17 +41,6 @@ class FetchartCliTest(unittest.TestCase, TestHelper):
with open(cover_path, 'r') as f:
self.assertEqual(f.read(), 'IMAGE')
def test_sanitize_sources(self):
    """Check that unknown and duplicate sources are removed and that
    the '*' wildcard is expanded in place.
    """
    without_unknown = fetchart.sanitize_sources(['google', 'unknown'])
    self.assertEqual(without_unknown, ['google'])

    without_duplicate = fetchart.sanitize_sources(['google', 'google'])
    self.assertEqual(without_duplicate, ['google'])

    expanded = fetchart.sanitize_sources(['google', '*', 'amazon'])
    # Don't check strict equality on lengths, as the itunes source may
    # be removed by the plugin.
    min_len = len(fetchart.SOURCES_ALL) - 1
    ok = (len(expanded) >= min_len and
          expanded[0] == 'google' and
          expanded[-1] == 'amazon')
    self.assertTrue(ok)
def suite():
return unittest.TestLoader().loadTestsFromName(__name__)

View file

@ -17,6 +17,7 @@
import os
import _common
import sys
import re
from _common import unittest
from beetsplug import lyrics
from beets.library import Item
@ -163,7 +164,7 @@ class LyricsPluginTest(unittest.TestCase):
def url_to_filename(url):
url = url.replace('http://', '').replace('www.', '')
url = re.sub(r'https?://|www.', '', url)
fn = "".join(x for x in url if (x.isalnum() or x == '/'))
fn = fn.split('/')
fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt'
@ -206,7 +207,9 @@ DEFAULT_SOURCES = [
dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
path=u'The_Beatles:Lady_Madonna'),
dict(DEFAULT_SONG, url='http://www.lyrics.com/',
path=u'lady-madonna-lyrics-the-beatles.html')
path=u'lady-madonna-lyrics-the-beatles.html'),
dict(DEFAULT_SONG, url='https://www.musixmatch.com/',
path=u'lyrics/The-Beatles/Lady-Madonna'),
]
# Every source entered in default beets google custom search engine
@ -307,8 +310,9 @@ class LyricsGooglePluginTest(unittest.TestCase):
"""Test default engines with the default query"""
if not check_lyrics_fetched():
self.skipTest("Run lyrics_download_samples.py script first.")
for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom),
DEFAULT_SOURCES):
for (fun, s) in zip([lyrics.fetch_lyricswiki,
lyrics.fetch_lyricscom,
lyrics.fetch_musixmatch], DEFAULT_SOURCES):
if os.path.isfile(url_to_filename(
s['url'] + s['path'])):
res = fun(s['artist'], s['title'])

View file

@ -151,6 +151,16 @@ class ItemTypeConflictTest(unittest.TestCase, TestHelper):
self.assertNotEqual(None, plugins.types(Item))
class HelpersTest(unittest.TestCase):
    """Tests for the helper functions of the ``plugins`` module."""

    def test_sanitize_choices(self):
        # Choices that are not in the list of valid values are dropped.
        self.assertEqual(plugins.sanitize_choices(['A', 'Z'], ('A', 'B')),
                         ['A'])
        # Duplicates are collapsed. The trailing comma matters:
        # ``('A')`` is the string 'A', not a one-element tuple.
        self.assertEqual(plugins.sanitize_choices(['A', 'A'], ('A',)),
                         ['A'])
        # '*' expands to the valid values not listed explicitly, at the
        # position of the wildcard.
        self.assertEqual(plugins.sanitize_choices(['D', '*', 'A'],
                                                  ('A', 'B', 'C', 'D')),
                         ['D', 'B', 'C', 'A'])
def suite():
return unittest.TestLoader().loadTestsFromName(__name__)