diff --git a/beets/plugins.py b/beets/plugins.py index 2b68f2d02..8611b92a6 100755 --- a/beets/plugins.py +++ b/beets/plugins.py @@ -16,8 +16,10 @@ import logging import traceback -from collections import defaultdict import inspect +import re +from collections import defaultdict + import beets from beets import mediafile @@ -402,3 +404,32 @@ def send(event, **arguments): argspec = inspect.getargspec(handler).args args = dict((k, v) for k, v in arguments.items() if k in argspec) handler(**args) + + +def feat_tokens(for_artist=True): + """Return a regular expression that matches phrases like "featuring" + that separate a main artist or a song title from secondary artists. + The `for_artist` option determines whether the regex should be + suitable for matching artist fields (the default) or title fields. + """ + feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.'] + if for_artist: + feat_words += ['with', 'vs', 'and', 'con', '&'] + return '(?<=\s)(?:{0})(?=\s)'.format( + '|'.join(re.escape(x) for x in feat_words) + ) + + +def sanitize_choices(choices, choices_all): + """Clean up a stringlist configuration attribute: keep only choices + elements present in choices_all, remove duplicate elements, expand '*' + wildcard while keeping original stringlist order. + """ + seen = set() + others = [x for x in choices_all if x not in choices] + res = [] + for s in choices: + if s in list(choices_all) + ['*']: + if not (s in seen or seen.add(s)): + res.extend(list(others) if s == '*' else [s]) + return res diff --git a/beets/util/__init__.py b/beets/util/__init__.py index 54cf423e1..529bbb2f3 100644 --- a/beets/util/__init__.py +++ b/beets/util/__init__.py @@ -678,17 +678,3 @@ def max_filename_length(path, limit=MAX_FILENAME_LENGTH): return min(res[9], limit) else: return limit - - -def feat_tokens(for_artist=True): - """Return a regular expression that matches phrases like "featuring" - that separate a main artist or a song title from secondary artists. 
- The `for_artist` option determines whether the regex should be - suitable for matching artist fields (the default) or title fields. - """ - feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.'] - if for_artist: - feat_words += ['with', 'vs', 'and', 'con', '&'] - return '(?<=\s)(?:{0})(?=\s)'.format( - '|'.join(re.escape(x) for x in feat_words) - ) diff --git a/beetsplug/fetchart.py b/beetsplug/fetchart.py index 98e9fda3e..aa670c5ab 100644 --- a/beetsplug/fetchart.py +++ b/beetsplug/fetchart.py @@ -22,12 +22,12 @@ from tempfile import NamedTemporaryFile import requests -from beets.plugins import BeetsPlugin -from beets.util.artresizer import ArtResizer +from beets import plugins from beets import importer from beets import ui from beets import util from beets import config +from beets.util.artresizer import ArtResizer try: import itunes @@ -319,23 +319,7 @@ def batch_fetch_art(lib, albums, force, maxwidth=None): message)) -def sanitize_sources(sources): - """Clean up the user's configured source list. Remove unknown or - duplicate sources while keeping original order. 
- """ - seen = set() - others = set(SOURCES_ALL) - set(sources) - res = [] - for s in sources: - if s in SOURCES_ALL + ['*']: - if not (s in seen or seen.add(s)): - res.extend(list(others) if s == '*' else [s]) - if not HAVE_ITUNES and 'itunes' in res: - res.remove('itunes') - return res - - -class FetchArtPlugin(BeetsPlugin): +class FetchArtPlugin(plugins.BeetsPlugin): def __init__(self): super(FetchArtPlugin, self).__init__() @@ -359,8 +343,10 @@ class FetchArtPlugin(BeetsPlugin): self.import_stages = [self.fetch_art] self.register_listener('import_task_files', self.assign_art) - self.config['sources'] = sanitize_sources( - self.config['sources'].as_str_seq()) + if not HAVE_ITUNES and u'itunes' in SOURCES_ALL: + SOURCES_ALL.remove(u'itunes') + self.config['sources'] = plugins.sanitize_choices( + self.config['sources'].as_str_seq(), SOURCES_ALL) # Asynchronous; after music is added to the library. def fetch_art(self, session, task): diff --git a/beetsplug/ftintitle.py b/beetsplug/ftintitle.py index e83836e0e..75134ae9c 100644 --- a/beetsplug/ftintitle.py +++ b/beetsplug/ftintitle.py @@ -14,9 +14,9 @@ """Moves "featured" artists to the title from the artist field. """ -from beets.plugins import BeetsPlugin +from beets import plugins from beets import ui -from beets.util import displayable_path, feat_tokens +from beets.util import displayable_path from beets import config import logging import re @@ -31,7 +31,7 @@ def split_on_feat(artist): may be a string or None if none is present. """ # split on the first "feat". - regex = re.compile(feat_tokens(), re.IGNORECASE) + regex = re.compile(plugins.feat_tokens(), re.IGNORECASE) parts = [s.strip() for s in regex.split(artist, 1)] if len(parts) == 1: return parts[0], None @@ -42,7 +42,7 @@ def split_on_feat(artist): def contains_feat(title): """Determine whether the title contains a "featured" marker. 
""" - return bool(re.search(feat_tokens(), title, flags=re.IGNORECASE)) + return bool(re.search(plugins.feat_tokens(), title, flags=re.IGNORECASE)) def update_metadata(item, feat_part, drop_feat): @@ -110,7 +110,7 @@ def ft_in_title(item, drop_feat): ui.print_() -class FtInTitlePlugin(BeetsPlugin): +class FtInTitlePlugin(plugins.BeetsPlugin): def __init__(self): super(FtInTitlePlugin, self).__init__() diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 290142790..8cdee70e6 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -26,10 +26,8 @@ import difflib import itertools from HTMLParser import HTMLParseError -from beets.plugins import BeetsPlugin -from beets import ui -from beets import config -from beets.util import feat_tokens +from beets import plugins +from beets import config, ui # Global logger. @@ -86,10 +84,17 @@ def unescape(text): return out -def extract_text(html, starttag): +def extract_text_between(html, start_marker, end_marker): + _, html = html.split(start_marker, 1) + html, _ = html.split(end_marker, 1) + return _scrape_strip_cruft(html, True) + + +def extract_text_in(html, starttag): """Extract the text from a
tag in the HTML starting with ``starttag``. Returns None if parsing fails. """ + # Strip off the leading text before opening tag. try: _, html = html.split(starttag, 1) @@ -138,7 +143,7 @@ def search_pairs(item): artists = [artist] # Remove any featuring artists from the artists name - pattern = r"(.*?) {0}".format(feat_tokens()) + pattern = r"(.*?) {0}".format(plugins.feat_tokens()) match = re.search(pattern, artist, re.IGNORECASE) if match: artists.append(match.group(1)) @@ -151,7 +156,7 @@ def search_pairs(item): titles.append(match.group(1)) # Remove any featuring artists from the title - pattern = r"(.*?) {0}".format(feat_tokens(for_artist=False)) + pattern = r"(.*?) {0}".format(plugins.feat_tokens(for_artist=False)) for title in titles[:]: match = re.search(pattern, title, re.IGNORECASE) if match: @@ -178,6 +183,19 @@ def _encode(s): s = s.encode('utf8', 'ignore') return urllib.quote(s) +# Musixmatch + +MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s' + + +def fetch_musixmatch(artist, title): + url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()), + _lw_encode(title.title())) + html = fetch_url(url) + if not html: + return + lyrics = extract_text_between(html, '"lyrics_body":', '"lyrics_language":') + return lyrics.strip(',"').replace('\\n', '\n') # LyricsWiki. @@ -201,7 +219,7 @@ def fetch_lyricswiki(artist, title): if not html: return - lyrics = extract_text(html, "
") + lyrics = extract_text_in(html, "
") if lyrics and 'Unfortunately, we are not licensed' not in lyrics: return lyrics @@ -228,7 +246,7 @@ def fetch_lyricscom(artist, title): if not html: return - lyrics = extract_text(html, '
') + lyrics = extract_text_in(html, '
') if not lyrics: return for not_found_str in LYRICSCOM_NOT_FOUND: @@ -411,8 +429,14 @@ def fetch_google(artist, title): # Plugin logic. +SOURCES_KEYS = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch'] +SOURCES_ALL = {'google': fetch_google, + 'lyricwiki': fetch_lyricswiki, + 'lyrics.com': fetch_lyricscom, + 'musixmatch': fetch_musixmatch} -class LyricsPlugin(BeetsPlugin): + +class LyricsPlugin(plugins.BeetsPlugin): def __init__(self): super(LyricsPlugin, self).__init__() self.import_stages = [self.imported] @@ -422,12 +446,17 @@ class LyricsPlugin(BeetsPlugin): 'google_engine_ID': u'009217259823014548361:lndtuqkycfu', 'fallback': None, 'force': False, + 'sources': SOURCES_KEYS, }) - self.backends = [fetch_lyricswiki, fetch_lyricscom] - - if self.config['google_API_key'].get(): - self.backends.insert(0, fetch_google) + if not self.config['google_API_key'].get() and \ + 'google' in SOURCES_KEYS: + SOURCES_KEYS.remove('google') + self.config['sources'] = plugins.sanitize_choices( + self.config['sources'].as_str_seq(), SOURCES_KEYS) + self.backends = [] + for key in self.config['sources'].as_str_seq(): + self.backends.append(SOURCES_ALL[key]) def commands(self): cmd = ui.Subcommand('lyrics', help='fetch song lyrics') diff --git a/docs/changelog.rst b/docs/changelog.rst index c1557da92..5b6d640a9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,6 +10,9 @@ library by typing ``pip install requests`` or the equivalent for your OS. New: +* :doc:`/plugins/lyrics`: Add `musixmatch`_ source and introduce a new ``sources`` + config option that lets you choose exactly where to look for lyrics and in + which order. * :doc:`/plugins/lyrics`: Add brazilian and hispanic sources to Google custom search engine. * A new :doc:`/plugins/permissions` makes it easy to fix permissions on music @@ -48,7 +51,7 @@ Fixed: .. _API changes: http://developer.echonest.com/forums/thread/3650 .. _Plex: https://plex.tv/ - +.. 
_musixmatch: https://www.musixmatch.com/ 1.3.9 (November 17, 2014) ------------------------- diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index ea04d9870..f6ec736ba 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -46,8 +46,13 @@ configuration file. The available options are: backend). Default: None. - **google_engine_ID**: The custom search engine to use. - Default: The beets custom search engine, which gathers a list of sources + Default: The `beets custom search engine`_, which gathers a list of sources known to be scrapeable. +- **sources**: List of sources to search for lyrics. An asterisk `*` expands + to all available sources. + Default: ``google lyricwiki lyrics.com musixmatch``, i.e., all sources. + *google* source will be automatically deactivated if no `google_API_key` is + set up. Here's an example of ``config.yaml``:: @@ -56,6 +61,7 @@ Here's an example of ``config.yaml``:: google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab google_engine_ID: 009217259823014548361:lndtuqkycfu +.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu Fetching Lyrics Manually ------------------------ @@ -96,7 +102,7 @@ default, beets use a list of sources known to be scrapeable. .. _define a custom search engine: http://www.google.com/cse/all Note that the Google custom search API is limited to 100 queries per day. -After that, the lyrics plugin will fall back on its other data sources. +After that, the lyrics plugin will fall back on other declared data sources. .. _pip: http://www.pip-installer.org/ .. 
_BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ diff --git a/test/lyrics_download_samples.py b/test/lyrics_download_samples.py index 754d2a2b5..c4aab5bd1 100644 --- a/test/lyrics_download_samples.py +++ b/test/lyrics_download_samples.py @@ -47,7 +47,7 @@ def main(argv=None): url = s['url'] + s['path'] fn = test_lyrics.url_to_filename(url) if not os.path.isfile(fn): - html = requests.get(url).text + html = requests.get(url, verify=False).text with safe_open_w(fn) as f: f.write(html.encode('utf8')) diff --git a/test/test_fetchart.py b/test/test_fetchart.py index 8b77914d6..5e36f9145 100644 --- a/test/test_fetchart.py +++ b/test/test_fetchart.py @@ -12,8 +12,6 @@ # The above copyright notice and this permission notice shall be # included in all copies or substantial portions of the Software. -from beetsplug import fetchart - import os.path from _common import unittest from helper import TestHelper @@ -43,17 +41,6 @@ class FetchartCliTest(unittest.TestCase, TestHelper): with open(cover_path, 'r') as f: self.assertEqual(f.read(), 'IMAGE') - def test_sanitize_sources(self): - self.assertEqual(fetchart.sanitize_sources(['google', 'unknown']), - ['google']) - self.assertEqual(fetchart.sanitize_sources(['google', 'google']), - ['google']) - res = fetchart.sanitize_sources(['google', '*', 'amazon']) - # don't check strict egality on lengths as itunes source may be removed - # by plugin - self.assertTrue(len(res) >= len(fetchart.SOURCES_ALL) - 1 and - res[0] == 'google' and res[-1] == 'amazon') - def suite(): return unittest.TestLoader().loadTestsFromName(__name__) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index ea1d1943e..418b062ff 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -17,6 +17,7 @@ import os import _common import sys +import re from _common import unittest from beetsplug import lyrics from beets.library import Item @@ -163,7 +164,7 @@ class LyricsPluginTest(unittest.TestCase): def url_to_filename(url): - url = 
url.replace('http://', '').replace('www.', '') + url = re.sub(r'https?://|www.', '', url) fn = "".join(x for x in url if (x.isalnum() or x == '/')) fn = fn.split('/') fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt' @@ -206,7 +207,9 @@ DEFAULT_SOURCES = [ dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/', path=u'The_Beatles:Lady_Madonna'), dict(DEFAULT_SONG, url='http://www.lyrics.com/', - path=u'lady-madonna-lyrics-the-beatles.html') + path=u'lady-madonna-lyrics-the-beatles.html'), + dict(DEFAULT_SONG, url='https://www.musixmatch.com/', + path=u'lyrics/The-Beatles/Lady-Madonna'), ] # Every source entered in default beets google custom search engine @@ -307,8 +310,9 @@ class LyricsGooglePluginTest(unittest.TestCase): """Test default engines with the default query""" if not check_lyrics_fetched(): self.skipTest("Run lyrics_download_samples.py script first.") - for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom), - DEFAULT_SOURCES): + for (fun, s) in zip([lyrics.fetch_lyricswiki, + lyrics.fetch_lyricscom, + lyrics.fetch_musixmatch], DEFAULT_SOURCES): if os.path.isfile(url_to_filename( s['url'] + s['path'])): res = fun(s['artist'], s['title']) diff --git a/test/test_plugins.py b/test/test_plugins.py index 80285a688..eea162e90 100644 --- a/test/test_plugins.py +++ b/test/test_plugins.py @@ -151,6 +151,16 @@ class ItemTypeConflictTest(unittest.TestCase, TestHelper): self.assertNotEqual(None, plugins.types(Item)) +class HelpersTest(unittest.TestCase): + + def test_sanitize_choices(self): + self.assertEqual(plugins.sanitize_choices(['A', 'Z'], ('A', 'B')), + ['A']) + self.assertEqual(plugins.sanitize_choices(['A', 'A'], ('A')), ['A']) + self.assertEqual(plugins.sanitize_choices(['D', '*', 'A'], + ('A', 'B', 'C', 'D')), ['D', 'B', 'C', 'A']) + + def suite(): return unittest.TestLoader().loadTestsFromName(__name__)