Merge pull request #1148 from Kraymer/lyrics_musixmatch

lyrics: add 'musixmatch' source

Conflicts: beetsplug/lyrics.py
2026-01-09 17:33:51 +01:00 · 2014-12-18 10:36:16 +00:00 · 2014-12-18 10:36:16 +00:00 · ac3f0824b0
commit ac3f0824b0
parent 20db9bb1a6 ea0f7ced5d
11 changed files with 118 additions and 76 deletions
--- a/beets/plugins.py
+++ b/beets/plugins.py
@ -16,8 +16,10 @@

 import logging
 import traceback
-from collections import defaultdict
 import inspect
+import re
+from collections import defaultdict
+

 import beets
 from beets import mediafile
@ -402,3 +404,32 @@ def send(event, **arguments):
        argspec = inspect.getargspec(handler).args
        args = dict((k, v) for k, v in arguments.items() if k in argspec)
        handler(**args)
+
+
+def feat_tokens(for_artist=True):
+    """Return a regular expression that matches phrases like "featuring"
+    that separate a main artist or a song title from secondary artists.
+    The `for_artist` option determines whether the regex should be
+    suitable for matching artist fields (the default) or title fields.
+    """
+    feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.']
+    if for_artist:
+        feat_words += ['with', 'vs', 'and', 'con', '&']
+    return '(?<=\s)(?:{0})(?=\s)'.format(
+        '|'.join(re.escape(x) for x in feat_words)
+    )
+
+
+def sanitize_choices(choices, choices_all):
+    """Clean up a stringlist configuration attribute: keep only choices
+    elements present in choices_all, remove duplicate elements, expand '*'
+    wildcard while keeping original stringlist order.
+    """
+    seen = set()
+    others = [x for x in choices_all if x not in choices]
+    res = []
+    for s in choices:
+        if s in list(choices_all) + ['*']:
+            if not (s in seen or seen.add(s)):
+                res.extend(list(others) if s == '*' else [s])
+    return res
--- a/beets/util/init.py
+++ b/beets/util/init.py
@ -678,17 +678,3 @@ def max_filename_length(path, limit=MAX_FILENAME_LENGTH):
        return min(res[9], limit)
    else:
        return limit
-
-
-def feat_tokens(for_artist=True):
-    """Return a regular expression that matches phrases like "featuring"
-    that separate a main artist or a song title from secondary artists.
-    The `for_artist` option determines whether the regex should be
-    suitable for matching artist fields (the default) or title fields.
-    """
-    feat_words = ['ft', 'featuring', 'feat', 'feat.', 'ft.']
-    if for_artist:
-        feat_words += ['with', 'vs', 'and', 'con', '&']
-    return '(?<=\s)(?:{0})(?=\s)'.format(
-        '|'.join(re.escape(x) for x in feat_words)
-    )
--- a/beetsplug/fetchart.py
+++ b/beetsplug/fetchart.py
@ -22,12 +22,12 @@ from tempfile import NamedTemporaryFile

 import requests

-from beets.plugins import BeetsPlugin
-from beets.util.artresizer import ArtResizer
+from beets import plugins
 from beets import importer
 from beets import ui
 from beets import util
 from beets import config
+from beets.util.artresizer import ArtResizer

 try:
    import itunes
@ -319,23 +319,7 @@ def batch_fetch_art(lib, albums, force, maxwidth=None):
                                          message))


-def sanitize_sources(sources):
-    """Clean up the user's configured source list. Remove unknown or
-    duplicate sources while keeping original order.
-    """
-    seen = set()
-    others = set(SOURCES_ALL) - set(sources)
-    res = []
-    for s in sources:
-        if s in SOURCES_ALL + ['*']:
-            if not (s in seen or seen.add(s)):
-                res.extend(list(others) if s == '*' else [s])
-    if not HAVE_ITUNES and 'itunes' in res:
-        res.remove('itunes')
-    return res
-
-
-class FetchArtPlugin(BeetsPlugin):
+class FetchArtPlugin(plugins.BeetsPlugin):
    def __init__(self):
        super(FetchArtPlugin, self).__init__()

@ -359,8 +343,10 @@ class FetchArtPlugin(BeetsPlugin):
            self.import_stages = [self.fetch_art]
            self.register_listener('import_task_files', self.assign_art)

-        self.config['sources'] = sanitize_sources(
-            self.config['sources'].as_str_seq())
+        if not HAVE_ITUNES and u'itunes' in SOURCES_ALL:
+            SOURCES_ALL.remove(u'itunes')
+        self.config['sources'] = plugins.sanitize_choices(
+            self.config['sources'].as_str_seq(), SOURCES_ALL)

    # Asynchronous; after music is added to the library.
    def fetch_art(self, session, task):
--- a/beetsplug/ftintitle.py
+++ b/beetsplug/ftintitle.py
@ -14,9 +14,9 @@

 """Moves "featured" artists to the title from the artist field.
 """
-from beets.plugins import BeetsPlugin
+from beets import plugins
 from beets import ui
-from beets.util import displayable_path, feat_tokens
+from beets.util import displayable_path
 from beets import config
 import logging
 import re
@ -31,7 +31,7 @@ def split_on_feat(artist):
    may be a string or None if none is present.
    """
    # split on the first "feat".
-    regex = re.compile(feat_tokens(), re.IGNORECASE)
+    regex = re.compile(plugins.feat_tokens(), re.IGNORECASE)
    parts = [s.strip() for s in regex.split(artist, 1)]
    if len(parts) == 1:
        return parts[0], None
@ -42,7 +42,7 @@ def split_on_feat(artist):
 def contains_feat(title):
    """Determine whether the title contains a "featured" marker.
    """
-    return bool(re.search(feat_tokens(), title, flags=re.IGNORECASE))
+    return bool(re.search(plugins.feat_tokens(), title, flags=re.IGNORECASE))


 def update_metadata(item, feat_part, drop_feat):
@ -110,7 +110,7 @@ def ft_in_title(item, drop_feat):
        ui.print_()


-class FtInTitlePlugin(BeetsPlugin):
+class FtInTitlePlugin(plugins.BeetsPlugin):
    def __init__(self):
        super(FtInTitlePlugin, self).__init__()

--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -26,10 +26,8 @@ import difflib
 import itertools
 from HTMLParser import HTMLParseError

-from beets.plugins import BeetsPlugin
-from beets import ui
-from beets import config
-from beets.util import feat_tokens
+from beets import plugins
+from beets import config, ui


 # Global logger.
@ -86,10 +84,17 @@ def unescape(text):
    return out


-def extract_text(html, starttag):
+def extract_text_between(html, start_marker, end_marker):
+    _, html = html.split(start_marker, 1)
+    html, _ = html.split(end_marker, 1)
+    return _scrape_strip_cruft(html, True)
+
+
+def extract_text_in(html, starttag):
    """Extract the text from a <DIV> tag in the HTML starting with
    ``starttag``. Returns None if parsing fails.
    """
+
    # Strip off the leading text before opening tag.
    try:
        _, html = html.split(starttag, 1)
@ -138,7 +143,7 @@ def search_pairs(item):
    artists = [artist]

    # Remove any featuring artists from the artists name
-    pattern = r"(.*?) {0}".format(feat_tokens())
+    pattern = r"(.*?) {0}".format(plugins.feat_tokens())
    match = re.search(pattern, artist, re.IGNORECASE)
    if match:
        artists.append(match.group(1))
@ -151,7 +156,7 @@ def search_pairs(item):
        titles.append(match.group(1))

    # Remove any featuring artists from the title
-    pattern = r"(.*?) {0}".format(feat_tokens(for_artist=False))
+    pattern = r"(.*?) {0}".format(plugins.feat_tokens(for_artist=False))
    for title in titles[:]:
        match = re.search(pattern, title, re.IGNORECASE)
        if match:
@ -178,6 +183,19 @@ def _encode(s):
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)

+# Musixmatch
+
+MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
+
+
+def fetch_musixmatch(artist, title):
+    url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()),
+                                    _lw_encode(title.title()))
+    html = fetch_url(url)
+    if not html:
+        return
+    lyrics = extract_text_between(html, '"lyrics_body":', '"lyrics_language":')
+    return lyrics.strip(',"').replace('\\n', '\n')

 # LyricsWiki.

@ -201,7 +219,7 @@ def fetch_lyricswiki(artist, title):
    if not html:
        return

-    lyrics = extract_text(html, "<div class='lyricbox'>")
+    lyrics = extract_text_in(html, "<div class='lyricbox'>")
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics

@ -228,7 +246,7 @@ def fetch_lyricscom(artist, title):
    if not html:
        return

-    lyrics = extract_text(html, '<div id="lyric_space">')
+    lyrics = extract_text_in(html, '<div id="lyric_space">')
    if not lyrics:
        return
    for not_found_str in LYRICSCOM_NOT_FOUND:
@ -411,8 +429,14 @@ def fetch_google(artist, title):

 # Plugin logic.

+SOURCES_KEYS = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
+SOURCES_ALL = {'google': fetch_google,
+               'lyricwiki': fetch_lyricswiki,
+               'lyrics.com': fetch_lyricscom,
+               'musixmatch': fetch_musixmatch}

-class LyricsPlugin(BeetsPlugin):
+
+class LyricsPlugin(plugins.BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
@ -422,12 +446,17 @@ class LyricsPlugin(BeetsPlugin):
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': None,
            'force': False,
+            'sources': SOURCES_KEYS,
        })

-        self.backends = [fetch_lyricswiki, fetch_lyricscom]
-
-        if self.config['google_API_key'].get():
-            self.backends.insert(0, fetch_google)
+        if not self.config['google_API_key'].get() and \
+                'google' in SOURCES_KEYS:
+            SOURCES_KEYS.remove('google')
+        self.config['sources'] = plugins.sanitize_choices(
+            self.config['sources'].as_str_seq(), SOURCES_KEYS)
+        self.backends = []
+        for key in self.config['sources'].as_str_seq():
+            self.backends.append(SOURCES_ALL[key])

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -10,6 +10,9 @@ library by typing ``pip install requests`` or the equivalent for your OS.

 New:

+* :doc:`/plugins/lyrics`: Add `musixmatch`_ source and introduce a new ``sources``
+  config option that lets you choose exactly where to look for lyrics and in
+  which order.
 * :doc:`/plugins/lyrics`: Add brazilian and hispanic sources to Google custom
  search engine.
 * A new :doc:`/plugins/permissions` makes it easy to fix permissions on music
@ -48,7 +51,7 @@ Fixed:

 .. _API changes: http://developer.echonest.com/forums/thread/3650
 .. _Plex: https://plex.tv/
-
+.. _musixmatch: https://www.musixmatch.com/

 1.3.9 (November 17, 2014)
 -------------------------
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@ -46,8 +46,13 @@ configuration file. The available options are:
  backend).
  Default: None.
 - **google_engine_ID**: The custom search engine to use.
-  Default: The beets custom search engine, which gathers a list of sources
+  Default: The `beets custom search engine`_, which gathers a list of sources
  known to be scrapeable.
+- **sources**: List of sources to search for lyrics. An asterisk `*` expands
+  to all available sources.
+  Default: ``google lyricwiki lyrics.com musixmatch``, i.e., all sources.
+  The *google* source is automatically deactivated if no ``google_API_key`` is
+  set.

 Here's an example of ``config.yaml``::

@ -56,6 +61,7 @@ Here's an example of ``config.yaml``::
      google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
      google_engine_ID: 009217259823014548361:lndtuqkycfu

+.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu

 Fetching Lyrics Manually
 ------------------------
@ -96,7 +102,7 @@ default, beets use a list of sources known to be scrapeable.
 .. _define a custom search engine: http://www.google.com/cse/all

 Note that the Google custom search API is limited to 100 queries per day.
-After that, the lyrics plugin will fall back on its other data sources.
+After that, the lyrics plugin will fall back on the other configured ``sources``.

 .. _pip: http://www.pip-installer.org/
 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
--- a/test/lyrics_download_samples.py
+++ b/test/lyrics_download_samples.py
@ -47,7 +47,7 @@ def main(argv=None):
        url = s['url'] + s['path']
        fn = test_lyrics.url_to_filename(url)
        if not os.path.isfile(fn):
-            html = requests.get(url).text
+            html = requests.get(url, verify=False).text
            with safe_open_w(fn) as f:
                f.write(html.encode('utf8'))

--- a/test/test_fetchart.py
+++ b/test/test_fetchart.py
@ -12,8 +12,6 @@
 # The above copyright notice and this permission notice shall be
 # included in all copies or substantial portions of the Software.

-from beetsplug import fetchart
-
 import os.path
 from _common import unittest
 from helper import TestHelper
@ -43,17 +41,6 @@ class FetchartCliTest(unittest.TestCase, TestHelper):
        with open(cover_path, 'r') as f:
            self.assertEqual(f.read(), 'IMAGE')

-    def test_sanitize_sources(self):
-        self.assertEqual(fetchart.sanitize_sources(['google', 'unknown']),
-                         ['google'])
-        self.assertEqual(fetchart.sanitize_sources(['google', 'google']),
-                         ['google'])
-        res = fetchart.sanitize_sources(['google', '*', 'amazon'])
-        # don't check strict egality on lengths as itunes source may be removed
-        # by plugin
-        self.assertTrue(len(res) >= len(fetchart.SOURCES_ALL) - 1 and
-                        res[0] == 'google' and res[-1] == 'amazon')
-

 def suite():
    return unittest.TestLoader().loadTestsFromName(__name__)
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -17,6 +17,7 @@
 import os
 import _common
 import sys
+import re
 from _common import unittest
 from beetsplug import lyrics
 from beets.library import Item
@ -163,7 +164,7 @@ class LyricsPluginTest(unittest.TestCase):


 def url_to_filename(url):
-    url = url.replace('http://', '').replace('www.', '')
+    url = re.sub(r'https?://|www.', '', url)
    fn = "".join(x for x in url if (x.isalnum() or x == '/'))
    fn = fn.split('/')
    fn = os.path.join(LYRICS_ROOT_DIR, fn[0], fn[-1]) + '.txt'
@ -206,7 +207,9 @@ DEFAULT_SOURCES = [
    dict(DEFAULT_SONG, url=u'http://lyrics.wikia.com/',
         path=u'The_Beatles:Lady_Madonna'),
    dict(DEFAULT_SONG, url='http://www.lyrics.com/',
-         path=u'lady-madonna-lyrics-the-beatles.html')
+         path=u'lady-madonna-lyrics-the-beatles.html'),
+    dict(DEFAULT_SONG, url='https://www.musixmatch.com/',
+         path=u'lyrics/The-Beatles/Lady-Madonna'),
 ]

 # Every source entered in default beets google custom search engine
@ -307,8 +310,9 @@ class LyricsGooglePluginTest(unittest.TestCase):
        """Test default engines with the default query"""
        if not check_lyrics_fetched():
            self.skipTest("Run lyrics_download_samples.py script first.")
-        for (fun, s) in zip((lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom),
-                            DEFAULT_SOURCES):
+        for (fun, s) in zip([lyrics.fetch_lyricswiki,
+                             lyrics.fetch_lyricscom,
+                             lyrics.fetch_musixmatch], DEFAULT_SOURCES):
            if os.path.isfile(url_to_filename(
                              s['url'] + s['path'])):
                res = fun(s['artist'], s['title'])
--- a/test/test_plugins.py
+++ b/test/test_plugins.py
@ -151,6 +151,16 @@ class ItemTypeConflictTest(unittest.TestCase, TestHelper):
        self.assertNotEqual(None, plugins.types(Item))


+class HelpersTest(unittest.TestCase):
+
+    def test_sanitize_choices(self):
+        self.assertEqual(plugins.sanitize_choices(['A', 'Z'], ('A', 'B')),
+                         ['A'])
+        self.assertEqual(plugins.sanitize_choices(['A', 'A'], ('A')), ['A'])
+        self.assertEqual(plugins.sanitize_choices(['D', '*', 'A'],
+                         ('A', 'B', 'C', 'D')), ['D', 'B', 'C', 'A'])
+
+
 def suite():
    return unittest.TestLoader().loadTestsFromName(__name__)