diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 710cdd2e4..70a1133c2 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -25,15 +25,10 @@ import difflib
import itertools
from HTMLParser import HTMLParseError
-from beets import logging
from beets import plugins
from beets import config, ui
-# Global logger.
-
-log = logging.getLogger(__name__)
-
DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'', re.S)
TAG_RE = re.compile(r'<[^>]*>')
@@ -56,21 +51,6 @@ URL_CHARACTERS = {
# Utilities.
-def fetch_url(url):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- r = requests.get(url, verify=False)
- except requests.RequestException as exc:
- log.debug(u'lyrics request failed: {0}', exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
-
-
def unescape(text):
"""Resolves xx; HTML entities (and some others)."""
if isinstance(text, str):
@@ -174,131 +154,110 @@ def search_pairs(item):
return itertools.product(artists, multi_titles)
-def _encode(s):
- """Encode the string for inclusion in a URL (common to both
- LyricsWiki and Lyrics.com).
- """
- if isinstance(s, unicode):
- for char, repl in URL_CHARACTERS.items():
- s = s.replace(char, repl)
- s = s.encode('utf8', 'ignore')
- return urllib.quote(s)
+class Backend(object):
+ """Base class for a lyrics source.
+
+ Keeps the logger handed in at construction. Subclasses supply
+ URL_PATTERN (read by build_url) and override fetch().
+ """
+ def __init__(self, log):
+ # Logger used by fetch_url and by subclasses for debug output.
+ self._log = log
-# Musixmatch
+ @staticmethod
+ def _encode(s):
+ """Encode the string for inclusion in a URL.
+
+ Unicode input has every character listed in URL_CHARACTERS swapped
+ for its replacement and is encoded as UTF-8 (characters that cannot
+ be encoded are ignored); the value is then passed to urllib.quote.
+ """
+ if isinstance(s, unicode):
+ for char, repl in URL_CHARACTERS.items():
+ s = s.replace(char, repl)
+ s = s.encode('utf8', 'ignore')
+ return urllib.quote(s)
-MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
+ def build_url(self, artist, title):
+ """Return URL_PATTERN filled with the encoded artist, then title.
+
+ Both values are title-cased with .title() before they go through
+ this class's _encode, so the first placeholder of URL_PATTERN always
+ receives `artist` and the second `title`.
+ """
+ return self.URL_PATTERN % (self._encode(artist.title()),
+ self._encode(title.title()))
-
-def fetch_musixmatch(artist, title):
- url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()),
- _lw_encode(title.title()))
- html = fetch_url(url)
- if not html:
- return
- lyrics = extract_text_between(html, '"lyrics_body":', '"lyrics_language":')
- return lyrics.strip(',"').replace('\\n', '\n')
-
-# LyricsWiki.
-
-LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
-
-
-def _lw_encode(s):
- s = re.sub(r'\s+', '_', s)
- s = s.replace("<", "Less_Than")
- s = s.replace(">", "Greater_Than")
- s = s.replace("#", "Number_")
- s = re.sub(r'[\[\{]', '(', s)
- s = re.sub(r'[\]\}]', ')', s)
- return _encode(s)
-
-
-def fetch_lyricswiki(artist, title):
- """Fetch lyrics from LyricsWiki."""
- url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
- html = fetch_url(url)
- if not html:
- return
-
- lyrics = extract_text_in(html, u"
")
- if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
- return lyrics
-
-
-# Lyrics.com.
-
-LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
-LYRICSCOM_NOT_FOUND = (
- 'Sorry, we do not have the lyric',
- 'Submit Lyrics',
-)
-
-
-def _lc_encode(s):
- s = re.sub(r'[^\w\s-]', '', s)
- s = re.sub(r'\s+', '-', s)
- return _encode(s).lower()
-
-
-def fetch_lyricscom(artist, title):
- """Fetch lyrics from Lyrics.com."""
- url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
- html = fetch_url(url)
- if not html:
- return
- lyrics = extract_text_between(html, '
', '
')
- if not lyrics:
- return
- for not_found_str in LYRICSCOM_NOT_FOUND:
- if not_found_str in lyrics:
+ def fetch_url(self, url):
+ """Retrieve the content at a given URL, or return None if the source
+ is unreachable.
+
+ A response whose status is not requests.codes.ok is logged at debug
+ level and also results in None.
+ """
+ try:
+ # NOTE(review): verify=False switches off TLS certificate checking in
+ # requests -- confirm this is still intended.
+ r = requests.get(url, verify=False)
+ except requests.RequestException as exc:
+ self._log.debug(u'lyrics request failed: {0}', exc)
return
+ if r.status_code == requests.codes.ok:
+ return r.text
+ else:
+ self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
- parts = lyrics.split('\n---\nLyrics powered by', 1)
- if parts:
- return parts[0]
+ def fetch(self, artist, title):
+ """Return lyrics for `artist` and `title`; subclasses override this.
+
+ The base implementation only raises NotImplementedError.
+ """
+ raise NotImplementedError()
-# Optional Google custom search API backend.
-
-def slugify(text):
- """Normalize a string and remove non-alphanumeric characters.
- """
- text = re.sub(r"[-'_\s]", '_', text)
- text = re.sub(r"_+", '_', text).strip('_')
- pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, '\g<1>', text).strip()
- try:
- text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
- text = unicode(re.sub('[-\s]+', ' ', text))
- except UnicodeDecodeError:
- log.exception(u"Failing to normalize '{0}'", text)
- return text
+class SymbolsReplaced(Backend):
+ """Backend whose URL encoding first rewrites whitespace and symbols."""
+ @classmethod
+ def _encode(cls, s):
+ """Rewrite characters in `s`, then apply the parent class's _encode.
+
+ Whitespace runs become a single underscore, the less-than,
+ greater-than and number signs are spelled out as words, and square
+ or curly brackets become parentheses.
+ """
+ s = re.sub(r'\s+', '_', s)
+ s = s.replace("<", "Less_Than")
+ s = s.replace(">", "Greater_Than")
+ s = s.replace("#", "Number_")
+ s = re.sub(r'[\[\{]', '(', s)
+ s = re.sub(r'[\]\}]', ')', s)
+ return super(SymbolsReplaced, cls)._encode(s)
-BY_TRANS = ['by', 'par', 'de', 'von']
-LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
+class MusiXmatch(SymbolsReplaced):
+ """Fetch lyrics from musixmatch.com."""
+ URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
+
+ def fetch(self, artist, title):
+ """Return the lyrics text taken from the song page, or None when
+ the page could not be downloaded.
+ """
+ url = self.build_url(artist, title)
+ html = self.fetch_url(url)
+ if not html:
+ return
+ # extract_text_between is defined elsewhere in this module; presumably
+ # it returns what lies between the two quoted keys -- confirm there.
+ lyrics = extract_text_between(html,
+ '"lyrics_body":', '"lyrics_language":')
+ # Trim surrounding commas and double quotes, and turn literal
+ # backslash-n sequences into real line breaks.
+ return lyrics.strip(',"').replace('\\n', '\n')
-def is_page_candidate(urlLink, urlTitle, title, artist):
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
- """
- title = slugify(title.lower())
- artist = slugify(artist.lower())
- sitename = re.search(u"//([^/]+)/.*", slugify(urlLink.lower())).group(1)
- urlTitle = slugify(urlTitle.lower())
- # Check if URL title contains song title (exact match)
- if urlTitle.find(title) != -1:
- return True
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = [by + '_' + artist for by in BY_TRANS] + \
- [artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
- songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
- songTitle = songTitle.strip('_|')
- typoRatio = .9
- return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
+class LyricsWiki(SymbolsReplaced):
+ """Fetch lyrics from LyricsWiki."""
+ URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
+
+ def fetch(self, artist, title):
+ """Return the lyrics found on the song's wiki page, or None.
+
+ None is returned when the page cannot be downloaded, when nothing is
+ extracted, or when the extracted text contains the "not licensed"
+ notice.
+ """
+ url = self.build_url(artist, title)
+ html = self.fetch_url(url)
+ if not html:
+ return
+ # NOTE(review): the marker literal passed below holds only a line
+ # break in this text -- presumably an HTML start tag was lost; confirm
+ # against the upstream file.
+ lyrics = extract_text_in(html, u"
")
+ if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
+ return lyrics
+
+
+class LyricsCom(Backend):
+ """Fetch lyrics from Lyrics.com."""
+ URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
+ # Phrases whose presence in the extracted text means no lyrics were found.
+ NOT_FOUND = (
+ 'Sorry, we do not have the lyric',
+ 'Submit Lyrics',
+ )
+
+ @classmethod
+ def _encode(cls, s):
+ """Drop everything except word characters, whitespace and hyphens,
+ join words with hyphens, apply the parent _encode and lowercase.
+ """
+ s = re.sub(r'[^\w\s-]', '', s)
+ s = re.sub(r'\s+', '-', s)
+ return super(LyricsCom, cls)._encode(s).lower()
+
+ def fetch(self, artist, title):
+ """Return the lyrics from the song's Lyrics.com page, or None.
+
+ None is returned when the page cannot be downloaded, when nothing is
+ extracted, or when the text contains one of the NOT_FOUND phrases.
+ """
+ # URL_PATTERN is title-lyrics-artist: the module-level fetch_lyricscom
+ # that this class replaces filled it with (title, artist). build_url
+ # fills the placeholders in argument order, so the title goes first.
+ url = self.build_url(title, artist)
+ html = self.fetch_url(url)
+ if not html:
+ return
+ # NOTE(review): both marker literals below hold only a line break in
+ # this text -- presumably the HTML markers were lost; confirm against
+ # the upstream file.
+ lyrics = extract_text_between(html, '
', '
')
+ if not lyrics:
+ return
+ for not_found_str in self.NOT_FOUND:
+ if not_found_str in lyrics:
+ return
+
+ # Keep only the text that precedes the site's credit footer.
+ parts = lyrics.split('\n---\nLyrics powered by', 1)
+ if parts:
+ return parts[0]
def remove_credits(text):
@@ -315,36 +274,6 @@ def remove_credits(text):
return text
-def is_lyrics(text, artist=None):
- """Determine whether the text seems to be valid lyrics.
- """
- if not text:
- return False
- badTriggersOcc = []
- nbLines = text.count('\n')
- if nbLines <= 1:
- log.debug(u"Ignoring too short lyrics '{0}'", text)
- return False
- elif nbLines < 5:
- badTriggersOcc.append('too_short')
- else:
- # Lyrics look legit, remove credits to avoid being penalized further
- # down
- text = remove_credits(text)
-
- badTriggers = ['lyrics', 'copyright', 'property', 'links']
- if artist:
- badTriggersOcc += [artist]
-
- for item in badTriggers:
- badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
- text, re.I))
-
- if badTriggersOcc:
- log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
- return len(badTriggersOcc) < 2
-
-
def _scrape_strip_cruft(html, plain_text_out=False):
"""Clean up HTML
"""
@@ -396,50 +325,119 @@ def scrape_lyrics_from_html(html):
return soup
-def fetch_google(artist, title):
- """Fetch lyrics from Google search results.
- """
- query = u"%s %s" % (artist, title)
- api_key = config['lyrics']['google_API_key'].get(unicode)
- engine_id = config['lyrics']['google_engine_ID'].get(unicode)
- url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
- (api_key, engine_id, urllib.quote(query.encode('utf8')))
+class Google(Backend):
+ """Fetch lyrics from Google search results."""
+ def is_lyrics(self, text, artist=None):
+ """Determine whether the text seems to be valid lyrics.
+
+ Empty text, or text with at most one line break, is rejected
+ outright. Otherwise "bad trigger" occurrences are collected and the
+ text is accepted only when fewer than two were found.
+ """
+ if not text:
+ return False
+ badTriggersOcc = []
+ nbLines = text.count('\n')
+ if nbLines <= 1:
+ self._log.debug(u"Ignoring too short lyrics '{0}'", text)
+ return False
+ elif nbLines < 5:
+ badTriggersOcc.append('too_short')
+ else:
+ # Lyrics look legit, remove credits to avoid being penalized
+ # further down
+ text = remove_credits(text)
- data = urllib.urlopen(url)
- data = json.load(data)
- if 'error' in data:
- reason = data['error']['errors'][0]['reason']
- log.debug(u'google lyrics backend error: {0}', reason)
- return
+ # Words counted against the text each time they occur in it.
+ badTriggers = ['lyrics', 'copyright', 'property', 'links']
+ # NOTE(review): the artist is appended to the occurrence list itself,
+ # not to badTriggers, so passing an artist always counts as one
+ # occurrence -- confirm this is intended.
+ if artist:
+ badTriggersOcc += [artist]
- if 'items' in data.keys():
- for item in data['items']:
- urlLink = item['link']
- urlTitle = item.get('title', u'')
- if not is_page_candidate(urlLink, urlTitle, title, artist):
- continue
- html = fetch_url(urlLink)
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ # One entry per case-insensitive match of the word with a non-word
+ # character on each side.
+ for item in badTriggers:
+ badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
+ text, re.I))
- if is_lyrics(lyrics, artist):
- log.debug(u'got lyrics from {0}', item['displayLink'])
- return lyrics
+ if badTriggersOcc:
+ self._log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
+ return len(badTriggersOcc) < 2
+ def slugify(self, text):
+ """Normalize a string and remove non-alphanumeric characters.
+ """
+ # Hyphens, apostrophes, underscores and whitespace all become
+ # underscores; runs of underscores are then collapsed and trimmed.
+ text = re.sub(r"[-'_\s]", '_', text)
+ text = re.sub(r"_+", '_', text).strip('_')
+ pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses
+ text = re.sub(pat, '\g<1>', text).strip()
+ try:
+ # NFKD-normalize, keep only what encodes as ASCII, then replace runs
+ # of hyphens or whitespace with a single space.
+ text = unicodedata.normalize('NFKD', text).encode('ascii',
+ 'ignore')
+ text = unicode(re.sub('[-\s]+', ' ', text))
+ except UnicodeDecodeError:
+ # The failure is only logged; the function still returns `text`.
+ self._log.exception(u"Failing to normalize '{0}'", text)
+ return text
-# Plugin logic.
+ # Words for "by" and "lyrics" in several languages. is_page_candidate
+ # strips them, along with the artist and site name, from the URL title.
+ BY_TRANS = ['by', 'par', 'de', 'von']
+ LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
-SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
-SOURCE_BACKENDS = {
- 'google': fetch_google,
- 'lyricwiki': fetch_lyricswiki,
- 'lyrics.com': fetch_lyricscom,
- 'musixmatch': fetch_musixmatch,
-}
+ def is_page_candidate(self, urlLink, urlTitle, title, artist):
+ """Return True if the URL title makes it a good candidate to be a
+ page that contains lyrics of title by artist.
+
+ All four inputs are lowercased and passed through slugify before
+ they are compared.
+ """
+ title = self.slugify(title.lower())
+ artist = self.slugify(artist.lower())
+ # Site name: the text between '//' and the next '/' of the slugified
+ # link. re.search returns None when the link has no such part, in
+ # which case .group(1) raises AttributeError.
+ sitename = re.search(u"//([^/]+)/.*",
+ self.slugify(urlLink.lower())).group(1)
+ urlTitle = self.slugify(urlTitle.lower())
+ # Check if URL title contains song title (exact match)
+ if urlTitle.find(title) != -1:
+ return True
+ # or try extracting song title from URL title and check if
+ # they are close enough
+ tokens = [by + '_' + artist for by in self.BY_TRANS] + \
+ [artist, sitename, sitename.replace('www.', '')] + \
+ self.LYRICS_TRANS
+ # The tokens are joined into one regex alternation without escaping
+ # and every match is deleted from the URL title.
+ songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
+ songTitle = songTitle.strip('_|')
+ # Accept when what is left is at least 90% similar to the title.
+ typoRatio = .9
+ ratio = difflib.SequenceMatcher(None, songTitle, title).ratio()
+ return ratio >= typoRatio
+
+ def fetch(self, artist, title):
+ """Search the Google custom search API for "artist title" and return
+ the first scraped result that is_lyrics accepts.
+
+ Returns None when the API reports an error or no result qualifies.
+ The API key and engine ID are read from the global `lyrics` config.
+ """
+ query = u"%s %s" % (artist, title)
+ api_key = config['lyrics']['google_API_key'].get(unicode)
+ engine_id = config['lyrics']['google_engine_ID'].get(unicode)
+ url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
+ (api_key, engine_id, urllib.quote(query.encode('utf8')))
+
+ # NOTE(review): the object returned by urlopen is rebound to the parsed
+ # JSON and never closed explicitly.
+ data = urllib.urlopen(url)
+ data = json.load(data)
+ if 'error' in data:
+ reason = data['error']['errors'][0]['reason']
+ self._log.debug(u'google lyrics backend error: {0}', reason)
+ return
+
+ if 'items' in data.keys():
+ for item in data['items']:
+ urlLink = item['link']
+ urlTitle = item.get('title', u'')
+ # Skip results whose title does not look like this song's page.
+ if not self.is_page_candidate(urlLink, urlTitle,
+ title, artist):
+ continue
+ # NOTE(review): fetch_url may return None; that value is passed
+ # on as-is -- confirm scrape_lyrics_from_html accepts it.
+ html = self.fetch_url(urlLink)
+ lyrics = scrape_lyrics_from_html(html)
+ if not lyrics:
+ continue
+
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug(u'got lyrics from {0}',
+ item['displayLink'])
+ return lyrics
class LyricsPlugin(plugins.BeetsPlugin):
+ SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
+ SOURCE_BACKENDS = {
+ 'google': Google,
+ 'lyricwiki': LyricsWiki,
+ 'lyrics.com': LyricsCom,
+ 'musixmatch': MusiXmatch,
+ }
+
def __init__(self):
super(LyricsPlugin, self).__init__()
self._import_stages = [self.imported]
@@ -449,18 +447,18 @@ class LyricsPlugin(plugins.BeetsPlugin):
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
'fallback': None,
'force': False,
- 'sources': SOURCES,
+ 'sources': self.SOURCES,
})
- available_sources = list(SOURCES)
+ available_sources = list(self.SOURCES)
if not self.config['google_API_key'].get() and \
- 'google' in SOURCES:
+ 'google' in self.SOURCES:
available_sources.remove('google')
self.config['sources'] = plugins.sanitize_choices(
self.config['sources'].as_str_seq(), available_sources)
self.backends = []
for key in self.config['sources'].as_str_seq():
- self.backends.append(SOURCE_BACKENDS[key])
+ self.backends.append(self.SOURCE_BACKENDS[key](self._log))
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@@ -499,7 +497,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
lyrics will also be written to the file itself."""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- log.info(u'lyrics already present: {0.artist} - {0.title}', item)
+ self._log.info(u'lyrics already present: {0.artist} - {0.title}',
+ item)
return
lyrics = None
@@ -511,9 +510,9 @@ class LyricsPlugin(plugins.BeetsPlugin):
lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])
if lyrics:
- log.info(u'fetched lyrics: {0} - {1}', item.artist, item.title)
+ self._log.info(u'fetched lyrics: {0.artist} - {0.title}', item)
else:
- log.info(u'lyrics not found: {0} - {1}', item.artist, item.title)
+ self._log.info(u'lyrics not found: {0.artist} - {0.title}', item)
fallback = self.config['fallback'].get()
if fallback:
lyrics = fallback
@@ -531,7 +530,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
None if no lyrics were found.
"""
for backend in self.backends:
- lyrics = backend(artist, title)
+ lyrics = backend.fetch(artist, title)
if lyrics:
- log.debug(u'got lyrics from backend: {0}', backend.__name__)
+ self._log.debug(u'got lyrics from backend: {0}',
+ backend.__class__.__name__)
return _scrape_strip_cruft(lyrics, True)