# This file is part of beets.
# Copyright 2014, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Fetches, embeds, and displays lyrics.
"""
from __future__ import print_function
import re
import logging
import requests
import json
import unicodedata
import urllib
import difflib
import itertools
from HTMLParser import HTMLParseError
from beets import plugins
from beets import config, ui
# Global logger.
log = logging.getLogger('beets')
DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
URL_CHARACTERS = {
u'\u2018': u"'",
u'\u2019': u"'",
u'\u201c': u'"',
u'\u201d': u'"',
u'\u2010': u'-',
u'\u2011': u'-',
u'\u2012': u'-',
u'\u2013': u'-',
u'\u2014': u'-',
u'\u2015': u'-',
u'\u2016': u'-',
u'\u2026': u'...',
}
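# URL_CHARACTERS is only a character map; the URL-building helpers elsewhere
# in this module are presumably the ones that apply it before percent-encoding.
# Illustrative sketch only (hypothetical `text` variable, not part of this
# file's API):
#
#     for old, new in URL_CHARACTERS.items():
#         text = text.replace(old, new)
#     url_fragment = urllib.quote(text.encode('utf8'))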
# Utilities.
def fetch_url(url):
    """Retrieve the content at a given URL, or return None if the source
    is unreachable.
    """
    try:
        r = requests.get(url, verify=False)
    except requests.RequestException as exc:
        log.debug(u'lyrics request failed: {0}'.format(exc))
        return
    if r.status_code == requests.codes.ok:
        return r.text
    else:
        log.debug(u'failed to fetch: {0} ({1})'.format(url, r.status_code))


def unescape(text):
    """Resolves &#xx; HTML entities (and some others)."""
    if isinstance(text, str):
        text = text.decode('utf8', 'ignore')
    out = text.replace(u'&nbsp;', u' ')

    def replchar(m):
        num = m.group(1)
        return unichr(int(num))
    out = re.sub(u"&#(\d+);", replchar, out)
    return out


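# Worked example for unescape() (illustrative, not a doctest run by the test
# suite): numeric entities and non-breaking spaces resolve to their Unicode
# characters:
#
#     >>> unescape(u'Hello&nbsp;&#8216;world&#8217;')
#     u'Hello \u2018world\u2019'

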
def extract_text_between(html, start_marker, end_marker):
    try:
        _, html = html.split(start_marker, 1)
        html, _ = html.split(end_marker, 1)
    except ValueError:
        return u''
    return _scrape_strip_cruft(html, True)


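# Example for extract_text_between() (illustrative marker strings): calling
# extract_text_between(html, u'<!-- start -->', u'<!-- end -->') keeps only
# the markup between the two markers and runs it through
# _scrape_strip_cruft() for plain-text output; if either marker is missing,
# an empty unicode string is returned instead.

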
def extract_text_in(html, starttag):
    """Extract the text from a <div> tag in the HTML starting with
    ``starttag``.
    """


def _scrape_merge_paragraphs(html):
    html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
    return re.sub(r'<div .*>\s*</div>', '\n', html)


def scrape_lyrics_from_html(html):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.
    """
    from bs4 import SoupStrainer, BeautifulSoup

    if not html:
        return None

    def is_text_notcode(text):
        length = len(text)
        return (length > 20 and
                text.count(' ') > length / 25 and
                (text.find('{') == -1 or text.find(';') == -1))

    html = _scrape_strip_cruft(html)
    html = _scrape_merge_paragraphs(html)

    # extract all long text blocks that are not code
    try:
        soup = BeautifulSoup(html, "html.parser",
                             parse_only=SoupStrainer(text=is_text_notcode))
    except HTMLParseError:
        return None
    soup = sorted(soup.stripped_strings, key=len)[-1]
    return soup


def fetch_google(artist, title):
    """Fetch lyrics from Google search results.
    """
    query = u"%s %s" % (artist, title)
    api_key = config['lyrics']['google_API_key'].get(unicode)
    engine_id = config['lyrics']['google_engine_ID'].get(unicode)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
          (api_key, engine_id, urllib.quote(query.encode('utf8')))

    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: {0}'.format(reason))
        return

    if 'items' in data.keys():
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item.get('title', u'')
            if not is_page_candidate(urlLink, urlTitle, title, artist):
                continue
            html = fetch_url(urlLink)
            lyrics = scrape_lyrics_from_html(html)
            if not lyrics:
                continue

            if is_lyrics(lyrics, artist):
                log.debug(u'got lyrics from {0}'.format(item['displayLink']))
                return lyrics


# Plugin logic.

SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
SOURCE_BACKENDS = {
    'google': fetch_google,
    'lyricwiki': fetch_lyricswiki,
    'lyrics.com': fetch_lyricscom,
    'musixmatch': fetch_musixmatch,
}


class LyricsPlugin(plugins.BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': None,
            'force': False,
            'sources': SOURCES,
        })

        available_sources = list(SOURCES)
        if not self.config['google_API_key'].get() and \
                'google' in SOURCES:
            available_sources.remove('google')
        self.config['sources'] = plugins.sanitize_choices(
            self.config['sources'].as_str_seq(), available_sources)

        self.backends = []
        for key in self.config['sources'].as_str_seq():
            self.backends.append(SOURCE_BACKENDS[key])

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
        cmd.parser.add_option('-p', '--print', dest='printlyr',
                              action='store_true', default=False,
                              help='print lyrics to console')
        cmd.parser.add_option('-f', '--force', dest='force_refetch',
                              action='store_true', default=False,
                              help='always re-download lyrics')

        def func(lib, opts, args):
            # The "write to files" option corresponds to the
            # import_write config value.
            write = config['import']['write'].get(bool)
            for item in lib.items(ui.decargs(args)):
                self.fetch_item_lyrics(
                    lib, logging.INFO, item, write,
                    opts.force_refetch or self.config['force'],
                )
                if opts.printlyr and item.lyrics:
                    ui.print_(item.lyrics)

        cmd.func = func
        return [cmd]

    def imported(self, session, task):
        """Import hook for fetching lyrics automatically.
        """
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                       False, self.config['force'])

    def fetch_item_lyrics(self, lib, loglevel, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself. The ``loglevel``
        parameter controls the visibility of the function's status log
        messages.
        """
        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            log.log(loglevel, u'lyrics already present: {0} - {1}'
                              .format(item.artist, item.title))
            return

        lyrics = None
        for artist, titles in search_pairs(item):
            lyrics = [self.get_lyrics(artist, title) for title in titles]
            if any(lyrics):
                break

        lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])

        if lyrics:
            log.log(loglevel, u'fetched lyrics: {0} - {1}'
                              .format(item.artist, item.title))
        else:
            log.log(loglevel, u'lyrics not found: {0} - {1}'
                              .format(item.artist, item.title))
            fallback = self.config['fallback'].get()
            if fallback:
                lyrics = fallback
            else:
                return

        item.lyrics = lyrics

        if write:
            item.try_write()
        item.store()

    def get_lyrics(self, artist, title):
        """Fetch lyrics, trying each source in turn. Return a string or
        None if no lyrics were found.
        """
        for backend in self.backends:
            lyrics = backend(artist, title)
            if lyrics:
                log.debug(u'got lyrics from backend: {0}'
                          .format(backend.__name__))
                return lyrics.strip()
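
# Illustrative configuration (assumed config.yaml layout; the keys mirror the
# defaults registered in LyricsPlugin.__init__, and the API key value is a
# placeholder):
#
#     lyrics:
#         auto: yes
#         force: no
#         sources: [google, lyricwiki, lyrics.com, musixmatch]
#         google_API_key: YOUR_GOOGLE_API_KEY
#
# Without google_API_key, the 'google' source is removed from the available
# backends (see __init__ above).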