# This file is part of beets.
# Copyright 2013, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Fetches, embeds, and displays lyrics.
"""
from __future__ import print_function
import re
import logging
import urllib
import json
import unicodedata
import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
# Global logger.
log = logging.getLogger('beets')
# Match opening/closing <div> tags; group 1 captures the slash of a
# closing tag.
DIV_RE = re.compile(r'<(/?)div>?')

# Match HTML comments (DOTALL so multi-line comments are removed whole).
# NOTE(review): the pattern literal was destroyed by markup stripping in
# this copy (it compiled the empty pattern); restored to match
# <!-- ... --> comments, per upstream beets.
COMMENT_RE = re.compile(r'<!--.*-->', re.S)

# Match any HTML tag.
TAG_RE = re.compile(r'<[^>]*>')

# Match <br> line-break tags, with or without the XHTML slash.
# NOTE(review): restored -- the original raw string was split by a
# rendered line break where the tag text used to be.
BREAK_RE = re.compile(r'<br\s*/?>')

# Typographic characters substituted with ASCII equivalents when
# building URL slugs for lyrics sites: curly quotes -> straight quotes,
# assorted dashes -> '-', ellipsis -> '...'.
URL_CHARACTERS = {
    u'\u2018': u"'",
    u'\u2019': u"'",
    u'\u201c': u'"',
    u'\u201d': u'"',
    u'\u2010': u'-',
    u'\u2011': u'-',
    u'\u2012': u'-',
    u'\u2013': u'-',
    u'\u2014': u'-',
    u'\u2015': u'-',
    u'\u2016': u'-',
    u'\u2026': u'...',
}
# Utilities.
def fetch_url(url):
    """Retrieve the content at a given URL, or return None if the source
    is unreachable.
    """
    try:
        # Any network failure (DNS, connect, read) surfaces as IOError
        # under urllib; treat it as "no content available".
        response = urllib.urlopen(url)
        return response.read()
    except IOError as exc:
        log.debug(u'failed to fetch: {0} ({1})'.format(url, unicode(exc)))
        return None
def unescape(text):
    """Resolves &#xx; HTML entities (and some others)."""
    # Work on unicode; byte strings are assumed to be UTF-8.
    if isinstance(text, str):
        text = text.decode('utf8', 'ignore')
    # NOTE(review): the entity literals below were destroyed by markup
    # stripping in this copy (the '&nbsp;' had been rendered into a
    # literal space, making the replace a no-op, and '&#' was missing
    # from the regex so it mangled any plain "123;" text). Restored per
    # upstream beets.
    out = text.replace(u'&nbsp;', u' ')

    def replchar(m):
        # Decode a numeric character reference to its character.
        num = m.group(1)
        return unichr(int(num))
    out = re.sub(u"&#(\d+);", replchar, out)
    return out
def extract_text(html, starttag):
    """Extract the text from a <div> section in ``html`` that begins
    with ``starttag``. Returns None if parsing fails.

    NOTE(review): everything after this docstring was lost when the file
    went through an HTML renderer; the implementation below is a
    reconstruction built around the visible DIV_RE helper -- confirm
    against upstream beets before trusting it.
    """
    # Strip off the leading text before the opening tag; if the tag is
    # absent there is nothing to extract.
    try:
        _, html = html.split(starttag, 1)
    except ValueError:
        return None

    # Walk the remaining <div>/</div> tags, tracking nesting depth, and
    # return everything up to the closer that matches ``starttag``.
    level = 0
    for match in DIV_RE.finditer(html):
        if match.group(1):  # Closing tag.
            if level == 0:
                return html[:match.start()]
            level -= 1
        else:  # Nested opening tag.
            level += 1
    return None  # No matching closing tag found.
def scrape_lyrics_from_url(url):
    """Scrape lyrics from a URL. If no lyrics can be found, return None
    instead.

    NOTE(review): the head of this function (everything before the
    re-soup step) was lost to markup stripping in this copy; that part
    is reconstructed from upstream beets -- verify before trusting it.
    """
    from bs4 import BeautifulSoup

    html = fetch_url(url)
    if not html:
        return None
    soup = BeautifulSoup(html)

    # Drop script content, which is never lyrics.
    for tag in soup.findAll('script'):
        tag.extract()

    # Turn every tag into a <p> so that differently-marked-up lyrics all
    # look alike to the paragraph scan below.
    try:
        for tag in soup.findAll(True):
            tag.name = 'p'
    except Exception as e:
        log.debug(u'error %s when replacing containing marker by p marker'
                  % e, exc_info=True)

    # Make better soup from current soup! The previous unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it's
    # more conservative concerning EOL.
    soup = BeautifulSoup(str(soup))

    # In case lyrics are nested in no markup but <body>,
    # insert the whole body in a <p>.
    bodyTag = soup.find('body')
    if bodyTag:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)

    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        # Extract all text of <p> section.
        tagTokens += soup2.findAll(text=True)

    if tagTokens:
        # Lyrics are expected to be the longest paragraph.
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        return unescape(tagTokens[0].strip("\n\r: "))


def fetch_google(artist, title):
    """Fetch lyrics from Google search results.

    Uses the Custom Search API key and engine ID from the plugin config,
    then scrapes each candidate result page for lyrics. Returns the
    sanitized lyrics string, or None.
    """
    query = u"%s %s" % (artist, title)
    api_key = config['lyrics']['google_API_key'].get(unicode)
    engine_id = config['lyrics']['google_engine_ID'].get(unicode)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
          (api_key, engine_id, urllib.quote(query.encode('utf8')))

    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: %s' % reason)
        return None

    if 'items' in data.keys():
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item['title']
            # Skip results whose URL/title don't look like this song.
            if not is_page_candidate(urlLink, urlTitle, title, artist):
                continue
            lyrics = scrape_lyrics_from_url(urlLink)
            if not lyrics:
                continue
            lyrics = sanitize_lyrics(lyrics)
            if is_lyrics(lyrics, artist):
                log.debug(u'got lyrics from %s' % item['displayLink'])
                return lyrics


# Plugin logic.
class LyricsPlugin(BeetsPlugin):
    """Beets plugin that fetches lyrics via the registered backends and
    stores them on imported or selected items.
    """
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': None,
        })

        self.backends = [fetch_lyricswiki, fetch_lyricscom]
        # The Google backend is only usable when an API key is set; give
        # it priority when present.
        if self.config['google_API_key'].get():
            self.backends.insert(0, fetch_google)

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
        cmd.parser.add_option('-p', '--print', dest='printlyr',
                              action='store_true', default=False,
                              help='print lyrics to console')
        cmd.parser.add_option('-f', '--force', dest='force_refetch',
                              action='store_true', default=False,
                              help='always re-download lyrics')

        def func(lib, opts, args):
            # The "write to files" option corresponds to the
            # import_write config value.
            write = config['import']['write'].get(bool)
            for item in lib.items(ui.decargs(args)):
                self.fetch_item_lyrics(lib, logging.INFO, item, write,
                                       opts.force_refetch)
                if opts.printlyr and item.lyrics:
                    ui.print_(item.lyrics)
        cmd.func = func
        return [cmd]

    # Auto-fetch lyrics on import.
    def imported(self, session, task):
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                       False, False)

    def fetch_item_lyrics(self, lib, loglevel, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then
        the lyrics will also be written to the file itself. The
        ``loglevel`` parameter controls the visibility of the function's
        status log messages.
        """
        fallback = self.config['fallback'].get()

        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            log.log(loglevel, u'lyrics already present: %s - %s' %
                              (item.artist, item.title))
            return

        # Fetch lyrics.
        lyrics = self.get_lyrics(item.artist, item.title)
        if not lyrics:
            log.log(loglevel, u'lyrics not found: %s - %s' %
                              (item.artist, item.title))
            if fallback:
                lyrics = fallback
            else:
                return
        else:
            log.log(loglevel, u'fetched lyrics: %s - %s' %
                              (item.artist, item.title))

        item.lyrics = lyrics
        if write:
            item.write()
        item.store()

    def get_lyrics(self, artist, title):
        """Fetch lyrics, trying each source in turn. Return a string or
        None if no lyrics were found.
        """
        # Remove featuring artists from search.
        pattern = u"(.*) feat(uring|\.)?\s\S+"
        match = re.search(pattern, artist, re.IGNORECASE)
        if match:
            # BUG FIX: group(0) is the whole match (including the
            # "feat..." part), which left ``artist`` unchanged; group(1)
            # is the artist name alone, which is what the search needs.
            artist = match.group(1)

        for backend in self.backends:
            lyrics = backend(artist, title)
            if lyrics:
                if isinstance(lyrics, str):
                    lyrics = lyrics.decode('utf8', 'ignore')
                log.debug(u'got lyrics from backend: {0}'.format(
                    backend.__name__
                ))
                return lyrics