# This file is part of beets.
# Copyright 2014, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
"""Fetches, embeds, and displays lyrics.
"""
from __future__ import print_function
import re
import logging
import urllib
import json
import unicodedata
import difflib
import itertools
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
# Global logger.
log = logging.getLogger('beets')
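# Regular expressions for parsing and stripping scraped HTML.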
DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
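# Characters substituted with plain ASCII equivalents when artist and title
# strings are turned into source URLs.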
URL_CHARACTERS = {
    u'\u2018': u"'",
    u'\u2019': u"'",
    u'\u201c': u'"',
    u'\u201d': u'"',
    u'\u2010': u'-',
    u'\u2011': u'-',
    u'\u2012': u'-',
    u'\u2013': u'-',
    u'\u2014': u'-',
    u'\u2015': u'-',
    u'\u2016': u'-',
    u'\u2026': u'...',
}
# Utilities.
def fetch_url(url):
"""Retrieve the content at a given URL, or return None if the source
is unreachable.
"""
try:
return urllib.urlopen(url).read()
except IOError as exc:
log.debug(u'failed to fetch: {0} ({1})'.format(url, unicode(exc)))
return None
def unescape(text):
"""Resolves xx; HTML entities (and some others)."""
if isinstance(text, str):
text = text.decode('utf8', 'ignore')
out = text.replace(u' ', u' ')
def replchar(m):
num = m.group(1)
return unichr(int(num))
out = re.sub(u"(\d+);", replchar, out)
return out
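
# For example, unescape(u'I&#8217;m&nbsp;here') yields u'I\u2019m here'.
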
def extract_text(html, starttag):
"""Extract the text from a


def _scrape_merge_paragraphs(html):
    return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)


def scrape_lyrics_from_html(html):
    """Scrape lyrics from a URL. If no lyrics can be found, return
    None instead.
    """
    # bs4 is imported here so it is only required when scraping is used.
    from bs4 import SoupStrainer, BeautifulSoup

    if not html:
        return None

    def is_text_notcode(string):
        # Treat a string as prose if it is reasonably long, contains
        # enough spaces, and does not look like code (no '=' or no ';').
        length = len(string)
        return (length > 20 and
                string.count(' ') > length / 25 and
                (string.find('=') == -1 or string.find(';') == -1))

    html = _scrape_strip_cruft(html)
    html = _scrape_merge_paragraphs(html)

    # Extract all long text blocks that are not code.
    soup = BeautifulSoup(html, "html.parser",
                         parse_only=SoupStrainer(text=is_text_notcode))
    try:
        soup = sorted(soup.stripped_strings, key=len)[-1]
    except IndexError:
        return None
    return soup


def fetch_google(artist, title):
    """Fetch lyrics from Google search results.
    """
    query = u"%s %s" % (artist, title)
    api_key = config['lyrics']['google_API_key'].get(unicode)
    engine_id = config['lyrics']['google_engine_ID'].get(unicode)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
        (api_key, engine_id, urllib.quote(query.encode('utf8')))

    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: {0}'.format(reason))
        return

    if 'items' in data.keys():
        # Try each search result until one yields plausible lyrics.
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item['title']
            if not is_page_candidate(urlLink, urlTitle, title, artist):
                continue
            html = fetch_url(urlLink)
            lyrics = scrape_lyrics_from_html(html)
            if not lyrics:
                continue

            if is_lyrics(lyrics, artist):
                log.debug(u'got lyrics from {0}'.format(item['displayLink']))
                return lyrics


# Plugin logic.

class LyricsPlugin(BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': None,
        })
        self.backends = [fetch_lyricswiki, fetch_lyricscom]
        if self.config['google_API_key'].get():
            self.backends.insert(0, fetch_google)

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
        cmd.parser.add_option('-p', '--print', dest='printlyr',
                              action='store_true', default=False,
                              help='print lyrics to console')
        cmd.parser.add_option('-f', '--force', dest='force_refetch',
                              action='store_true', default=False,
                              help='always re-download lyrics')

        def func(lib, opts, args):
            # The "write to files" option corresponds to the
            # import_write config value.
            write = config['import']['write'].get(bool)

            for item in lib.items(ui.decargs(args)):
                self.fetch_item_lyrics(lib, logging.INFO, item, write,
                                       opts.force_refetch)
                if opts.printlyr and item.lyrics:
                    ui.print_(item.lyrics)

        cmd.func = func
        return [cmd]

    def imported(self, session, task):
        """Import hook for fetching lyrics automatically.
        """
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                       False, False)

    def fetch_item_lyrics(self, lib, loglevel, item, write, force):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself. The ``loglevel``
        parameter controls the visibility of the function's status log
        messages.
        """
        # Skip if the item already has lyrics.
        if not force and item.lyrics:
            log.log(loglevel, u'lyrics already present: {0} - {1}'
                    .format(item.artist, item.title))
            return

        lyrics = None
        for artist, titles in search_pairs(item):
            lyrics = [self.get_lyrics(artist, title) for title in titles]
            if any(lyrics):
                break

        lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])

        if lyrics:
            log.log(loglevel, u'fetched lyrics: {0} - {1}'
                    .format(item.artist, item.title))
        else:
            log.log(loglevel, u'lyrics not found: {0} - {1}'
                    .format(item.artist, item.title))
            fallback = self.config['fallback'].get()
            if fallback:
                lyrics = fallback
            else:
                return

        item.lyrics = lyrics

        if write:
            item.try_write()
        item.store()

    def get_lyrics(self, artist, title):
        """Fetch lyrics, trying each source in turn. Return a string or
        None if no lyrics were found.
        """
        for backend in self.backends:
            lyrics = backend(artist, title)
            if lyrics:
                if isinstance(lyrics, str):
                    lyrics = lyrics.decode('utf8', 'ignore')
                log.debug(u'got lyrics from backend: {0}'
                          .format(backend.__name__))
                return lyrics.strip()
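

# The ``lyrics`` subcommand defined above is invoked from the beets CLI, for
# example ``beet lyrics -p artist:Beatles`` fetches and prints lyrics for the
# matching items; ``-f`` forces re-fetching lyrics that are already present.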