Add a lyrics backend that scrapes results from the Google Custom Search API.

Add a 'fallback' option to help work around Google's 100 queries/day limit by
marking files as 'visited' so they are not considered for lyrics search on the
next beet run.
I've put my own google_engine_ID as the default value in the code; this could
be reconsidered, though this engine does index sources known to be scrapable
by the plugin's algorithm.
Fabrice Laporte 2013-04-06 14:58:50 +02:00
parent 70b528ed81
commit cfb6735e43
2 changed files with 342 additions and 52 deletions


@@ -1,5 +1,5 @@
# This file is part of beets.
# Copyright 2013, Adrian Sampson.
# Copyright 2012, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@@ -16,24 +16,24 @@
"""
from __future__ import print_function
import urllib
import re
import logging
import urllib
import json
import unicodedata
import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
from beets.ui import commands
# Global logger.
log = logging.getLogger('beets')
# Lyrics scrapers.
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
DIV_RE = re.compile(r'<(/?)div>?')
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'<br\s*/?>')
@@ -91,27 +91,26 @@ def extract_text(html, starttag):
return
lyrics = ''.join(parts)
lyrics = strip_cruft(lyrics)
def strip_cruft(lyrics, wscollapse=True):
"""Clean up lyrics"""
# Strip cruft.
lyrics = COMMENT_RE.sub('', lyrics)
lyrics = unescape(lyrics)
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
if wscollapse:
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
lyrics = re.sub(r'\n +', '\n', lyrics)
lyrics = re.sub(r' +\n', '\n', lyrics)
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
lyrics = lyrics.replace('\r','\n')
lyrics = lyrics.strip()
return lyrics
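# A quick illustration of what strip_cruft produces; the sample input is
# made up, not taken from the plugin's test suite:
#
#   >>> strip_cruft('<!-- ad --> Hello <br/> world ')
#   'Hello\nworld'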
def _encode(s):
"""Encode the string for inclusion in a URL (common to both
LyricsWiki and Lyrics.com).
"""
if isinstance(s, unicode):
# Replace "fancy" apostrophes with straight ones.
s = s.replace(u'\u2019', u"'")
s = s.encode('utf8', 'ignore')
return urllib.quote(s)
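# For instance (illustrative): _encode(u'Don\u2019t Stop') gives
# 'Don%27t%20Stop' once the fancy apostrophe is straightened and quoted.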
#
# Wikia db
#
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
@@ -121,7 +120,10 @@ def _lw_encode(s):
s = s.replace("#", "Number_")
s = re.sub(r'[\[\{]', '(', s)
s = re.sub(r'[\]\}]', ')', s)
return _encode(s)
if isinstance(s, unicode):
s = s.encode('utf8', 'ignore')
return urllib.quote(s)
def fetch_lyricswiki(artist, title):
"""Fetch lyrics from LyricsWiki."""
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@@ -133,6 +135,10 @@ def fetch_lyricswiki(artist, title):
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
#
# Lyrics.com db
#
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
'Sorry, we do not have the lyric',
@@ -140,7 +146,9 @@ LYRICSCOM_NOT_FOUND = (
)
def _lc_encode(s):
s = re.sub(r'\s+', '-', s)
return _encode(s)
if isinstance(s, unicode):
s = s.encode('utf8', 'ignore')
return urllib.quote(s)
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@@ -159,9 +167,236 @@ def fetch_lyricscom(artist, title):
if parts:
return parts[0]
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
#
# Google engine
#
def slugify(text, jokerChar=False, spaceChar=' '):
"""
Normalize the string and remove non-alpha characters.
Adapted from
http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename\
-in-python
"""
try:
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
text = unicode(re.sub('[-\s]+', spaceChar, text))
if jokerChar is not False:
text = unicode(re.sub('[^\w\s]', jokerChar, text))
except UnicodeDecodeError:
log.exception("Failed to normalize '%s'" % (text))
return urllib.quote(text)
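# A minimal sketch of slugify's behavior, assuming Python 2 semantics
# (unicode in, urllib.quote'd str out); the sample string is illustrative:
#
#   >>> slugify(u'M\xf6tley Cr\xfce')
#   'Motley%20Crue'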
def isPageCandidate(urlLink, urlTitle, title, artist):
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist."""
title = slugify(title.lower())
artist = slugify(artist.lower())
urlLink = slugify(urlLink.lower())
urlTitle = slugify(urlTitle.lower())
# Check if url title contains song title (exact match)
if urlTitle.find(title) != -1:
return True
# or try extracting song title from url title and check if
# they are close enough
songTitle = urlTitle.replace('lyrics', '')\
.replace(artist, '').strip('%20')
if len(songTitle):
log.debug("Match ratio of '%s' with title: %s" %
(songTitle, difflib.SequenceMatcher
(None, songTitle, title).ratio()))
typoRatio = .8
return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
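# A hedged example of the candidate check; the URL and titles are made up.
# The exact-match branch fires because the slugified URL title contains
# the slugified song title:
#
#   >>> isPageCandidate(u'http://site.example/eye-of-the-tiger-lyrics',
#   ...                 u'Survivor - Eye Of The Tiger Lyrics',
#   ...                 u'Eye of the Tiger', u'Survivor')
#   True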
def insertLineFeeds(text):
"""Insert \n before upcased characters"""
tokensStr = re.split("([a-z][A-Z])", text)
for idx in range(1, len(tokensStr), 2):
ltoken = list(tokensStr[idx])
tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
return ''.join(tokensStr)
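# For instance (illustrative input), two lines glued together during
# scraping get split back apart:
#
#   >>> insertLineFeeds('end of lineStart of next')
#   'end of line\nStart of next'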
def decimateLineFeeds(text):
"""Decimate \n characters. By default une only one \n as eol marker. Keep
at most two \n in a row (eg. to separate verses)."""
# Remove the first occurrence of \n in each sequence of \n
text = re.sub(r'\n(\n+)', '\g<1>', text)
# Keep at most two \n in a row
text = re.sub(r'\n\n+', '\n\n', text)
return text.strip('\n')
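# A worked example (illustrative input): each run of newlines loses one,
# then the remaining runs are capped at two:
#
#   >>> decimateLineFeeds('one\n\ntwo\n\n\n\nthree')
#   'one\ntwo\n\nthree'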
def lyricsSanitizer(text):
"""Clean text, returning raw lyrics as output, or None if the input text
turns out not to be lyrics content. Cleans (x)html tags in the text and
corrects layout and syntax."""
text = strip_cruft(text, False)
# Restore \n in input text
if text.find('\n') == -1:
text = insertLineFeeds(text)
# Suppress advertisement lines
textLines = text.splitlines(True)
# Match lines with an opening bracket but no closing one, i.e. lines that
# contained an HTML link that was wiped out during scraping.
reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
# Match lines containing a URL between brackets
reAdTxt = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
textLines = [line for line in textLines
if not reAdHtml.match(line) and not reAdTxt.match(line)]
# \n might have been duplicated during the scraping.
# Decimate \n while the number of \n represents more than half the
# number of lines
while len([x for x in textLines if x == '\n']) >= (len(textLines)/2 - 1):
if len(textLines) <= 3:
break
text = ''.join(textLines)
text = decimateLineFeeds(text)
textLines = [line.strip(' ') for line in text.splitlines(True)]
return ''.join(textLines)
def isLyricsAccepted(text, artist):
"""Return True if text is considered valid lyrics."""
badTriggers = []
nbLines = text.count('\n')
if nbLines <= 1:
log.debug("Ignoring too short lyrics '%s'" % text)
return False
elif nbLines < 5:
badTriggers.append('too_short')
for item in [artist, 'lyrics', 'copyright', 'property']:
badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))
if badTriggers:
log.debug('Bad triggers detected: %s' % badTriggers)
return len(badTriggers) < 2
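# A hedged illustration of the trigger heuristic: a short text peppered
# with words like 'lyrics' and 'copyright' accumulates several bad
# triggers and is rejected (the sample strings are made up):
#
#   >>> isLyricsAccepted('Verse one\nVerse two\nVerse three\n'
#   ...                  'lyrics (c) copyright Big Corp\n', 'Big Corp')
#   False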
def scrapLyricsFromUrl(url):
"""Scrape lyrics from url."""
from bs4 import BeautifulSoup
log.debug(u'Scraping lyrics from %s' % url)
html = fetch_url(url)
soup = BeautifulSoup(html)
# Simplify the markup by replacing some container tags with <p> tags
try:
for tag in soup.findAll(['center', 'blockquote']):
pTag = soup.new_tag("p")
pTag.contents = tag.contents
tag.replaceWith(pTag)
for tag in soup.findAll(['script', 'a', 'font']):
tag.replaceWith('<p>')
except Exception, e:
log.debug('Error %s when replacing container tags with p tags' % e, \
exc_info=True)
for tag in soup.findAll('br'):
tag.replaceWith('\n')
# Keep only tags that can possibly be parent tags and eol
for tag in soup.findAll(True):
containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
if tag.name not in containers:
tag.extract()
# Make better soup from current soup! The previous unclosed <p> sections are
# now closed. Use str() rather than prettify() as it's more conservative
# concerning EOL
soup = BeautifulSoup(str(soup))
# In case lyrics are nested in no markup but <body>
# Insert the whole body in a <p>
bodyTag = soup.find('body')
if bodyTag is not None:
pTag = soup.new_tag("p")
bodyTag.parent.insert(0, pTag)
pTag.insert(0, bodyTag)
tagTokens = []
for tag in soup.findAll('p'):
soup2 = BeautifulSoup(str(tag))
tagTokens += soup2.findAll(text=True) # Extract all text of <p> section
if tagTokens:
# Lyrics are expected to be the longest paragraph
tagTokens = sorted(tagTokens, key=len, reverse=True)
soup = BeautifulSoup(tagTokens[0])
if soup.findAll(['div', 'a']) != []:
return None
return unescape(tagTokens[0].strip("\n\r: "))
return None
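# A hedged sketch of how the scraping helpers chain together for a single
# search hit (the URL is a placeholder, not a known lyrics page):
#
#   text = scrapLyricsFromUrl('http://lyrics.example/some-song-lyrics')
#   if text:
#       lyrics = lyricsSanitizer(text)
#       if isLyricsAccepted(lyrics, 'Some Artist'):
#           print(lyrics)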
def fetch_google(artist, title):
"""Fetch lyrics from google results"""
QUERY = u"%s %s" % (artist, title)
url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
(options['google_API_key'], options['google_engine_ID'], \
urllib.quote(QUERY.encode('utf8')))
data = urllib.urlopen(url)
data = json.load(data)
if 'error' in data:
reason = data['error']['errors'][0]['reason']
log.debug(u'google lyrics backend error: %s' % reason)
return None
if 'items' in data:
for item in data['items']:
urlLink = item['link']
urlTitle = item['title']
if not isPageCandidate(urlLink, urlTitle, title, artist):
continue
lyrics = scrapLyricsFromUrl(urlLink)
if not lyrics:
continue
lyrics = lyricsSanitizer(lyrics)
if isLyricsAccepted(lyrics, artist):
return lyrics
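# Trimmed to the fields read above, a successful Custom Search reply is
# JSON shaped roughly like (illustrative values):
#
#   {"items": [{"link": "http://...", "title": "Artist - Song Lyrics"}]}
#
# while a failed one carries {"error": {"errors": [{"reason": "..."}]}}.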
# Lyrics scrapers.
def get_lyrics(artist, title):
"""Fetch lyrics, trying each source in turn."""
# Remove featuring artists from search
pattern = u"(.*) feat(uring|\.)?\s\S+"
artist_nofeat = re.findall(re.compile(pattern,re.IGNORECASE), artist)
if artist_nofeat:
artist = artist_nofeat[0][0]
for backend in BACKENDS:
lyrics = backend(artist, title)
if lyrics:
@@ -173,59 +408,83 @@ def get_lyrics(artist, title):
# Plugin logic.
def fetch_item_lyrics(lib, loglevel, item, write):
"""Fetch and store lyrics for a single item. If ``write``, then the
lyrics will also be written to the file itself. The ``loglevel``
parameter controls the visibility of the function's status log
messages.
"""
# Skip if the item already has lyrics.
if item.lyrics:
log.log(loglevel, u'lyrics already present: %s - %s' %
(item.artist, item.title))
return
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
# Fetch lyrics.
lyrics = get_lyrics(item.artist, item.title)
if not lyrics:
log.log(loglevel, u'lyrics not found: %s - %s' %
(item.artist, item.title))
return
options = {
'google_API_key': None,
'google_engine_ID': None,
}
def init_google_search(google_API_key, google_engine_ID):
options['google_API_key'] = google_API_key
options['google_engine_ID'] = google_engine_ID
log.log(loglevel, u'fetched lyrics: %s - %s' %
(item.artist, item.title))
item.lyrics = lyrics
if write:
item.write()
lib.store(item)
AUTOFETCH = True
class LyricsPlugin(BeetsPlugin):
def __init__(self):
super(LyricsPlugin, self).__init__()
self.import_stages = [self.imported]
self.config.add({
'auto': True,
'fallback': None,
'google_API_key': None,
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
'write': config['import']['write'].get(bool)
})
if self.config['google_API_key'].get():
init_google_search(self.config['google_API_key'].get(),
self.config['google_engine_ID'].get())
BACKENDS.insert(0, fetch_google)
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
cmd.parser.add_option('-p', '--print', dest='printlyr',
action='store_true', default=False,
help='print lyrics to console')
def func(lib, opts, args):
def func(lib, config, opts, args):
# The "write to files" option corresponds to the
# import_write config value.
write = config['import']['write'].get(bool)
for item in lib.items(ui.decargs(args)):
fetch_item_lyrics(lib, logging.INFO, item, write)
self.fetch_item_lyrics(lib, logging.INFO, item,
self.config['write'].get())
if opts.printlyr and item.lyrics:
ui.print_(item.lyrics)
cmd.func = func
return [cmd]
# Auto-fetch lyrics on import.
def imported(self, session, task):
if self.config['auto']:
def imported(self, config, task):
if self.config['auto'].get():
for item in task.imported_items():
fetch_item_lyrics(session.lib, logging.DEBUG, item, False)
self.fetch_item_lyrics(config.lib, logging.DEBUG, item,
self.config['write'].get())
def fetch_item_lyrics(self, lib, loglevel, item, write):
"""Fetch and store lyrics for a single item. If ``write``, then the
lyrics will also be written to the file itself. The ``loglevel``
parameter controls the visibility of the function's status log
messages.
"""
# Skip if the item already has lyrics.
if item.lyrics:
log.log(loglevel, u'lyrics already present: %s - %s' %
(item.artist, item.title))
return
# Fetch lyrics.
lyrics = get_lyrics(item.artist, item.title)
if not lyrics:
log.log(loglevel, u'lyrics not found: %s - %s' %
(item.artist, item.title))
if self.config['fallback'].get():
lyrics = self.config['fallback'].get()
else:
return
else:
log.log(loglevel, u'fetched lyrics: %s - %s' %
(item.artist, item.title))
item.lyrics = lyrics
if write:
item.write()
lib.store(item)


@@ -7,6 +7,13 @@ Namely, the current version of the plugin uses `Lyric Wiki`_ and `Lyrics.com`_.
.. _Lyric Wiki: http://lyrics.wikia.com/
.. _Lyrics.com: http://www.lyrics.com/
See :ref:`activate-google-custom-search` to expand the plugin's reach by using Google search to harvest lyrics from your own list of websites.
By default, if no lyrics are found, the file is left unchanged. To specify a placeholder for the lyrics tag when none are found, use the ``fallback`` configuration option::
    lyrics:
        fallback: 'No lyrics found'
Fetch Lyrics During Import
--------------------------
@@ -42,3 +49,27 @@ automatic lyrics fetching during import. To do so, add this to your
    lyrics:
        auto: no
.. _activate-google-custom-search:
Activate Google custom search
------------------------------
Using the Google backend requires `beautifulsoup`_, which you can install using `pip`_ by typing::
    pip install beautifulsoup4
To activate Google search, you must first register an API key at https://code.google.com/apis/console. Then click *API Access* and use that key for the ``google_API_key`` plugin option.
Optionally, you can define a custom search engine at http://www.google.com/cse/all. Click the *Search engine ID* button to display the token to copy into the ``google_engine_ID`` option.
By default, beets uses a list of sources known to be scrapable.
Example of ``config.yaml``::
    lyrics:
        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
        google_engine_ID: 009217259823014548361:lndtuqkycfu
.. _pip: http://www.pip-installer.org/
.. _beautifulsoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/