Mirror of https://github.com/beetbox/beets.git, synced 2025-12-14 12:35:19 +01:00
Add a lyrics backend that scrapes results from the Google Custom Search API.
Add a 'fallback' option to help work around Google's 100-queries/day limit by marking files as 'visited' so they are not considered for lyrics search on the next beet run. I've put my own google_engine_ID as the default value in the code, though this could be reconsidered; this engine does contain databases known to be scrapable by the plugin's algorithm.
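A minimal ``config.yaml`` sketch exercising the options described above (the API key is a placeholder; the engine ID is the default baked into the code; ``fallback`` takes any placeholder string)::

    lyrics:
        google_API_key: YOUR_API_KEY
        google_engine_ID: 009217259823014548361:lndtuqkycfu
        fallback: 'No lyrics found'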
This commit is contained in:
parent
70b528ed81
commit
cfb6735e43
2 changed files with 342 additions and 52 deletions
beetsplug/lyrics.py
@@ -1,5 +1,5 @@
# This file is part of beets.
# Copyright 2013, Adrian Sampson.
# Copyright 2012, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@@ -16,24 +16,24 @@
"""
from __future__ import print_function

import urllib
import re
import logging
import urllib
import json
import unicodedata
import difflib

from beets.plugins import BeetsPlugin
from beets import ui
from beets import config

from beets.ui import commands

# Global logger.

log = logging.getLogger('beets')


# Lyrics scrapers.

COMMENT_RE = re.compile(r'<!--.*-->', re.S)
DIV_RE = re.compile(r'<(/?)div>?')
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'<br\s*/?>')
@@ -91,27 +91,26 @@ def extract_text(html, starttag):
        return
    lyrics = ''.join(parts)
    lyrics = strip_cruft(lyrics)


def strip_cruft(lyrics, wscollapse=True):
    """Clean up lyrics"""
    # Strip cruft.
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    if wscollapse:
        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.
    lyrics = lyrics.replace('\r', '\n')
    lyrics = lyrics.strip()
    return lyrics


def _encode(s):
    """Encode the string for inclusion in a URL (common to both
    LyricsWiki and Lyrics.com).
    """
    if isinstance(s, unicode):
        # Replace "fancy" apostrophes with straight ones.
        s = s.replace(u'\u2019', u"'")
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)


#
# Wikia db
#

LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'

def _lw_encode(s):
@@ -121,7 +120,10 @@ def _lw_encode(s):
    s = s.replace("#", "Number_")
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    return _encode(s)
    if isinstance(s, unicode):
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)


def fetch_lyricswiki(artist, title):
    """Fetch lyrics from LyricsWiki."""
    url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@@ -133,6 +135,10 @@ def fetch_lyricswiki(artist, title):
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics


#
# Lyrics.com db
#

LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
    'Sorry, we do not have the lyric',
@@ -140,7 +146,9 @@ LYRICSCOM_NOT_FOUND = (
)

def _lc_encode(s):
    s = re.sub(r'\s+', '-', s)
    return _encode(s)
    if isinstance(s, unicode):
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)

def fetch_lyricscom(artist, title):
    """Fetch lyrics from Lyrics.com."""
    url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@@ -159,9 +167,236 @@ def fetch_lyricscom(artist, title):
    if parts:
        return parts[0]

BACKENDS = [fetch_lyricswiki, fetch_lyricscom]


#
# Google engine
#

def slugify(text, jokerChar=False, spaceChar=' '):
    """Normalize the string and remove non-alpha characters.
    Found at
    http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename\
    -in-python
    """
    try:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = unicode(re.sub('[-\s]+', spaceChar, text))
        if jokerChar is not False:
            text = unicode(re.sub('[^\w\s]', jokerChar, text))
    except UnicodeDecodeError:
        log.exception("Failed to normalize '%s'" % text)
    return urllib.quote(text)


def isPageCandidate(urlLink, urlTitle, title, artist):
    '''Return True if the URL title makes it a good candidate for a
    page that contains lyrics of title by artist.'''
    title = slugify(title.lower())
    artist = slugify(artist.lower())
    urlLink = slugify(urlLink.lower())
    urlTitle = slugify(urlTitle.lower())

    # Check if the URL title contains the song title (exact match)
    if urlTitle.find(title) != -1:
        return True
    # or try extracting the song title from the URL title and check if
    # they are close enough
    songTitle = urlTitle.replace('lyrics', '')\
                        .replace(artist, '').strip('%20')
    if len(songTitle):
        log.debug("Match ratio of '%s' with title: %s" %
                  (songTitle, difflib.SequenceMatcher(
                      None, songTitle, title).ratio()))

    typoRatio = .8
    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio


def insertLineFeeds(text):
    """Insert \n before each uppercase character that directly follows
    a lowercase one."""
    tokensStr = re.split("([a-z][A-Z])", text)
    for idx in range(1, len(tokensStr), 2):
        ltoken = list(tokensStr[idx])
        tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
    return ''.join(tokensStr)


def decimateLineFeeds(text):
    """Decimate \n characters, using only one \n as the EOL marker by
    default. Keep at most two \n in a row (e.g. to separate verses)."""
    # Remove the first occurrence of \n in each sequence of \n
    text = re.sub(r'\n(\n+)', '\g<1>', text)
    # Keep at most two \n in a row
    text = re.sub(r'\n\n+', '\n\n', text)
    return text.strip('\n')


def lyricsSanetizer(text):
    """Clean text, returning raw lyrics as output, or None if the input
    text turns out not to be lyrics content. Cleans (x)html tags in the
    text and corrects layout and syntax."""
    text = strip_cruft(text, False)

    # Restore \n in the input text
    if text.find('\n') == -1:
        text = insertLineFeeds(text)

    # Suppress advertisements
    textLines = text.splitlines(True)
    # Match lines with an opening bracket but no closing one, i.e. lines that
    # contained an html link that was wiped out during scraping.
    reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
    # Match lines containing a url between brackets
    reAdTxt = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
    for line in textLines[:]:  # iterate over a copy since lines are removed
        if (re.match(reAdHtml, line) != None) or \
           (re.match(reAdTxt, line) != None):
            textLines.remove(line)

    # \n might have been duplicated during the scraping.
    # Decimate \n while the number of \n represents more than half the
    # number of lines
    while len([x for x in textLines if x == '\n']) >= (len(textLines) / 2 - 1):
        if len(textLines) <= 3:
            break
        text = ''.join(textLines)
        text = decimateLineFeeds(text)
        textLines = [line.strip(' ') for line in text.splitlines(True)]

    return ''.join(textLines)


def isLyricsAccepted(text, artist):
    """Return True if text is considered valid lyrics."""
    badTriggers = []
    nbLines = text.count('\n')
    if nbLines <= 1:
        log.debug("Ignoring too short lyrics '%s'" % text)
        return 0
    elif nbLines < 5:
        badTriggers.append('too_short')

    for item in [artist, 'lyrics', 'copyright', 'property']:
        badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))

    if len(badTriggers):
        log.debug('Bad triggers detected: %s' % badTriggers)

    return len(badTriggers) < 2


def scrapLyricsFromUrl(url):
    '''Scrape lyrics from url.'''
    from bs4 import BeautifulSoup, Tag
    print(url)
    html = fetch_url(url)
    soup = BeautifulSoup(html)

    # Simplify the code by replacing some markers with the <p> marker
    try:
        for tag in soup.findAll(['center', 'blockquote']):
            pTag = Tag(soup, "p")
            pTag.contents = tag.contents
            tag.replaceWith(pTag)

        for tag in soup.findAll(['script', 'a', 'font']):
            tag.replaceWith('<p>')
    except Exception, e:
        log.debug('Error %s when replacing containing marker by p marker' % e,
                  exc_info=True)

    for tag in soup.findAll('br'):
        tag.replaceWith('\n')

    # Keep only tags that can possibly be parent tags, and eol
    for tag in soup.findAll(True):
        containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
        if tag.name not in containers:
            tag.extract()

    # Make better soup from the current soup! The previously unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it's more
    # conservative concerning EOL
    soup = BeautifulSoup(str(soup))

    # In case the lyrics are nested in no markup but <body>,
    # insert the whole body in a <p>
    bodyTag = soup.find('body')
    if bodyTag != None:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)

    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        tagTokens += soup2.findAll(text=True)  # Extract all text of <p> section

    if tagTokens != []:
        # Lyrics are expected to be the longest paragraph
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        soup = BeautifulSoup(tagTokens[0])
        if soup.findAll(['div', 'a']) != []:
            return None
        return unescape(tagTokens[0].strip("\n\r: "))

    return None


def fetch_google(artist, title):
    """Fetch lyrics from Google search results."""
    QUERY = u"%s %s" % (artist, title)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
          (options['google_API_key'], options['google_engine_ID'],
           urllib.quote(QUERY.encode('utf8')))

    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: %s' % reason)
        return None

    if 'items' in data.keys():
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item['title']
            if not isPageCandidate(urlLink, urlTitle, title, artist):
                continue
            lyrics = scrapLyricsFromUrl(urlLink)
            if lyrics == None or len(lyrics) == 0:
                continue

            lyrics = lyricsSanetizer(lyrics)

            if isLyricsAccepted(lyrics, artist):
                return lyrics


# Lyrics scrapers.

def get_lyrics(artist, title):
    """Fetch lyrics, trying each source in turn."""
    # Remove featuring artists from the search
    pattern = u"(.*) feat(uring|\.)?\s\S+"
    artist_nofeat = re.findall(re.compile(pattern, re.IGNORECASE), artist)
    if artist_nofeat:
        artist = artist_nofeat[0][0]

    for backend in BACKENDS:
        lyrics = backend(artist, title)
        if lyrics:
@@ -173,59 +408,83 @@ def get_lyrics(artist, title):

# Plugin logic.

def fetch_item_lyrics(lib, loglevel, item, write):
    """Fetch and store lyrics for a single item. If ``write``, then the
    lyrics will also be written to the file itself. The ``loglevel``
    parameter controls the visibility of the function's status log
    messages.
    """
    # Skip if the item already has lyrics.
    if item.lyrics:
        log.log(loglevel, u'lyrics already present: %s - %s' %
                (item.artist, item.title))
        return
BACKENDS = [fetch_lyricswiki, fetch_lyricscom]

    # Fetch lyrics.
    lyrics = get_lyrics(item.artist, item.title)
    if not lyrics:
        log.log(loglevel, u'lyrics not found: %s - %s' %
                (item.artist, item.title))
        return
options = {
    'google_API_key': None,
    'google_engine_ID': None,
}

def init_google_search(google_API_key, google_engine_ID):
    options['google_API_key'] = google_API_key
    options['google_engine_ID'] = google_engine_ID

    log.log(loglevel, u'fetched lyrics: %s - %s' %
            (item.artist, item.title))
    item.lyrics = lyrics
    if write:
        item.write()
    lib.store(item)

AUTOFETCH = True

class LyricsPlugin(BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'write': config['import']['write'].get(bool)
        })

        if self.config['google_API_key'].get():
            init_google_search(self.config['google_API_key'].get(),
                               self.config['google_engine_ID'].get())
            BACKENDS.insert(0, fetch_google)

    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
        cmd.parser.add_option('-p', '--print', dest='printlyr',
                              action='store_true', default=False,
                              help='print lyrics to console')

        def func(lib, opts, args):
        def func(lib, config, opts, args):
            # The "write to files" option corresponds to the
            # import_write config value.
            write = config['import']['write'].get(bool)

            for item in lib.items(ui.decargs(args)):
                fetch_item_lyrics(lib, logging.INFO, item, write)
                self.fetch_item_lyrics(lib, logging.INFO, item,
                                       self.config['write'].get())
                if opts.printlyr and item.lyrics:
                    ui.print_(item.lyrics)
        cmd.func = func
        return [cmd]

    # Auto-fetch lyrics on import.
    def imported(self, session, task):
        if self.config['auto']:
    def imported(self, config, task):
        if self.config['auto'].get():
            for item in task.imported_items():
                fetch_item_lyrics(session.lib, logging.DEBUG, item, False)
                self.fetch_item_lyrics(config.lib, logging.DEBUG, item,
                                       self.config['write'].get())

    def fetch_item_lyrics(self, lib, loglevel, item, write):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself. The ``loglevel``
        parameter controls the visibility of the function's status log
        messages.
        """
        # Skip if the item already has lyrics.
        if item.lyrics:
            log.log(loglevel, u'lyrics already present: %s - %s' %
                    (item.artist, item.title))
            return

        # Fetch lyrics.
        lyrics = get_lyrics(item.artist, item.title)
        if not lyrics:
            log.log(loglevel, u'lyrics not found: %s - %s' %
                    (item.artist, item.title))
            if self.config['fallback'].get():
                lyrics = self.config['fallback'].get()
            else:
                return
        else:
            log.log(loglevel, u'fetched lyrics: %s - %s' %
                    (item.artist, item.title))
        item.lyrics = lyrics
        if write:
            item.write()
        lib.store(item)
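For illustration, a standalone sketch (not part of the commit) of the title-matching heuristic that isPageCandidate implements above; the search-result title and song data are made up, and the slugify/percent-encoding step is skipped for brevity::

    import difflib

    # Hypothetical search-result title and query, already lowercased.
    url_title = 'the beatles - let it be lyrics'
    song_title = 'let it be'
    artist = 'the beatles'

    # Strip 'lyrics' and the artist name, as isPageCandidate does, then
    # compare what remains against the song title with difflib.
    candidate = url_title.replace('lyrics', '').replace(artist, '').strip(' -')
    ratio = difflib.SequenceMatcher(None, candidate, song_title).ratio()
    print(ratio > 0.8)  # page accepted when ratio exceeds typoRatio (.8)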
docs/plugins/lyrics.rst
@@ -7,6 +7,13 @@ Namely, the current version of the plugin uses `Lyric Wiki`_ and `Lyrics.com`_.
.. _Lyric Wiki: http://lyrics.wikia.com/
.. _Lyrics.com: http://www.lyrics.com/

See :ref:`activate-google-custom-search` to expand the plugin's firepower by using Google search to harvest lyrics from your own list of websites.

By default, if no lyrics are found, the file is left unchanged. To specify a placeholder for the lyrics tags when none are found, use the ``fallback`` configuration option::

    lyrics:
        fallback: 'No lyrics found'

Fetch Lyrics During Import
--------------------------
@@ -42,3 +49,27 @@ automatic lyrics fetching during import. To do so, add this to your

    lyrics:
        auto: no

.. _activate-google-custom-search:

Activate Google custom search
------------------------------

Using the Google backend requires `beautifulsoup`_, which you can install using `pip`_ by typing::

    pip install beautifulsoup4

To activate Google search, you must first register an API key at https://code.google.com/apis/console. Then click *API Access* and use that key for the ``google_API_key`` plugin option.

Optionally, you can define a custom search engine at http://www.google.com/cse/all. Click the *Search engine ID* button to display the token to copy into the ``google_engine_ID`` option. By default, beets uses a list of sources known to be scrapable.

Example ``config.yaml``::

    lyrics:
        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
        google_engine_ID: 009217259823014548361:lndtuqkycfu

.. _pip: http://www.pip-installer.org/
.. _beautifulsoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
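To try the new backend by hand, something along these lines should work from a Python shell (a sketch: it assumes the plugin module is importable as ``beetsplug.lyrics`` and that you substitute a valid API key for the placeholder)::

    from beetsplug import lyrics as lyricsplug

    # init_google_search and fetch_google are defined in this commit;
    # the key below is a placeholder.
    lyricsplug.init_google_search('YOUR_API_KEY',
                                  '009217259823014548361:lndtuqkycfu')
    print(lyricsplug.fetch_google(u'The Beatles', u'Let It Be'))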