Merge pull request #243 from KraYmer/master

Add a lyrics backend that scrapes results from google custom search api
Adrian Sampson 2013-04-15 10:19:31 -07:00
commit 1622dcefb7
2 changed files with 329 additions and 35 deletions

beetsplug/lyrics.py

@@ -16,24 +16,24 @@
"""
from __future__ import print_function
import re
import logging
import urllib
import json
import unicodedata
import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
from beets.ui import commands
# Global logger.
log = logging.getLogger('beets')
# Lyrics scrapers.
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
DIV_RE = re.compile(r'<(/?)div>?')
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'<br\s*/?>')
@@ -90,16 +90,20 @@ def extract_text(html, starttag):
        print('no closing tag found!')
        return
    lyrics = ''.join(parts)
    return strip_cruft(lyrics)

def strip_cruft(lyrics, wscollapse=True):
    """Clean up lyrics."""
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    if wscollapse:
        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.
    lyrics = lyrics.replace('\r', '\n')
    lyrics = lyrics.strip()
    return lyrics
@@ -113,6 +117,10 @@ def _encode(s):
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)
#
# Wikia db
#
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
    s = re.sub(r'\s+', '_', s)
@@ -122,6 +130,7 @@ def _lw_encode(s):
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    return _encode(s)
def fetch_lyricswiki(artist, title):
"""Fetch lyrics from LyricsWiki."""
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@ -133,6 +142,10 @@ def fetch_lyricswiki(artist, title):
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
#
# Lyrics.com db
#
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
    'Sorry, we do not have the lyric',
@@ -141,6 +154,7 @@ LYRICSCOM_NOT_FOUND = (
def _lc_encode(s):
    s = re.sub(r'\s+', '-', s)
    return _encode(s)
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@ -159,9 +173,236 @@ def fetch_lyricscom(artist, title):
if parts:
return parts[0]
#
# Google engine
#
def slugify(text, jokerChar=False, spaceChar=' '):
    """Normalize a string and remove non-alpha characters.
    Found at http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
    """
    try:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = unicode(re.sub('[-\s]+', spaceChar, text))
        if jokerChar is not False:
            text = unicode(re.sub('[^\w\s]', jokerChar, text))
    except UnicodeDecodeError:
        log.exception("Failed to normalize '%s'" % text)
    return urllib.quote(text)
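# For example, slugify strips accents and URL-quotes the whitespace:
#   slugify(u'Mötley Crüe')  # -> 'Motley%20Crue'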
def is_page_candidate(urlLink, urlTitle, title, artist):
    """Return True if the URL title makes it a good candidate for a page
    containing the lyrics of title by artist."""
    title = slugify(title.lower())
    artist = slugify(artist.lower())
    urlLink = slugify(urlLink.lower())
    urlTitle = slugify(urlTitle.lower())
    # Check if the URL title contains the song title (exact match).
    if urlTitle.find(title) != -1:
        return True
    # Or try extracting the song title from the URL title and check whether
    # the two are close enough.
    songTitle = urlTitle.replace('lyrics', '').replace(artist, '').strip('%20')
    if len(songTitle):
        log.debug("Match ratio of '%s' with title: %s" %
                  (songTitle,
                   difflib.SequenceMatcher(None, songTitle, title).ratio()))
    typoRatio = .8
    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
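# For example, a result titled 'Adele - Hello Lyrics' passes the exact-match
# branch for title='Hello', artist='Adele', while a misspelled
# 'Adele - Helo Lyrics' can still pass the fuzzy branch, since
# SequenceMatcher(None, 'helo', 'hello').ratio() is about 0.89 > typoRatio.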
def insert_line_feeds(text):
    """Insert a \\n before each upper-case character that directly follows a
    lower-case one."""
    tokensStr = re.split("([a-z][A-Z])", text)
    for idx in range(1, len(tokensStr), 2):
        ltoken = list(tokensStr[idx])
        tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
    return ''.join(tokensStr)
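# For example: insert_line_feeds('my heartGoes on') -> 'my heart\nGoes on'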
def decimate_line_feeds(text):
    """Decimate \\n characters: use a single \\n as the end-of-line marker
    and keep at most two \\n in a row (e.g. to separate verses)."""
    # Remove the first \n of each sequence of consecutive \n.
    text = re.sub(r'\n(\n+)', '\g<1>', text)
    # Keep at most two \n in a row.
    text = re.sub(r'\n\n+', '\n\n', text)
    return text.strip('\n')
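# For example, decimate_line_feeds('a\n\n\n\nb') returns 'a\n\nb': each run
# of linefeeds loses one, and any remaining run is then capped at two.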
def sanetize_lyrics(text):
    """Clean text, returning raw lyrics as output or None if it turns out
    that the input text is not lyrics content. Cleans (x)html tags in the
    text and corrects layout and syntax."""
    text = strip_cruft(text, False)
    # Restore \n in input text.
    if text.find('\n') == -1:
        text = insert_line_feeds(text)
    # Suppress advertisement lines.
    textLines = text.splitlines(True)
    # Match lines with an opening bracket but no closing one, i.e. lines that
    # contained an html link that was wiped out during scraping.
    reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
    # Match lines containing a url between brackets.
    reAdTxt = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
    textLines = [line for line in textLines
                 if not reAdHtml.match(line) and not reAdTxt.match(line)]
    # \n might have been duplicated during the scraping: decimate \n while
    # the number of \n represents more than half the number of lines.
    while len([x for x in textLines if x == '\n']) >= (len(textLines) / 2 - 1):
        if len(textLines) <= 3:
            break
        text = ''.join(textLines)
        text = decimate_line_feeds(text)
        textLines = [line.strip(' ') for line in text.splitlines(True)]
    return ''.join(textLines)
def is_lyrics_accepted(text, artist):
    """Return True if text is considered to be valid lyrics."""
    badTriggers = []
    nbLines = text.count('\n')
    if nbLines <= 1:
        log.debug("Ignoring too short lyrics '%s'" % text)
        return False
    elif nbLines < 5:
        badTriggers.append('too_short')
    for item in [artist, 'lyrics', 'copyright', 'property']:
        badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))
    if badTriggers:
        log.debug('Bad triggers detected: %s' % badTriggers)
    return len(badTriggers) < 2
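# For example, a text with only three line breaks already carries the
# 'too_short' trigger, so a single occurrence of 'copyright' in it makes two
# triggers and the candidate lyrics are rejected.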
def scrape_lyrics_from_url(url):
    """Scrape lyrics from a url."""
    from bs4 import BeautifulSoup

    log.debug(u'fetching lyrics from %s' % url)
    html = fetch_url(url)
    soup = BeautifulSoup(html)
    # Simplify the code by replacing some markers with the <p> marker.
    try:
        for tag in soup.findAll(['center', 'blockquote']):
            pTag = soup.new_tag('p')
            pTag.contents = tag.contents
            tag.replaceWith(pTag)
        for tag in soup.findAll(['script', 'a', 'font']):
            tag.replaceWith('<p>')
    except Exception as e:
        log.debug('Error %s when replacing containing marker by p marker' % e,
                  exc_info=True)
    for tag in soup.findAll('br'):
        tag.replaceWith('\n')
    # Keep only tags that can possibly be parent tags, plus eol.
    containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
    for tag in soup.findAll(True):
        if tag.name not in containers:
            tag.extract()
    # Make better soup from the current soup! The previously unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it is more
    # conservative concerning EOL.
    soup = BeautifulSoup(str(soup))
    # In case the lyrics are nested in no markup but <body>, insert the whole
    # body in a <p>.
    bodyTag = soup.find('body')
    if bodyTag is not None:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)
    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        # Extract all text of the <p> section.
        tagTokens += soup2.findAll(text=True)
    if tagTokens:
        # Lyrics are expected to be the longest paragraph.
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        soup = BeautifulSoup(tagTokens[0])
        if soup.findAll(['div', 'a']):
            return None
        return unescape(tagTokens[0].strip("\n\r: "))
    return None
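# For example, for a page whose <body> holds only '<div>Verse one<br>Verse
# two</div>', the longest paragraph-level text run, 'Verse one\nVerse two',
# is returned with all markup stripped.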
def fetch_google(artist, title):
    """Fetch lyrics from Google search results."""
    query = u"%s %s" % (artist, title)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
          (options['google_API_key'], options['google_engine_ID'],
           urllib.quote(query.encode('utf8')))
    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: %s' % reason)
        return None
    if 'items' in data:
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item['title']
            if not is_page_candidate(urlLink, urlTitle, title, artist):
                continue
            lyrics = scrape_lyrics_from_url(urlLink)
            if not lyrics:
                continue
            lyrics = sanetize_lyrics(lyrics)
            if is_lyrics_accepted(lyrics, artist):
                return lyrics
# Lyrics scrapers.
def get_lyrics(artist, title):
    """Fetch lyrics, trying each source in turn."""
    # Remove featuring artists from the search string.
    pattern = u"(.*) feat(uring|\.)?\s\S+"
    artist_nofeat = re.findall(re.compile(pattern, re.IGNORECASE), artist)
    if artist_nofeat:
        artist = artist_nofeat[0][0]
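    # For example, the pattern above reduces 'Daft Punk feat. Pharrell
    # Williams' to 'Daft Punk' before the backends are queried.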
    for backend in BACKENDS:
        lyrics = backend(artist, title)
        if lyrics:
@@ -173,40 +414,31 @@ def get_lyrics(artist, title):
# Plugin logic.

BACKENDS = [fetch_lyricswiki, fetch_lyricscom]

options = {
    'google_API_key': None,
    'google_engine_ID': None,
}

def init_google_search(google_API_key, google_engine_ID):
    options['google_API_key'] = google_API_key
    options['google_engine_ID'] = google_engine_ID
class LyricsPlugin(BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': False,
        })
        if self.config['google_API_key'].get():
            init_google_search(self.config['google_API_key'].get(),
                               self.config['google_engine_ID'].get())
            BACKENDS.insert(0, fetch_google)
    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@@ -224,8 +456,39 @@ class LyricsPlugin(BeetsPlugin):
        cmd.func = func
        return [cmd]
    # Auto-fetch lyrics on import.
    def imported(self, session, task):
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                       False)

    def fetch_item_lyrics(self, lib, loglevel, item, write):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself. The ``loglevel``
        parameter controls the visibility of the function's status log
        messages.
        """
        # Skip if the item already has lyrics.
        if item.lyrics:
            log.log(loglevel, u'lyrics already present: %s - %s' %
                    (item.artist, item.title))
            return

        # Fetch lyrics.
        lyrics = get_lyrics(item.artist, item.title)
        if not lyrics:
            log.log(loglevel, u'lyrics not found: %s - %s' %
                    (item.artist, item.title))
            if self.config['fallback'].get():
                lyrics = self.config['fallback'].get()
            else:
                return
        else:
            log.log(loglevel, u'fetched lyrics: %s - %s' %
                    (item.artist, item.title))

        item.lyrics = lyrics
        if write:
            item.write()
        lib.store(item)

docs/plugins/lyrics.rst

@@ -7,6 +7,13 @@ Namely, the current version of the plugin uses `Lyric Wiki`_ and `Lyrics.com`_.
.. _Lyric Wiki: http://lyrics.wikia.com/
.. _Lyrics.com: http://www.lyrics.com/
See :ref:`activate-google-custom-search` to expand the plugin's firepower by
using Google search to harvest lyrics from your own list of websites.
By default, if no lyrics are found, the file will be left unchanged. To
specify a placeholder for the lyrics tags when none are found, use the
``fallback`` configuration option::

    lyrics:
        fallback: 'No lyrics found'
Fetch Lyrics During Import
--------------------------
@@ -42,3 +49,27 @@ automatic lyrics fetching during import. To do so, add this to your

    lyrics:
        auto: no
.. _activate-google-custom-search:
Activate Google custom search
------------------------------
Using the Google backend requires `beautifulsoup`_, which you can install
using `pip`_ by typing::

    pip install beautifulsoup4
To activate Google custom search, you must first register an API key on
https://code.google.com/apis/console. Then click *API Access* and use that
key for the ``google_API_key`` plugin option.

Optionally, you can define a custom search engine on
http://www.google.com/cse/all. Click the *Search engine ID* button to display
the token to copy into the ``google_engine_ID`` option.
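For reference, each lookup the plugin performs is a request against the JSON
Custom Search API of roughly this shape (the key and engine ID here stand in
for your own values)::

    https://www.googleapis.com/customsearch/v1?key=<google_API_key>&cx=<google_engine_ID>&q=<artist>%20<title>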
By default, beets uses a list of sources known to be scrapable.
Example of ``config.yaml``::

    lyrics:
        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
        google_engine_ID: 009217259823014548361:lndtuqkycfu
.. _pip: http://www.pip-installer.org/
.. _beautifulsoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/