diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d2c9eb0e5..8d9f4b8ae 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1,5 +1,5 @@
# This file is part of beets.
-# Copyright 2012, Adrian Sampson.
+# Copyright 2013, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
@@ -16,24 +16,24 @@
"""
from __future__ import print_function
-import urllib
import re
import logging
+import urllib
+import json
+import unicodedata
+import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
-
+from beets.ui import commands
# Global logger.
log = logging.getLogger('beets')
-
-# Lyrics scrapers.
-
-COMMENT_RE = re.compile(r'<!--.*-->', re.S)
DIV_RE = re.compile(r'<(/?)div>?')
+COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'<br\s*/?>')
@@ -91,27 +91,26 @@ def extract_text(html, starttag):
return
lyrics = ''.join(parts)
+    return strip_cruft(lyrics)
+
+
+def strip_cruft(lyrics, wscollapse=True):
+ """Clean up lyrics"""
# Strip cruft.
lyrics = COMMENT_RE.sub('', lyrics)
lyrics = unescape(lyrics)
-    lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
+    if wscollapse:
+        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
lyrics = BREAK_RE.sub('\n', lyrics)  # <br> newlines.
lyrics = re.sub(r'\n +', '\n', lyrics)
lyrics = re.sub(r' +\n', '\n', lyrics)
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
- lyrics = lyrics.replace('\r','\n')
lyrics = lyrics.strip()
return lyrics
-def _encode(s):
- """Encode the string for inclusion in a URL (common to both
- LyricsWiki and Lyrics.com).
- """
- if isinstance(s, unicode):
- # Replace "fancy" apostrophes with straight ones.
- s = s.replace(u'\u2019', u"'")
- s = s.encode('utf8', 'ignore')
- return urllib.quote(s)
+#
+# Wikia db
+#
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
@@ -121,7 +120,10 @@ def _lw_encode(s):
s = s.replace("#", "Number_")
s = re.sub(r'[\[\{]', '(', s)
s = re.sub(r'[\]\}]', ')', s)
- return _encode(s)
+    if isinstance(s, unicode):
+        s = s.encode('utf8', 'ignore')
+    return urllib.quote(s)
+
def fetch_lyricswiki(artist, title):
"""Fetch lyrics from LyricsWiki."""
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@@ -133,6 +135,10 @@ def fetch_lyricswiki(artist, title):
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
+#
+# Lyrics.com db
+#
+
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
'Sorry, we do not have the lyric',
@@ -140,7 +146,9 @@ LYRICSCOM_NOT_FOUND = (
)
def _lc_encode(s):
s = re.sub(r'\s+', '-', s)
- return _encode(s)
+    if isinstance(s, unicode):
+        s = s.encode('utf8', 'ignore')
+    return urllib.quote(s)
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@@ -159,9 +167,236 @@ def fetch_lyricscom(artist, title):
if parts:
return parts[0]
-BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
+#
+# Google engine
+#
+
+def slugify(text, jokerChar=False, spaceChar=' '):
+    """Normalize a string and remove non-alpha characters.
+    Found at
+    http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
+    """
+    try:
+        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+        text = unicode(re.sub(r'[-\s]+', spaceChar, text))
+        if jokerChar is not False:
+            text = unicode(re.sub(r'[^\w\s]', jokerChar, text))
+    except UnicodeDecodeError:
+        log.exception("Failed to normalize '%s'" % text)
+
+    return urllib.quote(text)
+
+
+def isPageCandidate(urlLink, urlTitle, title, artist):
+    """Return True if the url title makes it a good candidate to be a
+    page that contains lyrics of title by artist."""
+
+    title = slugify(title.lower())
+    artist = slugify(artist.lower())
+    urlLink = slugify(urlLink.lower())
+    urlTitle = slugify(urlTitle.lower())
+
+    # Check if the url title contains the song title (exact match).
+    if urlTitle.find(title) != -1:
+        return True
+    # Or try extracting the song title from the url title and check if
+    # they are close enough.
+    songTitle = urlTitle.replace('lyrics', '').replace(artist, '').strip('%20')
+    if len(songTitle):
+        log.debug("Match ratio of '%s' with title: %s" %
+                  (songTitle,
+                   difflib.SequenceMatcher(None, songTitle, title).ratio()))
+
+    typoRatio = .8
+    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
+
+
+def insertLineFeeds(text):
+    """Insert a line feed before each uppercase character that directly
+    follows a lowercase one."""
+
+    tokensStr = re.split("([a-z][A-Z])", text)
+    for idx in range(1, len(tokensStr), 2):
+        ltoken = list(tokensStr[idx])
+        tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
+    return ''.join(tokensStr)
+
+
+def decimateLineFeeds(text):
+    """Decimate newline characters: use a single newline as the end-of-line
+    marker and keep at most two newlines in a row (e.g. to separate
+    verses)."""
+
+    # Remove the first occurrence of \n in each sequence of \n.
+    text = re.sub(r'\n(\n+)', r'\g<1>', text)
+    # Keep at most two \n in a row.
+    text = re.sub(r'\n\n+', '\n\n', text)
+    return text.strip('\n')
+
+
+def lyricsSanitizer(text):
+    """Clean up scraped text, returning raw lyrics: strip remaining (x)html
+    tags, restore line feeds and remove advertisement lines."""
+
+    text = strip_cruft(text, False)
+
+    # Restore \n in input text.
+    if text.find('\n') == -1:
+        text = insertLineFeeds(text)
+
+    # Suppress advertisement lines.
+    textLines = text.splitlines(True)
+    # Match lines with an opening bracket but no closing one, i.e. lines that
+    # contained an html link that was wiped out when scraping.
+    reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
+    # Match lines containing an url between brackets.
+    reAdTxt = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
+    textLines = [line for line in textLines
+                 if not (reAdHtml.match(line) or reAdTxt.match(line))]
+
+    # \n might have been duplicated during scraping. Decimate \n while
+    # newlines represent more than half the number of lines.
+    while len([x for x in textLines if x == '\n']) >= (len(textLines) / 2 - 1):
+        if len(textLines) <= 3:
+            break
+        text = ''.join(textLines)
+        text = decimateLineFeeds(text)
+        textLines = [line.strip(' ') for line in text.splitlines(True)]
+
+    return ''.join(textLines)
+
+
+def isLyricsAccepted(text, artist):
+    """Return True if text is considered valid lyrics."""
+
+    badTriggers = []
+    nbLines = text.count('\n')
+    if nbLines <= 1:
+        log.debug("Ignoring too short lyrics '%s'" % text)
+        return False
+    elif nbLines < 5:
+        badTriggers.append('too_short')
+
+    for item in [artist, 'lyrics', 'copyright', 'property']:
+        badTriggers += [item] * len(re.findall(r'\W%s\W' % re.escape(item),
+                                               text, re.I))
+
+    if badTriggers:
+        log.debug('Bad triggers detected: %s' % badTriggers)
+
+    return len(badTriggers) < 2
+
+
+def scrapeLyricsFromUrl(url):
+    """Scrape lyrics from url."""
+    # Import lazily so the plugin still loads when BeautifulSoup is missing.
+    from bs4 import BeautifulSoup, Tag
+
+    log.debug(u'Fetching %s' % url)
+    html = fetch_url(url)
+    soup = BeautifulSoup(html)
+
+    # Simplify the code by replacing some markers by the <p> marker.
+    try:
+        for tag in soup.findAll(['center', 'blockquote']):
+            pTag = Tag(soup, "p")
+            pTag.contents = tag.contents
+            tag.replaceWith(pTag)
+
+        for tag in soup.findAll(['script', 'a', 'font']):
+            tag.replaceWith('<br>')
+
+    except Exception, e:
+        log.debug('Error %s when replacing container markers with p markers'
+                  % e, exc_info=True)
+
+    for tag in soup.findAll('br'):
+        tag.replaceWith('\n')
+
+    # Keep only the tags that can possibly be parent tags, plus eol tags.
+    for tag in soup.findAll(True):
+        containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
+        if tag.name not in containers:
+            tag.extract()
+
+    # Make better soup from current soup! The previously unclosed <p>
+    # sections are now closed. Use str() rather than prettify() as it's
+    # more conservative concerning EOL.
+    soup = BeautifulSoup(str(soup))
+
+    # In case lyrics are nested in no markup but <body>,
+    # insert the whole body in a <p>.
+    bodyTag = soup.find('body')
+    if bodyTag is not None:
+        pTag = soup.new_tag("p")
+        bodyTag.parent.insert(0, pTag)
+        pTag.insert(0, bodyTag)
+
+    tagTokens = []
+    for tag in soup.findAll('p'):
+        soup2 = BeautifulSoup(str(tag))
+        # Extract all text of <p> section.
+        tagTokens += soup2.findAll(text=True)
+
+    if tagTokens:
+        # Lyrics are expected to be the longest paragraph.
+        tagTokens = sorted(tagTokens, key=len, reverse=True)
+        soup = BeautifulSoup(tagTokens[0])
+        if soup.findAll(['div', 'a']):
+            return None
+        return unescape(tagTokens[0].strip("\n\r: "))
+
+    return None
+
+
+def fetch_google(artist, title):
+    """Fetch lyrics from google results."""
+
+    QUERY = u"%s %s" % (artist, title)
+    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
+          (options['google_API_key'], options['google_engine_ID'],
+           urllib.quote(QUERY.encode('utf8')))
+
+    data = urllib.urlopen(url)
+    data = json.load(data)
+    if 'error' in data:
+        reason = data['error']['errors'][0]['reason']
+        log.debug(u'google lyrics backend error: %s' % reason)
+        return None
+
+    if 'items' in data:
+        for item in data['items']:
+            urlLink = item['link']
+            urlTitle = item['title']
+            if not isPageCandidate(urlLink, urlTitle, title, artist):
+                continue
+            lyrics = scrapeLyricsFromUrl(urlLink)
+            if not lyrics:
+                continue
+
+            lyrics = lyricsSanitizer(lyrics)
+            if isLyricsAccepted(lyrics, artist):
+                return lyrics
+
+
+# Lyrics scrapers.
+
 def get_lyrics(artist, title):
     """Fetch lyrics, trying each source in turn."""
+
+    # Remove featuring artists from search.
+    pattern = u"(.*) feat(uring|\.)?\s\S+"
+    artist_nofeat = re.findall(re.compile(pattern, re.IGNORECASE), artist)
+    if artist_nofeat:
+        artist = artist_nofeat[0][0]
+
     for backend in BACKENDS:
         lyrics = backend(artist, title)
         if lyrics:
@@ -173,59 +408,83 @@ def get_lyrics(artist, title):

 # Plugin logic.

-def fetch_item_lyrics(lib, loglevel, item, write):
-    """Fetch and store lyrics for a single item. If ``write``, then the
-    lyrics will also be written to the file itself. The ``loglevel``
-    parameter controls the visibility of the function's status log
-    messages.
-    """
-    # Skip if the item already has lyrics.
-    if item.lyrics:
-        log.log(loglevel, u'lyrics already present: %s - %s' %
-                (item.artist, item.title))
-        return
+BACKENDS = [fetch_lyricswiki, fetch_lyricscom]

-    # Fetch lyrics.
-    lyrics = get_lyrics(item.artist, item.title)
-    if not lyrics:
-        log.log(loglevel, u'lyrics not found: %s - %s' %
-                (item.artist, item.title))
-        return
+options = {
+    'google_API_key': None,
+    'google_engine_ID': None,
+}

-    log.log(loglevel, u'fetched lyrics: %s - %s' %
-            (item.artist, item.title))
-    item.lyrics = lyrics
-    if write:
-        item.write()
-    lib.store(item)

+def init_google_search(google_API_key, google_engine_ID):
+    options['google_API_key'] = google_API_key
+    options['google_engine_ID'] = google_engine_ID

-AUTOFETCH = True

 class LyricsPlugin(BeetsPlugin):
     def __init__(self):
         super(LyricsPlugin, self).__init__()
         self.import_stages = [self.imported]
         self.config.add({
             'auto': True,
+            'google_API_key': None,
+            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
+            'fallback': None,
+            'write': config['import']['write'].get(bool),
         })
+
+        if self.config['google_API_key'].get():
+            init_google_search(self.config['google_API_key'].get(),
+                               self.config['google_engine_ID'].get())
+            BACKENDS.insert(0, fetch_google)

     def commands(self):
         cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
         cmd.parser.add_option('-p', '--print', dest='printlyr',
                               action='store_true', default=False,
                               help='print lyrics to console')

-        def func(lib, opts, args):
+        def func(lib, config, opts, args):
             # The "write to files" option corresponds to the
             # import_write config value.
-            write = config['import']['write'].get(bool)
+
             for item in lib.items(ui.decargs(args)):
-                fetch_item_lyrics(lib, logging.INFO, item, write)
+                self.fetch_item_lyrics(lib, logging.INFO, item,
+                                       self.config['write'].get())
                 if opts.printlyr and item.lyrics:
                     ui.print_(item.lyrics)

         cmd.func = func
         return [cmd]
+
     # Auto-fetch lyrics on import.
-    def imported(self, session, task):
-        if self.config['auto']:
+    def imported(self, config, task):
+        if self.config['auto'].get():
             for item in task.imported_items():
-                fetch_item_lyrics(session.lib, logging.DEBUG, item, False)
+                self.fetch_item_lyrics(config.lib, logging.DEBUG, item,
+                                       self.config['write'].get())
+
+    def fetch_item_lyrics(self, lib, loglevel, item, write):
+        """Fetch and store lyrics for a single item. If ``write``, then the
+        lyrics will also be written to the file itself. The ``loglevel``
+        parameter controls the visibility of the function's status log
+        messages.
+        """
+        # Skip if the item already has lyrics.
+        if item.lyrics:
+            log.log(loglevel, u'lyrics already present: %s - %s' %
+                    (item.artist, item.title))
+            return
+
+        # Fetch lyrics.
+        lyrics = get_lyrics(item.artist, item.title)
+        if not lyrics:
+            log.log(loglevel, u'lyrics not found: %s - %s' %
+                    (item.artist, item.title))
+            if self.config['fallback'].get():
+                lyrics = self.config['fallback'].get()
+            else:
+                return
+        else:
+            log.log(loglevel, u'fetched lyrics: %s - %s' %
+                    (item.artist, item.title))
+
+        item.lyrics = lyrics
+        if write:
+            item.write()
+        lib.store(item)
\ No newline at end of file
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index e6a348133..673bf2c9e 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -7,6 +7,13 @@ Namely, the current version of the plugin uses `Lyric Wiki`_ and `Lyrics.com`_.
 .. _Lyric Wiki: http://lyrics.wikia.com/
 .. _Lyrics.com: http://www.lyrics.com/

+See :ref:`activate-google-custom-search` to expand the plugin's reach by using Google search to harvest lyrics from your own list of websites.
+
+By default, if no lyrics are found, the file is left unchanged. To specify a placeholder for the lyrics tag when none are found, use the ``fallback`` configuration option::
+
+    lyrics:
+        fallback: 'No lyrics found'
+
 Fetch Lyrics During Import
 --------------------------

@@ -42,3 +49,27 @@ automatic lyrics fetching during import. To do so, add this to your

     lyrics:
         auto: no
+
+.. _activate-google-custom-search:
+
+Activate Google custom search
+------------------------------
+
+Using the Google backend requires `beautifulsoup`_, which you can install using `pip`_ by typing::
+
+    pip install beautifulsoup4
+
+To activate Google search, you must first register an API key at https://code.google.com/apis/console. Then click *API Access* and use that key for the ``google_API_key`` plugin option.
+
+Optionally, you can define a custom search engine at http://www.google.com/cse/all. Click the *Search engine ID* button to display the token to copy into the ``google_engine_ID`` option.
+By default, beets uses a list of sources known to be scrapable.
+
+Example of ``config.yaml``::
+
+    lyrics:
+        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
+        google_engine_ID: 009217259823014548361:lndtuqkycfu
+
+.. _pip: http://www.pip-installer.org/
+.. _beautifulsoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
\ No newline at end of file
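
The new Google backend can be exercised outside of beets for a quick smoke test (a minimal sketch, not part of the patch: it assumes the patched module is importable as ``beetsplug.lyrics``, that ``YOUR_API_KEY`` is replaced with a real key, and it reuses the patch's default engine ID)::

    from beetsplug import lyrics

    # Hypothetical manual check of the Google backend added by this patch.
    lyrics.init_google_search('YOUR_API_KEY',
                              '009217259823014548361:lndtuqkycfu')
    print(lyrics.fetch_google(u'The Beatles', u'Let It Be'))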