diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d2b8bb84a..03ac7d26a 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -18,7 +18,7 @@ from __future__ import print_function
import re
import logging
-import urllib
+import urllib
import json
import unicodedata
import difflib
@@ -26,7 +26,7 @@ import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
-from beets.ui import commands
+
# Global logger.
@@ -37,6 +37,9 @@ COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
 BREAK_RE = re.compile(r'<br\s*/?>')
+
+# Utilities.
+
def fetch_url(url):
"""Retrieve the content at a given URL, or return None if the source
is unreachable.
@@ -92,10 +95,10 @@ def extract_text(html, starttag):
lyrics = ''.join(parts)
return strip_cruft(lyrics)
-
def strip_cruft(lyrics, wscollapse=True):
- """Clean up lyrics"""
- # Strip cruft.
+ """Clean up HTML from an extracted lyrics string. For example,
+ tags are replaced with newlines.
+ """
lyrics = COMMENT_RE.sub('', lyrics)
lyrics = unescape(lyrics)
if wscollapse:
@@ -117,9 +120,8 @@ def _encode(s):
s = s.encode('utf8', 'ignore')
return urllib.quote(s)
-#
-# Wikia db
-#
+
+# LyricsWiki.
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
@@ -142,9 +144,8 @@ def fetch_lyricswiki(artist, title):
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
-#
-# Lyrics.com db
-#
+
+# Lyrics.com.
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
@@ -173,89 +174,77 @@ def fetch_lyricscom(artist, title):
if parts:
return parts[0]
-#
-# Google engine
-#
-def slugify(text, jokerChar=False, spaceChar=' '):
- """
- Normalizes string, removes non-alpha characters
- Found at
- http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename\
- -in-python
- """
+# Optional Google custom search API backend.
+def slugify(text):
+ """Normalize a string and remove non-alphanumeric characters.
+ """
+ # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-
+ # filename-in-python
try:
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
- text = unicode(re.sub('[-\s]+', spaceChar, text))
-
- if jokerChar is not False:
- text = unicode(re.sub('[^\w\s]', jokerChar, text))
-
+ text = unicode(re.sub('[-\s]+', ' ', text))
except UnicodeDecodeError:
log.exception("Failing to normalize '%s'" % (text))
-
return urllib.quote(text)
-
def is_page_candidate(urlLink, urlTitle, title, artist):
- """Return True if the url title makes it a good candidate to be a
- page that contains lyrics of title by artist """
-
+ """Return True if the URL title makes it a good candidate to be a
+ page that contains lyrics of title by artist.
+ """
title = slugify(title.lower())
artist = slugify(artist.lower())
urlLink = slugify(urlLink.lower())
urlTitle = slugify(urlTitle.lower())
- # Check if url title contains song title (exact match)
+ # Check if URL title contains song title (exact match)
if urlTitle.find(title) != -1:
return True
- # or try extracting song title from url title and check if
+ # or try extracting song title from URL title and check if
# they are close enough
- songTitle = urlTitle.replace('lyrics', '')\
- .replace(artist, '').strip('%20')
- if len(songTitle):
+ songTitle = urlTitle.replace('lyrics', '') \
+ .replace(artist, '').strip('%20')
+ if songTitle:
log.debug("Match ratio of '%s' with title: %s" %
- (songTitle, difflib.SequenceMatcher
- (None, songTitle, title).ratio()))
+ (songTitle,
+ difflib.SequenceMatcher(None, songTitle, title).ratio()))
typoRatio = .8
return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
-
def insert_line_feeds(text):
- """Insert \n before upcased characters"""
-
+ """Insert newlines before upper-case characters.
+ """
tokensStr = re.split("([a-z][A-Z])", text)
for idx in range(1, len(tokensStr), 2):
ltoken = list(tokensStr[idx])
tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
return ''.join(tokensStr)
-
def decimate_line_feeds(text):
- """Decimate \n characters. By default une only one \n as eol marker. Keep
- at most two \n in a row (eg. to separate verses)."""
-
- # Remove first occurence of \n for each sequence of \n
+ """Decimate newline characters. By default use only one newline as
+ an end-of-line marker. Keep at most two newlines in a row (e.g., to
+ separate verses).
+ """
+ # Remove first occurrence of \n for each sequence of \n
text = re.sub(r'\n(\n+)', '\g<1>', text)
# Keep at most two \n in a row
text = re.sub(r'\n\n+', '\n\n', text)
return text.strip('\n')
-
-def sanetize_lyrics(text):
- """Clean text, returning raw lyrics as output or None if it happens that
- input text is actually not lyrics content. Clean (x)html tags in text,
- correct layout and syntax ..."""
-
+def sanitize_lyrics(text):
+ """Clean text, returning raw lyrics as output or None if it happens
+ that input text is actually not lyrics content. Clean (x)html tags
+ in text, correct layout and syntax...
+ """
text = strip_cruft(text, False)
# Restore \n in input text
- if text.find('\n') == -1:
+ if '\n' not in text:
text = insert_line_feeds(text)
- # Supress advertisements regexps
+ # Suppress advertisements.
textLines = text.splitlines(True)
# Match lines with an opening bracket but no ending one, ie lines that
# contained html link that has been wiped out when scraping.
@@ -263,14 +252,13 @@ def sanetize_lyrics(text):
# Match lines containing url between brackets
reAdTxt = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
for line in textLines:
- if (re.match(reAdHtml, line) != None) or \
- (re.match(reAdTxt, line) != None):
+ if re.match(reAdHtml, line) or re.match(reAdTxt, line):
textLines.remove(line)
# \n might have been duplicated during the scraping.
- # decimate \n while number of \n represent more than half the number of
+ # decimate \n while number of \n represent more than half the number of
# lines
- while len([x for x in textLines if x=='\n']) >= (len(textLines)/2 - 1):
+ while len([x for x in textLines if x == '\n']) >= len(textLines) / 2 - 1:
if len(textLines) <= 3:
break
text = ''.join(textLines)
@@ -279,96 +267,91 @@ def sanetize_lyrics(text):
return ''.join(textLines)
-
-def is_lyrics_accepted(text, artist):
- """Returns True if text is considered as valid lyrics"""
-
+def is_lyrics(text, artist):
+ """Determine whether the text seems to be valid lyrics.
+ """
badTriggers = []
nbLines = text.count('\n')
if nbLines <= 1:
log.debug("Ignoring too short lyrics '%s'" % text)
return 0
- elif nbLines < 5 :
+ elif nbLines < 5:
badTriggers.append('too_short')
- for item in [artist, 'lyrics', 'copyright', 'property']:
+ for item in artist, 'lyrics', 'copyright', 'property':
badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))
-
- if len(badTriggers) :
- log.debug('Bad triggers detected : %s' % badTriggers)
+
+ if badTriggers:
+ log.debug('Bad triggers detected: %s' % badTriggers)
return len(badTriggers) < 2
-
def scrape_lyrics_from_url(url):
- """Scrape lyrics from url"""
-
+ """Scrape lyrics from a URL. If no lyrics can be found, return None
+ instead.
+ """
from bs4 import BeautifulSoup, Tag
- print (url)
- html = fetch_url(url)
+ html = fetch_url(url)
soup = BeautifulSoup(html)
- # Simplify the code by replacing some markers by the
marker + # Simplify the code by replacing some markers by the
marker try: - for tag in soup.findAll(['center','blockquote']): + for tag in soup.findAll(['center', 'blockquote']): pTag = Tag(soup, "p") pTag.contents = tag.contents tag.replaceWith(pTag) for tag in soup.findAll(['script', 'a', 'font']): - tag.replaceWith('
') + tag.replaceWith('
') except Exception, e: - log.debug('Error %s when replacing containing marker by p marker' % e, \ + log.debug('Error %s when replacing containing marker by p marker' % e, exc_info=True) - + for tag in soup.findAll('br'): tag.replaceWith('\n') - + # Keep only tags that can possibly be parent tags and eol for tag in soup.findAll(True): containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br'] if tag.name not in containers: tag.extract() - # Make better soup from current soup! The previous unclosed
sections are - # now closed. Use str() rather than prettify() as it's more conservative - # concerning EOL + # Make better soup from current soup! The previous unclosed
sections + # are now closed. Use str() rather than prettify() as it's more + # conservative concerning EOL soup = BeautifulSoup(str(soup)) # In case lyrics are nested in no markup but
# Insert the whole body in abodyTag = soup.find('body') - if bodyTag != None: + if bodyTag: pTag = soup.new_tag("p") bodyTag.parent.insert(0, pTag) pTag.insert(0, bodyTag) - tagTokens = [] for tag in soup.findAll('p'): soup2 = BeautifulSoup(str(tag)) - tagTokens += soup2.findAll(text=True) # Extract all text of
section + # Extract all text of
section. + tagTokens += soup2.findAll(text=True) - if tagTokens != []: + if tagTokens: # Lyrics are expected to be the longest paragraph tagTokens = sorted(tagTokens, key=len, reverse=True) soup = BeautifulSoup(tagTokens[0]) - if soup.findAll(['div', 'a']) != []: + if soup.findAll(['div', 'a']): return None return unescape(tagTokens[0].strip("\n\r: ")) - return None - - - def fetch_google(artist, title): - """Fetch lyrics from google results""" - - QUERY = u"%s %s" % (artist, title) + """Fetch lyrics from Google search results. + """ + query = u"%s %s" % (artist, title) + api_key = config['lyrics']['google_API_key'].get(unicode) + engine_id = config['lyrics']['google_engine_ID'].get(unicode) url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \ - (options['google_API_key'], options['google_engine_ID'], \ - urllib.quote(QUERY.encode('utf8'))) + (api_key, engine_id, urllib.quote(query.encode('utf8'))) data = urllib.urlopen(url) data = json.load(data) @@ -376,54 +359,25 @@ def fetch_google(artist, title): reason = data['error']['errors'][0]['reason'] log.debug(u'google lyrics backend error: %s' % reason) return None - + if 'items' in data.keys(): for item in data['items']: - urlLink = item['link'] + urlLink = item['link'] urlTitle = item['title'] if not is_page_candidate(urlLink, urlTitle, title, artist): continue lyrics = scrape_lyrics_from_url(urlLink) - if (lyrics == None or len(lyrics)== 0): + if not lyrics: continue - lyrics = sanetize_lyrics(lyrics) + lyrics = sanitize_lyrics(lyrics) - if is_lyrics_accepted(lyrics, artist): + if is_lyrics(lyrics, artist): return lyrics -# Lyrics scrapers. 
- -def get_lyrics(artist, title): - """Fetch lyrics, trying each source in turn.""" - - # Remove featuring artists from search - pattern = u"(.*) feat(uring|\.)?\s\S+" - artist_nofeat = re.findall(re.compile(pattern,re.IGNORECASE), artist) - if artist_nofeat: - artist = artist_nofeat[0][0] - - for backend in BACKENDS: - lyrics = backend(artist, title) - if lyrics: - if isinstance(lyrics, str): - lyrics = lyrics.decode('utf8', 'ignore') - log.debug(u'got lyrics from backend: {0}'.format(backend.__name__)) - return lyrics - # Plugin logic. -BACKENDS = [fetch_lyricswiki, fetch_lyricscom] - -options = { - 'google_API_key': None, - 'google_engine_ID': None, -} -def init_google_search(google_API_key, google_engine_ID): - options['google_API_key'] = google_API_key - options['google_engine_ID'] = google_engine_ID - class LyricsPlugin(BeetsPlugin): def __init__(self): super(LyricsPlugin, self).__init__() @@ -432,13 +386,13 @@ class LyricsPlugin(BeetsPlugin): 'auto': True, 'google_API_key': None, 'google_engine_ID': u'009217259823014548361:lndtuqkycfu', - 'fallback':False + 'fallback': None, }) - + + self.backends = [fetch_lyricswiki, fetch_lyricscom] + if self.config['google_API_key'].get(): - init_google_search(self.config['google_API_key'].get(), - self.config['google_engine_ID'].get()) - BACKENDS.insert(0, fetch_google) + self.backends.insert(0, fetch_google) def commands(self): cmd = ui.Subcommand('lyrics', help='fetch song lyrics') @@ -450,26 +404,26 @@ class LyricsPlugin(BeetsPlugin): # import_write config value. write = config['import']['write'].get(bool) for item in lib.items(ui.decargs(args)): - fetch_item_lyrics(lib, logging.INFO, item, write) + self.fetch_item_lyrics(lib, logging.INFO, item, write) if opts.printlyr and item.lyrics: ui.print_(item.lyrics) cmd.func = func return [cmd] - # Auto-fetch lyrics on import. 
def imported(self, session, task): if self.config['auto']: for item in task.imported_items(): self.fetch_item_lyrics(session.lib, logging.DEBUG, item, False) - def fetch_item_lyrics(self, lib, loglevel, item, write): """Fetch and store lyrics for a single item. If ``write``, then the lyrics will also be written to the file itself. The ``loglevel`` parameter controls the visibility of the function's status log messages. """ + fallback = self.config['fallback'].get() + # Skip if the item already has lyrics. if item.lyrics: log.log(loglevel, u'lyrics already present: %s - %s' % @@ -477,18 +431,39 @@ class LyricsPlugin(BeetsPlugin): return # Fetch lyrics. - lyrics = get_lyrics(item.artist, item.title) + lyrics = self.get_lyrics(item.artist, item.title) if not lyrics: log.log(loglevel, u'lyrics not found: %s - %s' % (item.artist, item.title)) - if self.config['fallback'].get(): - lyrics = self.config['fallback'].get() + if fallback: + lyrics = fallback else: return else: log.log(loglevel, u'fetched lyrics: %s - %s' % (item.artist, item.title)) + item.lyrics = lyrics if write: item.write() - lib.store(item) \ No newline at end of file + lib.store(item) + + def get_lyrics(self, artist, title): + """Fetch lyrics, trying each source in turn. Return a string or + None if no lyrics were found. + """ + # Remove featuring artists from search. + pattern = u"(.*) feat(uring|\.)?\s\S+" + match = re.search(pattern, artist, re.IGNORECASE) + if match: + artist = match.group(0) + + for backend in self.backends: + lyrics = backend(artist, title) + if lyrics: + if isinstance(lyrics, str): + lyrics = lyrics.decode('utf8', 'ignore') + log.debug(u'got lyrics from backend: {0}'.format( + backend.__name__ + )) + return lyrics