diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d2b8bb84a..03ac7d26a 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -18,7 +18,7 @@ from __future__ import print_function
 
 import re
 import logging
-import urllib
+import urllib
 import json
 import unicodedata
 import difflib
@@ -26,7 +26,7 @@ import difflib
 
 from beets.plugins import BeetsPlugin
 from beets import ui
 from beets import config
-from beets.ui import commands
+
 
 # Global logger.
@@ -37,6 +37,9 @@ COMMENT_RE = re.compile(r'<!--.*-->', re.S)
 TAG_RE = re.compile(r'<[^>]*>')
 BREAK_RE = re.compile(r'<br\s*/?>')
 
+
+# Utilities.
+
 def fetch_url(url):
     """Retrieve the content at a given URL, or return None if the
     source is unreachable.
     """
@@ -92,10 +95,10 @@ def extract_text(html, starttag):
     lyrics = ''.join(parts)
     return strip_cruft(lyrics)
 
-
 def strip_cruft(lyrics, wscollapse=True):
-    """Clean up lyrics"""
-    # Strip cruft.
+    """Clean up HTML from an extracted lyrics string. For example,
+    <br> tags are replaced with newlines.
+    """
     lyrics = COMMENT_RE.sub('', lyrics)
     lyrics = unescape(lyrics)
     if wscollapse:
@@ -117,9 +120,8 @@ def _encode(s):
         s = s.encode('utf8', 'ignore')
     return urllib.quote(s)
 
-#
-# Wikia db
-#
+
+# LyricsWiki.
 LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
 
 def _lw_encode(s):
@@ -142,9 +144,8 @@ def fetch_lyricswiki(artist, title):
     if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
         return lyrics
 
-#
-# Lyrics.com db
-#
+
+# Lyrics.com.
 LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
 
 LYRICSCOM_NOT_FOUND = (
@@ -173,89 +174,77 @@ def fetch_lyricscom(artist, title):
     if parts:
         return parts[0]
 
-#
-# Google engine
-#
-def slugify(text, jokerChar=False, spaceChar=' '):
-    """
-    Normalizes string, removes non-alpha characters
-    Found at
-    http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename\
-    -in-python
-    """
+# Optional Google custom search API backend.
+def slugify(text):
+    """Normalize a string and remove non-alphanumeric characters.
+    """
+    # http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-
+    # filename-in-python
     try:
         text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
-        text = unicode(re.sub('[-\s]+', spaceChar, text))
-
-        if jokerChar is not False:
-            text = unicode(re.sub('[^\w\s]', jokerChar, text))
-
+        text = unicode(re.sub('[-\s]+', ' ', text))
     except UnicodeDecodeError:
         log.exception("Failing to normalize '%s'" % (text))
-
     return urllib.quote(text)
 
-
 def is_page_candidate(urlLink, urlTitle, title, artist):
-    """Return True if the url title makes it a good candidate to be a
-    page that contains lyrics of title by artist """
-
+    """Return True if the URL title makes it a good candidate to be a
+    page that contains lyrics of title by artist.
+    """
     title = slugify(title.lower())
     artist = slugify(artist.lower())
     urlLink = slugify(urlLink.lower())
     urlTitle = slugify(urlTitle.lower())
-    # Check if url title contains song title (exact match)
+    # Check if URL title contains song title (exact match)
     if urlTitle.find(title) != -1:
         return True
-    # or try extracting song title from url title and check if
+    # or try extracting song title from URL title and check if
     # they are close enough
-    songTitle = urlTitle.replace('lyrics', '')\
-                .replace(artist, '').strip('%20')
-    if len(songTitle):
+    songTitle = urlTitle.replace('lyrics', '') \
+                .replace(artist, '').strip('%20')
+    if songTitle:
         log.debug("Match ratio of '%s' with title: %s" %
-                  (songTitle, difflib.SequenceMatcher
-                  (None, songTitle, title).ratio()))
+                  (songTitle,
+                   difflib.SequenceMatcher(None, songTitle, title).ratio()))
     typoRatio = .8
     return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
 
-
 def insert_line_feeds(text):
-    """Insert \n before upcased characters"""
-
+    """Insert newlines before upper-case characters.
+    """
     tokensStr = re.split("([a-z][A-Z])", text)
     for idx in range(1, len(tokensStr), 2):
         ltoken = list(tokensStr[idx])
         tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
     return ''.join(tokensStr)
 
-
 def decimate_line_feeds(text):
-    """Decimate \n characters. By default une only one \n as eol marker. Keep
-    at most two \n in a row (eg. to separate verses)."""
-
-    # Remove first occurence of \n for each sequence of \n
+    """Decimate newline characters. By default use only one newline as
+    an end-of-line marker. Keep at most two newlines in a row (e.g., to
+    separate verses).
+ """ + # Remove first occurrence of \n for each sequence of \n text = re.sub(r'\n(\n+)', '\g<1>', text) # Keep at most two \n in a row text = re.sub(r'\n\n+', '\n\n', text) return text.strip('\n') - -def sanetize_lyrics(text): - """Clean text, returning raw lyrics as output or None if it happens that - input text is actually not lyrics content. Clean (x)html tags in text, - correct layout and syntax ...""" - +def sanitize_lyrics(text): + """Clean text, returning raw lyrics as output or None if it happens + that input text is actually not lyrics content. Clean (x)html tags + in text, correct layout and syntax... + """ text = strip_cruft(text, False) # Restore \n in input text - if text.find('\n') == -1: + if '\n' not in text: text = insert_line_feeds(text) - # Supress advertisements regexps + # Suppress advertisements. textLines = text.splitlines(True) # Match lines with an opening bracket but no ending one, ie lines that # contained html link that has been wiped out when scraping. @@ -263,14 +252,13 @@ def sanetize_lyrics(text): # Match lines containing url between brackets reAdTxt = re.compile(r'(\(|\[).*[http|www].*(\]|\))') for line in textLines: - if (re.match(reAdHtml, line) != None) or \ - (re.match(reAdTxt, line) != None): + if re.match(reAdHtml, line) or re.match(reAdTxt, line): textLines.remove(line) # \n might have been duplicated during the scraping. - # decimate \n while number of \n represent more than half the number of + # decimate \n while number of \n represent more than half the number of # lines - while len([x for x in textLines if x=='\n']) >= (len(textLines)/2 - 1): + while len([x for x in textLines if x == '\n']) >= len(textLines) / 2 - 1: if len(textLines) <= 3: break text = ''.join(textLines) @@ -279,96 +267,91 @@ def sanetize_lyrics(text): return ''.join(textLines) - -def is_lyrics_accepted(text, artist): - """Returns True if text is considered as valid lyrics""" - +def is_lyrics(text, artist): + """Determine whether the text seems to be valid lyrics. + """ badTriggers = [] nbLines = text.count('\n') if nbLines <= 1: log.debug("Ignoring too short lyrics '%s'" % text) return 0 - elif nbLines < 5 : + elif nbLines < 5: badTriggers.append('too_short') - for item in [artist, 'lyrics', 'copyright', 'property']: + for item in artist, 'lyrics', 'copyright', 'property': badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I)) - - if len(badTriggers) : - log.debug('Bad triggers detected : %s' % badTriggers) + + if badTriggers: + log.debug('Bad triggers detected: %s' % badTriggers) return len(badTriggers) < 2 - def scrape_lyrics_from_url(url): - """Scrape lyrics from url""" - + """Scrape lyrics from a URL. If no lyrics can be found, return None + instead. + """ from bs4 import BeautifulSoup, Tag - print (url) - html = fetch_url(url) + html = fetch_url(url) soup = BeautifulSoup(html) - # Simplify the code by replacing some markers by the
+    # Simplify the code by replacing some markers by the <p> </p> marker
     try:
-        for tag in soup.findAll(['center','blockquote']):
+        for tag in soup.findAll(['center', 'blockquote']):
             pTag = Tag(soup, "p")
             pTag.contents = tag.contents
             tag.replaceWith(pTag)
         for tag in soup.findAll(['script', 'a', 'font']):
-            tag.replaceWith('<p> </p>')
+            tag.replaceWith('<p> </p>')
     except Exception, e:
-        log.debug('Error %s when replacing containing marker by p marker' % e, \
+        log.debug('Error %s when replacing containing marker by p marker' % e,
                   exc_info=True)
-
+
     for tag in soup.findAll('br'):
         tag.replaceWith('\n')
-
+
     # Keep only tags that can possibly be parent tags and eol
     for tag in soup.findAll(True):
         containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
         if tag.name not in containers:
             tag.extract()
 
-    # Make better soup from current soup! The previous unclosed <p> </p> sections are
-    # now closed. Use str() rather than prettify() as it's more conservative
-    # concerning EOL
+    # Make better soup from current soup! The previous unclosed <p> </p> sections
+    # are now closed. Use str() rather than prettify() as it's more
+    # conservative concerning EOL
     soup = BeautifulSoup(str(soup))
 
     # In case lyrics are nested in no markup but <body>
     # Insert the whole body in a <p> </p>
     bodyTag = soup.find('body')
-    if bodyTag != None:
+    if bodyTag:
         pTag = soup.new_tag("p")
         bodyTag.parent.insert(0, pTag)
         pTag.insert(0, bodyTag)
 
-
     tagTokens = []
     for tag in soup.findAll('p'):
         soup2 = BeautifulSoup(str(tag))
-        tagTokens += soup2.findAll(text=True) # Extract all text of <p> </p> section
+        # Extract all text of <p> </p> section.
+        tagTokens += soup2.findAll(text=True)
 
-    if tagTokens != []:
+    if tagTokens:
         # Lyrics are expected to be the longest paragraph
         tagTokens = sorted(tagTokens, key=len, reverse=True)
         soup = BeautifulSoup(tagTokens[0])
-        if soup.findAll(['div', 'a']) != []:
+        if soup.findAll(['div', 'a']):
             return None
         return unescape(tagTokens[0].strip("\n\r: "))
-    return None
-
-
 
 
 def fetch_google(artist, title):
-    """Fetch lyrics from google results"""
-
-    QUERY = u"%s %s" % (artist, title)
+    """Fetch lyrics from Google search results.
+    """
+    query = u"%s %s" % (artist, title)
+    api_key = config['lyrics']['google_API_key'].get(unicode)
+    engine_id = config['lyrics']['google_engine_ID'].get(unicode)
     url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
-        (options['google_API_key'], options['google_engine_ID'], \
-         urllib.quote(QUERY.encode('utf8')))
+        (api_key, engine_id, urllib.quote(query.encode('utf8')))
 
     data = urllib.urlopen(url)
     data = json.load(data)
@@ -376,54 +359,25 @@
         reason = data['error']['errors'][0]['reason']
         log.debug(u'google lyrics backend error: %s' % reason)
         return None
-
+
     if 'items' in data.keys():
         for item in data['items']:
-            urlLink = item['link']
+            urlLink = item['link']
             urlTitle = item['title']
             if not is_page_candidate(urlLink, urlTitle, title, artist):
                 continue
             lyrics = scrape_lyrics_from_url(urlLink)
-            if (lyrics == None or len(lyrics)== 0):
+            if not lyrics:
                 continue
-            lyrics = sanetize_lyrics(lyrics)
+            lyrics = sanitize_lyrics(lyrics)
 
-            if is_lyrics_accepted(lyrics, artist):
+            if is_lyrics(lyrics, artist):
                 return lyrics
 
-# Lyrics scrapers.
-
-def get_lyrics(artist, title):
-    """Fetch lyrics, trying each source in turn."""
-
-    # Remove featuring artists from search
-    pattern = u"(.*) feat(uring|\.)?\s\S+"
-    artist_nofeat = re.findall(re.compile(pattern,re.IGNORECASE), artist)
-    if artist_nofeat:
-        artist = artist_nofeat[0][0]
-
-    for backend in BACKENDS:
-        lyrics = backend(artist, title)
-        if lyrics:
-            if isinstance(lyrics, str):
-                lyrics = lyrics.decode('utf8', 'ignore')
-            log.debug(u'got lyrics from backend: {0}'.format(backend.__name__))
-            return lyrics
-
 
 # Plugin logic.
 
-BACKENDS = [fetch_lyricswiki, fetch_lyricscom]
-
-options = {
-    'google_API_key': None,
-    'google_engine_ID': None,
-}
-def init_google_search(google_API_key, google_engine_ID):
-    options['google_API_key'] = google_API_key
-    options['google_engine_ID'] = google_engine_ID
-
 class LyricsPlugin(BeetsPlugin):
     def __init__(self):
         super(LyricsPlugin, self).__init__()
@@ -432,13 +386,13 @@ class LyricsPlugin(BeetsPlugin):
             'auto': True,
             'google_API_key': None,
             'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
-            'fallback':False
+            'fallback': None,
         })
-
+
+        self.backends = [fetch_lyricswiki, fetch_lyricscom]
+
         if self.config['google_API_key'].get():
-            init_google_search(self.config['google_API_key'].get(),
-                               self.config['google_engine_ID'].get())
-            BACKENDS.insert(0, fetch_google)
+            self.backends.insert(0, fetch_google)
 
     def commands(self):
         cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@@ -450,26 +404,26 @@ class LyricsPlugin(BeetsPlugin):
             # import_write config value.
             write = config['import']['write'].get(bool)
             for item in lib.items(ui.decargs(args)):
-                fetch_item_lyrics(lib, logging.INFO, item, write)
+                self.fetch_item_lyrics(lib, logging.INFO, item, write)
                 if opts.printlyr and item.lyrics:
                     ui.print_(item.lyrics)
 
         cmd.func = func
         return [cmd]
 
-
     # Auto-fetch lyrics on import.
     def imported(self, session, task):
         if self.config['auto']:
             for item in task.imported_items():
                 self.fetch_item_lyrics(session.lib, logging.DEBUG, item, False)
 
-
     def fetch_item_lyrics(self, lib, loglevel, item, write):
         """Fetch and store lyrics for a single item. If ``write``, then the
         lyrics will also be written to the file itself. The ``loglevel``
         parameter controls the visibility of the function's status log
         messages.
         """
+        fallback = self.config['fallback'].get()
+
         # Skip if the item already has lyrics.
         if item.lyrics:
             log.log(loglevel, u'lyrics already present: %s - %s' %
@@ -477,18 +431,39 @@ class LyricsPlugin(BeetsPlugin):
             return
 
         # Fetch lyrics.
-        lyrics = get_lyrics(item.artist, item.title)
+        lyrics = self.get_lyrics(item.artist, item.title)
         if not lyrics:
             log.log(loglevel, u'lyrics not found: %s - %s' %
                     (item.artist, item.title))
-            if self.config['fallback'].get():
-                lyrics = self.config['fallback'].get()
+            if fallback:
+                lyrics = fallback
             else:
                 return
         else:
             log.log(loglevel, u'fetched lyrics: %s - %s' %
                     (item.artist, item.title))
+
         item.lyrics = lyrics
         if write:
             item.write()
-        lib.store(item)
\ No newline at end of file
+        lib.store(item)
+
+    def get_lyrics(self, artist, title):
+        """Fetch lyrics, trying each source in turn. Return a string or
+        None if no lyrics were found.
+        """
+        # Remove featuring artists from search.
+        pattern = u"(.*) feat(uring|\.)?\s\S+"
+        match = re.search(pattern, artist, re.IGNORECASE)
+        if match:
+            artist = match.group(1)
+
+        for backend in self.backends:
+            lyrics = backend(artist, title)
+            if lyrics:
+                if isinstance(lyrics, str):
+                    lyrics = lyrics.decode('utf8', 'ignore')
+                log.debug(u'got lyrics from backend: {0}'.format(
+                    backend.__name__
+                ))
+                return lyrics