Merge pull request #243 from KraYmer/master

Add a lyrics backend that scrapes results from google custom search api
Adrian Sampson 2013-04-15 10:19:31 -07:00
commit 1622dcefb7
2 changed files with 329 additions and 35 deletions

beetsplug/lyrics.py

@@ -16,24 +16,24 @@
"""
from __future__ import print_function
import re
import logging
import urllib
import json
import unicodedata
import difflib
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
from beets.ui import commands
# Global logger.
log = logging.getLogger('beets')
# Lyrics scrapers.
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
DIV_RE = re.compile(r'<(/?)div>?')
TAG_RE = re.compile(r'<[^>]*>')
BREAK_RE = re.compile(r'<br\s*/?>')
@@ -90,16 +90,20 @@ def extract_text(html, starttag):
        print('no closing tag found!')
        return
    lyrics = ''.join(parts)
    return strip_cruft(lyrics)

def strip_cruft(lyrics, wscollapse=True):
    """Clean up lyrics."""
    lyrics = COMMENT_RE.sub('', lyrics)
    lyrics = unescape(lyrics)
    if wscollapse:
        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
    lyrics = re.sub(r'\n +', '\n', lyrics)
    lyrics = re.sub(r' +\n', '\n', lyrics)
    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.
    lyrics = lyrics.replace('\r', '\n')
    lyrics = lyrics.strip()
    return lyrics
@@ -113,6 +117,10 @@ def _encode(s):
        s = s.encode('utf8', 'ignore')
    return urllib.quote(s)
#
# Wikia db
#
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
    s = re.sub(r'\s+', '_', s)
@@ -122,6 +130,7 @@ def _lw_encode(s):
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    return _encode(s)
def fetch_lyricswiki(artist, title):
"""Fetch lyrics from LyricsWiki."""
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
@ -133,6 +142,10 @@ def fetch_lyricswiki(artist, title):
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
#
# Lyrics.com db
#
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
LYRICSCOM_NOT_FOUND = (
    'Sorry, we do not have the lyric',
@@ -141,6 +154,7 @@ LYRICSCOM_NOT_FOUND = (
def _lc_encode(s):
    s = re.sub(r'\s+', '-', s)
    return _encode(s)
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
@ -159,9 +173,236 @@ def fetch_lyricscom(artist, title):
if parts:
return parts[0]
#
# Google engine
#
def slugify(text, jokerChar=False, spaceChar=' '):
    """Normalize a string and remove non-alpha characters.
    Found at http://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename-in-python
    """
    try:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = unicode(re.sub('[-\s]+', spaceChar, text))
        if jokerChar is not False:
            text = unicode(re.sub('[^\w\s]', jokerChar, text))
    except UnicodeDecodeError:
        log.exception("Failed to normalize '%s'" % text)
    return urllib.quote(text)
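# For example, slugify strips accents and URL-quotes the whitespace:
#   slugify(u'Mötley Crüe')  # -> 'Motley%20Crue'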
def is_page_candidate(urlLink, urlTitle, title, artist):
    """Return True if the URL title makes it a good candidate for a page
    containing the lyrics of title by artist."""
    title = slugify(title.lower())
    artist = slugify(artist.lower())
    urlLink = slugify(urlLink.lower())
    urlTitle = slugify(urlTitle.lower())
    # Check if the URL title contains the song title (exact match).
    if urlTitle.find(title) != -1:
        return True
    # Or try extracting the song title from the URL title and check whether
    # the two are close enough.
    songTitle = urlTitle.replace('lyrics', '').replace(artist, '').strip('%20')
    if len(songTitle):
        log.debug("Match ratio of '%s' with title: %s" %
                  (songTitle,
                   difflib.SequenceMatcher(None, songTitle, title).ratio()))
    typoRatio = .8
    return difflib.SequenceMatcher(None, songTitle, title).ratio() > typoRatio
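# For example, a result titled 'Adele - Hello Lyrics' passes the exact-match
# branch for title='Hello', artist='Adele', while a misspelled
# 'Adele - Helo Lyrics' can still pass the fuzzy branch, since
# SequenceMatcher(None, 'helo', 'hello').ratio() is about 0.89 > typoRatio.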
def insert_line_feeds(text):
    """Insert a \\n before each upper-case character that directly follows a
    lower-case one."""
    tokensStr = re.split("([a-z][A-Z])", text)
    for idx in range(1, len(tokensStr), 2):
        ltoken = list(tokensStr[idx])
        tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
    return ''.join(tokensStr)
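# For example: insert_line_feeds('my heartGoes on') -> 'my heart\nGoes on'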
def decimate_line_feeds(text):
    """Decimate \\n characters: use a single \\n as the end-of-line marker
    and keep at most two \\n in a row (e.g. to separate verses)."""
    # Remove the first \n of each sequence of consecutive \n.
    text = re.sub(r'\n(\n+)', '\g<1>', text)
    # Keep at most two \n in a row.
    text = re.sub(r'\n\n+', '\n\n', text)
    return text.strip('\n')
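# For example, decimate_line_feeds('a\n\n\n\nb') returns 'a\n\nb': each run
# of linefeeds loses one, and any remaining run is then capped at two.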
def sanetize_lyrics(text):
    """Clean text, returning raw lyrics as output or None if it turns out
    that the input text is not lyrics content. Cleans (x)html tags in the
    text and corrects layout and syntax."""
    text = strip_cruft(text, False)
    # Restore \n in input text.
    if text.find('\n') == -1:
        text = insert_line_feeds(text)
    # Suppress advertisement lines.
    textLines = text.splitlines(True)
    # Match lines with an opening bracket but no closing one, i.e. lines that
    # contained an html link that was wiped out during scraping.
    reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
    # Match lines containing a url between brackets.
    reAdTxt = re.compile(r'(\(|\[).*(http|www).*(\]|\))')
    textLines = [line for line in textLines
                 if not reAdHtml.match(line) and not reAdTxt.match(line)]
    # \n might have been duplicated during the scraping: decimate \n while
    # the number of \n represents more than half the number of lines.
    while len([x for x in textLines if x == '\n']) >= (len(textLines) / 2 - 1):
        if len(textLines) <= 3:
            break
        text = ''.join(textLines)
        text = decimate_line_feeds(text)
        textLines = [line.strip(' ') for line in text.splitlines(True)]
    return ''.join(textLines)
def is_lyrics_accepted(text, artist):
    """Return True if text is considered to be valid lyrics."""
    badTriggers = []
    nbLines = text.count('\n')
    if nbLines <= 1:
        log.debug("Ignoring too short lyrics '%s'" % text)
        return False
    elif nbLines < 5:
        badTriggers.append('too_short')
    for item in [artist, 'lyrics', 'copyright', 'property']:
        badTriggers += [item] * len(re.findall(r'\W%s\W' % item, text, re.I))
    if badTriggers:
        log.debug('Bad triggers detected: %s' % badTriggers)
    return len(badTriggers) < 2
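# For example, a text with only three line breaks already carries the
# 'too_short' trigger, so a single occurrence of 'copyright' in it makes two
# triggers and the candidate lyrics are rejected.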
def scrape_lyrics_from_url(url):
    """Scrape lyrics from a url."""
    from bs4 import BeautifulSoup

    log.debug(u'fetching lyrics from %s' % url)
    html = fetch_url(url)
    soup = BeautifulSoup(html)
    # Simplify the code by replacing some markers with the <p> marker.
    try:
        for tag in soup.findAll(['center', 'blockquote']):
            pTag = soup.new_tag('p')
            pTag.contents = tag.contents
            tag.replaceWith(pTag)
        for tag in soup.findAll(['script', 'a', 'font']):
            tag.replaceWith('<p>')
    except Exception as e:
        log.debug('Error %s when replacing containing marker by p marker' % e,
                  exc_info=True)
    for tag in soup.findAll('br'):
        tag.replaceWith('\n')
    # Keep only tags that can possibly be parent tags, plus eol.
    containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
    for tag in soup.findAll(True):
        if tag.name not in containers:
            tag.extract()
    # Make better soup from the current soup! The previously unclosed <p>
    # sections are now closed. Use str() rather than prettify() as it is more
    # conservative concerning EOL.
    soup = BeautifulSoup(str(soup))
    # In case the lyrics are nested in no markup but <body>, insert the whole
    # body in a <p>.
    bodyTag = soup.find('body')
    if bodyTag is not None:
        pTag = soup.new_tag("p")
        bodyTag.parent.insert(0, pTag)
        pTag.insert(0, bodyTag)
    tagTokens = []
    for tag in soup.findAll('p'):
        soup2 = BeautifulSoup(str(tag))
        # Extract all text of the <p> section.
        tagTokens += soup2.findAll(text=True)
    if tagTokens:
        # Lyrics are expected to be the longest paragraph.
        tagTokens = sorted(tagTokens, key=len, reverse=True)
        soup = BeautifulSoup(tagTokens[0])
        if soup.findAll(['div', 'a']):
            return None
        return unescape(tagTokens[0].strip("\n\r: "))
    return None
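# For example, for a page whose <body> holds only '<div>Verse one<br>Verse
# two</div>', the longest paragraph-level text run, 'Verse one\nVerse two',
# is returned with all markup stripped.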
def fetch_google(artist, title):
    """Fetch lyrics from Google search results."""
    query = u"%s %s" % (artist, title)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
          (options['google_API_key'], options['google_engine_ID'],
           urllib.quote(query.encode('utf8')))
    data = urllib.urlopen(url)
    data = json.load(data)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        log.debug(u'google lyrics backend error: %s' % reason)
        return None
    if 'items' in data:
        for item in data['items']:
            urlLink = item['link']
            urlTitle = item['title']
            if not is_page_candidate(urlLink, urlTitle, title, artist):
                continue
            lyrics = scrape_lyrics_from_url(urlLink)
            if not lyrics:
                continue
            lyrics = sanetize_lyrics(lyrics)
            if is_lyrics_accepted(lyrics, artist):
                return lyrics
# Lyrics scrapers.
def get_lyrics(artist, title):
    """Fetch lyrics, trying each source in turn."""
    # Remove featuring artists from the search string.
    pattern = u"(.*) feat(uring|\.)?\s\S+"
    artist_nofeat = re.findall(re.compile(pattern, re.IGNORECASE), artist)
    if artist_nofeat:
        artist = artist_nofeat[0][0]
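    # For example, the pattern above reduces 'Daft Punk feat. Pharrell
    # Williams' to 'Daft Punk' before the backends are queried.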
    for backend in BACKENDS:
        lyrics = backend(artist, title)
        if lyrics:
@@ -173,40 +414,31 @@ def get_lyrics(artist, title):
# Plugin logic.

BACKENDS = [fetch_lyricswiki, fetch_lyricscom]

options = {
    'google_API_key': None,
    'google_engine_ID': None,
}

def init_google_search(google_API_key, google_engine_ID):
    options['google_API_key'] = google_API_key
    options['google_engine_ID'] = google_engine_ID
class LyricsPlugin(BeetsPlugin):
    def __init__(self):
        super(LyricsPlugin, self).__init__()
        self.import_stages = [self.imported]
        self.config.add({
            'auto': True,
            'google_API_key': None,
            'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
            'fallback': False,
        })
        if self.config['google_API_key'].get():
            init_google_search(self.config['google_API_key'].get(),
                               self.config['google_engine_ID'].get())
            BACKENDS.insert(0, fetch_google)
    def commands(self):
        cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@@ -224,8 +456,39 @@ class LyricsPlugin(BeetsPlugin):
        cmd.func = func
        return [cmd]
    # Auto-fetch lyrics on import.
    def imported(self, session, task):
        if self.config['auto']:
            for item in task.imported_items():
                self.fetch_item_lyrics(session.lib, logging.DEBUG, item,
                                       False)

    def fetch_item_lyrics(self, lib, loglevel, item, write):
        """Fetch and store lyrics for a single item. If ``write``, then the
        lyrics will also be written to the file itself. The ``loglevel``
        parameter controls the visibility of the function's status log
        messages.
        """
        # Skip if the item already has lyrics.
        if item.lyrics:
            log.log(loglevel, u'lyrics already present: %s - %s' %
                    (item.artist, item.title))
            return

        # Fetch lyrics.
        lyrics = get_lyrics(item.artist, item.title)
        if not lyrics:
            log.log(loglevel, u'lyrics not found: %s - %s' %
                    (item.artist, item.title))
            if self.config['fallback'].get():
                lyrics = self.config['fallback'].get()
            else:
                return
        else:
            log.log(loglevel, u'fetched lyrics: %s - %s' %
                    (item.artist, item.title))

        item.lyrics = lyrics
        if write:
            item.write()
        lib.store(item)

docs/plugins/lyrics.rst

@@ -7,6 +7,13 @@ Namely, the current version of the plugin uses `Lyric Wiki`_ and `Lyrics.com`_.
.. _Lyric Wiki: http://lyrics.wikia.com/
.. _Lyrics.com: http://www.lyrics.com/
See :ref:`activate-google-custom-search` to expand the plugin's firepower by
using Google search to harvest lyrics from your own list of websites.
By default, if no lyrics are found, the file will be left unchanged. To
specify a placeholder for the lyrics tags when none are found, use the
``fallback`` configuration option::

    lyrics:
        fallback: 'No lyrics found'
Fetch Lyrics During Import
--------------------------
@@ -42,3 +49,27 @@ automatic lyrics fetching during import. To do so, add this to your

    lyrics:
        auto: no
.. _activate-google-custom-search:
Activate Google custom search
------------------------------
Using the Google backend requires `beautifulsoup`_, which you can install
using `pip`_ by typing::

    pip install beautifulsoup4
To activate Google custom search, you must first register an API key on
https://code.google.com/apis/console. Then click *API Access* and use that
key for the ``google_API_key`` plugin option.

Optionally, you can define a custom search engine on
http://www.google.com/cse/all. Click the *Search engine ID* button to display
the token to copy into the ``google_engine_ID`` option.
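For reference, each lookup the plugin performs is a request against the JSON
Custom Search API of roughly this shape (the key and engine ID here stand in
for your own values)::

    https://www.googleapis.com/customsearch/v1?key=<google_API_key>&cx=<google_engine_ID>&q=<artist>%20<title>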
By default, beets uses a list of sources known to be scrapable.
Example of ``config.yaml``::

    lyrics:
        google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
        google_engine_ID: 009217259823014548361:lndtuqkycfu
.. _pip: http://www.pip-installer.org/
.. _beautifulsoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/