Convert lyrics plugin, with OO rewrite of backends

This commit is contained in:
Bruno Cauet 2015-01-06 18:09:18 +01:00
parent 860e7e1483
commit 63041736e3

View file

@ -25,15 +25,10 @@ import difflib
import itertools
from HTMLParser import HTMLParseError
from beets import logging
from beets import plugins
from beets import config, ui
# Global logger.
log = logging.getLogger(__name__)
DIV_RE = re.compile(r'<(/?)div>?', re.I)
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
TAG_RE = re.compile(r'<[^>]*>')
@ -56,21 +51,6 @@ URL_CHARACTERS = {
# Utilities.
def fetch_url(url, timeout=10):
    """Retrieve the content at a given URL, or return None if the source
    is unreachable.

    :param url: address requested with an HTTP GET.
    :param timeout: seconds to wait for the server before giving up.
        Without an explicit timeout ``requests`` may wait forever on a
        stalled connection.
    :returns: the response body as text when the status is 200 OK,
        otherwise None (the failure is logged at debug level).
    """
    try:
        # NOTE(review): verify=False switches off TLS certificate
        # validation. It is kept because existing callers may depend on
        # it -- confirm whether it is still needed.
        r = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts are RequestException subclasses and end up here too.
        log.debug(u'lyrics request failed: {0}', exc)
        return None
    if r.status_code == requests.codes.ok:
        return r.text
    log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
    return None
def unescape(text):
"""Resolves &#xxx; HTML entities (and some others)."""
if isinstance(text, str):
@ -174,131 +154,110 @@ def search_pairs(item):
return itertools.product(artists, multi_titles)
def _encode(s):
    """Percent-quote ``s`` for use in a URL (shared by the LyricsWiki and
    Lyrics.com helpers).

    Unicode input first has every key of URL_CHARACTERS replaced by its
    mapped value and is then encoded to UTF-8, dropping whatever cannot
    be encoded. Any other input is quoted as it is.
    """
    if not isinstance(s, unicode):
        return urllib.quote(s)
    for needle in URL_CHARACTERS:
        s = s.replace(needle, URL_CHARACTERS[needle])
    return urllib.quote(s.encode('utf8', 'ignore'))
class Backend(object):
    def __init__(self, log):
        """Remember the logger this backend reports through.

        :param log: logger object; stored as ``self._log``.
        """
        self._log = log
# Musixmatch
@staticmethod
def _encode(s):
    """Encode the string for inclusion in a URL.

    Unicode input has each key of URL_CHARACTERS replaced by its mapped
    value and is encoded to UTF-8 (unencodable characters are dropped)
    before being percent-quoted. Other input is quoted unchanged.
    """
    if not isinstance(s, unicode):
        return urllib.quote(s)
    for needle in URL_CHARACTERS:
        s = s.replace(needle, URL_CHARACTERS[needle])
    return urllib.quote(s.encode('utf8', 'ignore'))
MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
def build_url(self, artist, title):
    """Fill ``self.URL_PATTERN`` with the encoded artist and title.

    Each value is title-cased and passed through ``self._encode`` before
    substitution; the artist goes into the first placeholder and the
    title into the second.
    """
    encoded_artist = self._encode(artist.title())
    encoded_title = self._encode(title.title())
    return self.URL_PATTERN % (encoded_artist, encoded_title)
def fetch_musixmatch(artist, title):
    """Fetch lyrics from a musiXmatch page.

    The page address is MUSIXMATCH_URL_PATTERN filled with the
    title-cased, ``_lw_encode``-d artist and title. Returns None when the
    page could not be downloaded. Otherwise returns the text found
    between the "lyrics_body" and "lyrics_language" markers, with
    surrounding commas and double quotes stripped and escaped newline
    sequences turned into real line breaks.
    """
    page = fetch_url(MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()),
                                               _lw_encode(title.title())))
    if not page:
        return
    body = extract_text_between(page, '"lyrics_body":', '"lyrics_language":')
    return body.strip(',"').replace('\\n', '\n')
# LyricsWiki.
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
def _lw_encode(s):
    """Rewrite ``s`` the way LyricsWiki page names expect, then URL-encode.

    Runs of whitespace become a single underscore, ``<``, ``>`` and ``#``
    are spelled out, square and curly brackets become parentheses, and
    the result is handed to ``_encode``.
    """
    s = re.sub(r'\s+', '_', s)
    for symbol, spelled in (("<", "Less_Than"),
                            (">", "Greater_Than"),
                            ("#", "Number_")):
        s = s.replace(symbol, spelled)
    s = re.sub(r'[\[\{]', '(', s)
    s = re.sub(r'[\]\}]', ')', s)
    return _encode(s)
def fetch_lyricswiki(artist, title):
    """Fetch lyrics from LyricsWiki.

    Returns the content of the page's ``lyricbox`` div, or None when the
    page cannot be downloaded, the div yields nothing, or the text
    carries the "not licensed" notice.
    """
    page = fetch_url(LYRICSWIKI_URL_PATTERN % (_lw_encode(artist),
                                               _lw_encode(title)))
    if not page:
        return
    text = extract_text_in(page, u"<div class='lyricbox'>")
    if not text or 'Unfortunately, we are not licensed' in text:
        return
    return text
# Lyrics.com.
# Address template for a Lyrics.com song page. fetch_lyricscom fills the
# first placeholder with the encoded title and the second with the
# encoded artist.
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
# Phrases fetch_lyricscom looks for in the extracted text; presumably they
# mark a "no lyrics available" page -- confirm against the site.
LYRICSCOM_NOT_FOUND = (
'Sorry, we do not have the lyric',
'Submit Lyrics',
)
def _lc_encode(s):
    """Turn ``s`` into the lowercase, dash-separated form used in
    Lyrics.com addresses.

    Everything except word characters, whitespace and hyphens is removed,
    whitespace runs become a single hyphen, and the ``_encode``-d result
    is lowercased.
    """
    stripped = re.sub(r'[^\w\s-]', '', s)
    dashed = re.sub(r'\s+', '-', stripped)
    return _encode(dashed).lower()
def fetch_lyricscom(artist, title):
"""Fetch lyrics from Lyrics.com."""
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
html = fetch_url(url)
if not html:
return
lyrics = extract_text_between(html, '<div id="lyrics" class="SCREENONLY" '
'itemprop="description">', '</div>')
if not lyrics:
return
for not_found_str in LYRICSCOM_NOT_FOUND:
if not_found_str in lyrics:
def fetch_url(self, url, timeout=10):
    """Retrieve the content at a given URL, or return None if the source
    is unreachable.

    :param url: address requested with an HTTP GET.
    :param timeout: seconds to wait for the server before giving up.
        Without an explicit timeout ``requests`` may wait forever on a
        stalled connection.
    :returns: the response body as text when the status is 200 OK,
        otherwise None (the failure is logged at debug level).
    """
    try:
        # NOTE(review): verify=False switches off TLS certificate
        # validation. It is kept because existing callers may depend on
        # it -- confirm whether it is still needed.
        r = requests.get(url, verify=False, timeout=timeout)
    except requests.RequestException as exc:
        # Timeouts are RequestException subclasses and end up here too.
        self._log.debug(u'lyrics request failed: {0}', exc)
        return None
    if r.status_code == requests.codes.ok:
        return r.text
    self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
    return None
parts = lyrics.split('\n---\nLyrics powered by', 1)
if parts:
return parts[0]
def fetch(self, artist, title):
    """Look up the lyrics of ``title`` by ``artist``.

    This version only raises NotImplementedError.
    """
    raise NotImplementedError()
# Optional Google custom search API backend.
def slugify(text):
    """Normalize a string into a plain ASCII, underscore-separated form.

    Hyphens, apostrophes, underscores and whitespace all become
    underscores (runs are collapsed and the ends trimmed), parenthesized
    content is removed, and the text is NFKD-normalized and reduced to
    ASCII. If a UnicodeDecodeError occurs during that last step, it is
    logged and the text as it stood at that point is returned.
    """
    text = re.sub(r"[-'_\s]", '_', text)
    text = re.sub(r"_+", '_', text).strip('_')
    # Remove content within parentheses. Raw strings keep the regex
    # escapes from being read as (invalid) string escapes.
    pat = r"([^,\(]*)\((.*?)\)"
    text = re.sub(pat, r'\g<1>', text).strip()
    try:
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = unicode(re.sub(r'[-\s]+', ' ', text))
    except UnicodeDecodeError:
        log.exception(u"Failing to normalize '{0}'", text)
    return text
class SymbolsReplaced(Backend):
    """Backend whose URL encoding spells out or swaps certain symbols
    before applying the base encoding.
    """

    @classmethod
    def _encode(cls, s):
        """Collapse whitespace to underscores, spell out ``<``, ``>`` and
        ``#``, turn square and curly brackets into parentheses, then
        defer to the parent class's ``_encode``.
        """
        s = re.sub(r'\s+', '_', s)
        for symbol, spelled in (("<", "Less_Than"),
                                (">", "Greater_Than"),
                                ("#", "Number_")):
            s = s.replace(symbol, spelled)
        s = re.sub(r'[\[\{]', '(', s)
        s = re.sub(r'[\]\}]', ')', s)
        return super(SymbolsReplaced, cls)._encode(s)
# Words that is_page_candidate joins with the artist ("<word>_<artist>")
# and strips from a page title; presumably "by" in several languages.
BY_TRANS = ['by', 'par', 'de', 'von']
# Words that is_page_candidate strips from a page title; presumably
# "lyrics" in several languages.
LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
class MusiXmatch(SymbolsReplaced):
    """Fetch lyrics from musiXmatch pages."""

    URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'

    def fetch(self, artist, title):
        """Return the lyrics embedded in the song page, or None when the
        page cannot be downloaded.

        The text between the "lyrics_body" and "lyrics_language" markers
        is taken, surrounding commas and double quotes are stripped, and
        escaped newline sequences become real line breaks.
        """
        page = self.fetch_url(self.build_url(artist, title))
        if not page:
            return
        body = extract_text_between(page,
                                    '"lyrics_body":', '"lyrics_language":')
        return body.strip(',"').replace('\\n', '\n')
def is_page_candidate(urlLink, urlTitle, title, artist):
    """Return True if the URL title makes it a good candidate to be a
    page that contains lyrics of title by artist.

    :param urlLink: address of the page; its host name is treated as
        noise when it shows up in the page title.
    :param urlTitle: title of the page.
    :param title: song title being searched for.
    :param artist: artist being searched for.
    :returns: True when the slugified page title contains the slugified
        song title, or when what remains of the page title after removing
        artist, site and "lyrics" words is at least 90% similar to it.
    """
    title = slugify(title.lower())
    artist = slugify(artist.lower())
    # The host is whatever sits between '//' and the next '/'. An address
    # without that shape gives no match; fall back to an empty site name
    # instead of failing on None.group().
    host_match = re.search(u"//([^/]+)/.*", slugify(urlLink.lower()))
    sitename = host_match.group(1) if host_match else u''
    urlTitle = slugify(urlTitle.lower())
    # Check if URL title contains song title (exact match)
    if urlTitle.find(title) != -1:
        return True
    # or try extracting song title from URL title and check if
    # they are close enough
    tokens = [by + '_' + artist for by in BY_TRANS] + \
        [artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
    # Tokens are literal text, so escape them: an artist such as "[band"
    # would otherwise be an invalid regex and "." in the host would match
    # any character. Empty tokens are dropped because an empty alternative
    # always matches and would shadow every alternative after it.
    pattern = u'|'.join(re.escape(token) for token in tokens if token)
    songTitle = re.sub(u'(%s)' % pattern, u'', urlTitle)
    songTitle = songTitle.strip('_|')
    typoRatio = .9
    return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
class LyricsWiki(SymbolsReplaced):
    """Fetch lyrics from LyricsWiki."""

    URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'

    def fetch(self, artist, title):
        """Return the content of the page's ``lyricbox`` div, or None
        when the page cannot be downloaded, the div yields nothing, or
        the text carries the "not licensed" notice.
        """
        page = self.fetch_url(self.build_url(artist, title))
        if not page:
            return
        text = extract_text_in(page, u"<div class='lyricbox'>")
        if not text or 'Unfortunately, we are not licensed' in text:
            return
        return text
class LyricsCom(Backend):
    """Fetch lyrics from Lyrics.com."""

    # First placeholder takes the song title, second the artist.
    URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
    # Phrases whose presence in the extracted text makes fetch() give up.
    NOT_FOUND = (
        'Sorry, we do not have the lyric',
        'Submit Lyrics',
    )

    @classmethod
    def _encode(cls, s):
        """Drop everything but word characters, whitespace and hyphens,
        join words with hyphens, apply the parent encoding and lowercase
        the result.
        """
        s = re.sub(r'[^\w\s-]', '', s)
        s = re.sub(r'\s+', '-', s)
        return super(LyricsCom, cls)._encode(s).lower()

    def fetch(self, artist, title):
        """Return the lyrics found on the song page, or None.

        None is returned when the page cannot be downloaded, when the
        lyrics div is empty, or when the text contains one of the
        NOT_FOUND phrases. A trailing "Lyrics powered by" credit is cut
        off.
        """
        # The address reads <title>-lyrics-<artist>, which is how the
        # function-based fetch_lyricscom filled this pattern. build_url
        # substitutes its arguments in the order given, so pass the title
        # first.
        url = self.build_url(title, artist)
        html = self.fetch_url(url)
        if not html:
            return
        lyrics = extract_text_between(html, '<div id="lyrics" class="SCREENO'
                                      'NLY" itemprop="description">', '</div>')
        if not lyrics:
            return
        if any(marker in lyrics for marker in self.NOT_FOUND):
            return
        # str.split always yields at least one element, so indexing is safe.
        return lyrics.split('\n---\nLyrics powered by', 1)[0]
def remove_credits(text):
@ -315,36 +274,6 @@ def remove_credits(text):
return text
def is_lyrics(text, artist=None):
    """Determine whether the text seems to be valid lyrics.

    :param text: candidate lyrics.
    :param artist: optional artist name; each mention of it in the text
        counts as one bad trigger.
    :returns: False for empty text or text with at most one line break,
        otherwise True when fewer than two bad triggers were collected.
        Having fewer than five line breaks is itself one trigger; every
        occurrence of a trigger word delimited by non-word characters is
        another.
    """
    if not text:
        return False
    badTriggersOcc = []
    nbLines = text.count('\n')
    if nbLines <= 1:
        log.debug(u"Ignoring too short lyrics '{0}'", text)
        return False
    elif nbLines < 5:
        badTriggersOcc.append('too_short')
    else:
        # Lyrics look legit, remove credits to avoid being penalized further
        # down
        text = remove_credits(text)

    badTriggers = ['lyrics', 'copyright', 'property', 'links']
    if artist:
        # Look for the artist like any other trigger word. Adding it
        # straight to the occurrences would count one strike on every
        # call, whether or not the name appears in the text.
        badTriggers.append(artist)

    for item in badTriggers:
        # Escape the word: an artist name may contain regex metacharacters.
        occurrences = re.findall(r'\W%s\W' % re.escape(item), text, re.I)
        badTriggersOcc += [item] * len(occurrences)

    if badTriggersOcc:
        log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
    return len(badTriggersOcc) < 2
def _scrape_strip_cruft(html, plain_text_out=False):
"""Clean up HTML
"""
@ -396,50 +325,119 @@ def scrape_lyrics_from_html(html):
return soup
def fetch_google(artist, title):
"""Fetch lyrics from Google search results.
"""
query = u"%s %s" % (artist, title)
api_key = config['lyrics']['google_API_key'].get(unicode)
engine_id = config['lyrics']['google_engine_ID'].get(unicode)
url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
(api_key, engine_id, urllib.quote(query.encode('utf8')))
class Google(Backend):
"""Fetch lyrics from Google search results."""
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics.
"""
if not text:
return False
badTriggersOcc = []
nbLines = text.count('\n')
if nbLines <= 1:
self._log.debug(u"Ignoring too short lyrics '{0}'", text)
return False
elif nbLines < 5:
badTriggersOcc.append('too_short')
else:
# Lyrics look legit, remove credits to avoid being penalized
# further down
text = remove_credits(text)
data = urllib.urlopen(url)
data = json.load(data)
if 'error' in data:
reason = data['error']['errors'][0]['reason']
log.debug(u'google lyrics backend error: {0}', reason)
return
badTriggers = ['lyrics', 'copyright', 'property', 'links']
if artist:
badTriggersOcc += [artist]
if 'items' in data.keys():
for item in data['items']:
urlLink = item['link']
urlTitle = item.get('title', u'')
if not is_page_candidate(urlLink, urlTitle, title, artist):
continue
html = fetch_url(urlLink)
lyrics = scrape_lyrics_from_html(html)
if not lyrics:
continue
for item in badTriggers:
badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
text, re.I))
if is_lyrics(lyrics, artist):
log.debug(u'got lyrics from {0}', item['displayLink'])
return lyrics
if badTriggersOcc:
self._log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
return len(badTriggersOcc) < 2
def slugify(self, text):
    """Normalize a string into a plain ASCII, underscore-separated form.

    Hyphens, apostrophes, underscores and whitespace all become
    underscores (runs are collapsed and the ends trimmed), parenthesized
    content is removed, and the text is NFKD-normalized and reduced to
    ASCII. If a UnicodeDecodeError occurs during that last step, it is
    logged and the text as it stood at that point is returned.
    """
    text = re.sub(r"[-'_\s]", '_', text)
    text = re.sub(r"_+", '_', text).strip('_')
    # Remove content within parentheses. Raw strings keep the regex
    # escapes from being read as (invalid) string escapes.
    pat = r"([^,\(]*)\((.*?)\)"
    text = re.sub(pat, r'\g<1>', text).strip()
    try:
        text = unicodedata.normalize('NFKD', text).encode('ascii',
                                                          'ignore')
        text = unicode(re.sub(r'[-\s]+', ' ', text))
    except UnicodeDecodeError:
        self._log.exception(u"Failing to normalize '{0}'", text)
    return text
# Plugin logic.
BY_TRANS = ['by', 'par', 'de', 'von']
LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
SOURCE_BACKENDS = {
'google': fetch_google,
'lyricwiki': fetch_lyricswiki,
'lyrics.com': fetch_lyricscom,
'musixmatch': fetch_musixmatch,
}
def is_page_candidate(self, urlLink, urlTitle, title, artist):
    """Return True if the URL title makes it a good candidate to be a
    page that contains lyrics of title by artist.

    :param urlLink: address of the page; its host name is treated as
        noise when it shows up in the page title.
    :param urlTitle: title of the page.
    :param title: song title being searched for.
    :param artist: artist being searched for.
    :returns: True when the slugified page title contains the slugified
        song title, or when what remains of the page title after removing
        artist, site and "lyrics" words is at least 90% similar to it.
    """
    title = self.slugify(title.lower())
    artist = self.slugify(artist.lower())
    # The host is whatever sits between '//' and the next '/'. An address
    # without that shape gives no match; fall back to an empty site name
    # instead of failing on None.group().
    host_match = re.search(u"//([^/]+)/.*",
                           self.slugify(urlLink.lower()))
    sitename = host_match.group(1) if host_match else u''
    urlTitle = self.slugify(urlTitle.lower())
    # Check if URL title contains song title (exact match)
    if urlTitle.find(title) != -1:
        return True
    # or try extracting song title from URL title and check if
    # they are close enough
    tokens = [by + '_' + artist for by in self.BY_TRANS] + \
        [artist, sitename, sitename.replace('www.', '')] + \
        self.LYRICS_TRANS
    # Tokens are literal text, so escape them: an artist such as "[band"
    # would otherwise be an invalid regex and "." in the host would match
    # any character. Empty tokens are dropped because an empty alternative
    # always matches and would shadow every alternative after it.
    pattern = u'|'.join(re.escape(token) for token in tokens if token)
    songTitle = re.sub(u'(%s)' % pattern, u'', urlTitle)
    songTitle = songTitle.strip('_|')
    typoRatio = .9
    ratio = difflib.SequenceMatcher(None, songTitle, title).ratio()
    return ratio >= typoRatio
def fetch(self, artist, title):
    """Search Google's custom search API for a page holding the lyrics.

    The query is "<artist> <title>"; the API key and engine id are read
    from the ``lyrics`` configuration section. Each result whose title
    passes ``is_page_candidate`` is downloaded and scraped, and the first
    text accepted by ``is_lyrics`` is returned.

    :returns: the scraped lyrics, or None when the API reports an error
        or no result yields acceptable lyrics.
    """
    import contextlib  # needed only here, to close the API response

    query = u"%s %s" % (artist, title)
    api_key = config['lyrics']['google_API_key'].get(unicode)
    engine_id = config['lyrics']['google_engine_ID'].get(unicode)
    url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
        (api_key, engine_id, urllib.quote(query.encode('utf8')))

    # Close the response as soon as the JSON is parsed instead of leaving
    # the connection to be reclaimed whenever the object is collected.
    with contextlib.closing(urllib.urlopen(url)) as response:
        data = json.load(response)
    if 'error' in data:
        reason = data['error']['errors'][0]['reason']
        self._log.debug(u'google lyrics backend error: {0}', reason)
        return

    for item in data.get('items', ()):
        urlLink = item['link']
        urlTitle = item.get('title', u'')
        if not self.is_page_candidate(urlLink, urlTitle,
                                      title, artist):
            continue
        html = self.fetch_url(urlLink)
        if not html:
            # Download failed; there is nothing to scrape.
            continue
        lyrics = scrape_lyrics_from_html(html)
        if not lyrics:
            continue

        if self.is_lyrics(lyrics, artist):
            self._log.debug(u'got lyrics from {0}',
                            item['displayLink'])
            return lyrics
class LyricsPlugin(plugins.BeetsPlugin):
SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
SOURCE_BACKENDS = {
'google': Google,
'lyricwiki': LyricsWiki,
'lyrics.com': LyricsCom,
'musixmatch': MusiXmatch,
}
def __init__(self):
super(LyricsPlugin, self).__init__()
self._import_stages = [self.imported]
@ -449,18 +447,18 @@ class LyricsPlugin(plugins.BeetsPlugin):
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
'fallback': None,
'force': False,
'sources': SOURCES,
'sources': self.SOURCES,
})
available_sources = list(SOURCES)
available_sources = list(self.SOURCES)
if not self.config['google_API_key'].get() and \
'google' in SOURCES:
'google' in self.SOURCES:
available_sources.remove('google')
self.config['sources'] = plugins.sanitize_choices(
self.config['sources'].as_str_seq(), available_sources)
self.backends = []
for key in self.config['sources'].as_str_seq():
self.backends.append(SOURCE_BACKENDS[key])
self.backends.append(self.SOURCE_BACKENDS[key](self._log))
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@ -499,7 +497,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
lyrics will also be written to the file itself."""
# Skip if the item already has lyrics.
if not force and item.lyrics:
log.info(u'lyrics already present: {0.artist} - {0.title}', item)
self._log.info(u'lyrics already present: {0.artist} - {0.title}',
item)
return
lyrics = None
@ -511,9 +510,9 @@ class LyricsPlugin(plugins.BeetsPlugin):
lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])
if lyrics:
log.info(u'fetched lyrics: {0} - {1}', item.artist, item.title)
self._log.info(u'fetched lyrics: {0.artist} - {0.title}', item)
else:
log.info(u'lyrics not found: {0} - {1}', item.artist, item.title)
self._log.info(u'lyrics not found: {0.artist} - {0.title}', item)
fallback = self.config['fallback'].get()
if fallback:
lyrics = fallback
@ -531,7 +530,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
None if no lyrics were found.
"""
for backend in self.backends:
lyrics = backend(artist, title)
lyrics = backend.fetch(artist, title)
if lyrics:
log.debug(u'got lyrics from backend: {0}', backend.__name__)
self._log.debug(u'got lyrics from backend: {0}',
backend.__class__.__name__)
return _scrape_strip_cruft(lyrics, True)