mirror of
https://github.com/beetbox/beets.git
synced 2026-02-26 17:21:24 +01:00
Convert lyrics plugin, with OO rewrite of backends
This commit is contained in:
parent
860e7e1483
commit
63041736e3
1 changed files with 208 additions and 208 deletions
|
|
@ -25,15 +25,10 @@ import difflib
|
|||
import itertools
|
||||
from HTMLParser import HTMLParseError
|
||||
|
||||
from beets import logging
|
||||
from beets import plugins
|
||||
from beets import config, ui
|
||||
|
||||
|
||||
# Global logger.
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
DIV_RE = re.compile(r'<(/?)div>?', re.I)
|
||||
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
|
||||
TAG_RE = re.compile(r'<[^>]*>')
|
||||
|
|
@ -56,21 +51,6 @@ URL_CHARACTERS = {
|
|||
|
||||
# Utilities.
|
||||
|
||||
def fetch_url(url):
|
||||
"""Retrieve the content at a given URL, or return None if the source
|
||||
is unreachable.
|
||||
"""
|
||||
try:
|
||||
r = requests.get(url, verify=False)
|
||||
except requests.RequestException as exc:
|
||||
log.debug(u'lyrics request failed: {0}', exc)
|
||||
return
|
||||
if r.status_code == requests.codes.ok:
|
||||
return r.text
|
||||
else:
|
||||
log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
|
||||
|
||||
|
||||
def unescape(text):
|
||||
"""Resolves &#xxx; HTML entities (and some others)."""
|
||||
if isinstance(text, str):
|
||||
|
|
@ -174,131 +154,110 @@ def search_pairs(item):
|
|||
return itertools.product(artists, multi_titles)
|
||||
|
||||
|
||||
def _encode(s):
|
||||
"""Encode the string for inclusion in a URL (common to both
|
||||
LyricsWiki and Lyrics.com).
|
||||
"""
|
||||
if isinstance(s, unicode):
|
||||
for char, repl in URL_CHARACTERS.items():
|
||||
s = s.replace(char, repl)
|
||||
s = s.encode('utf8', 'ignore')
|
||||
return urllib.quote(s)
|
||||
class Backend(object):
|
||||
def __init__(self, log):
|
||||
self._log = log
|
||||
|
||||
# Musixmatch
|
||||
@staticmethod
|
||||
def _encode(s):
|
||||
"""Encode the string for inclusion in a URL"""
|
||||
if isinstance(s, unicode):
|
||||
for char, repl in URL_CHARACTERS.items():
|
||||
s = s.replace(char, repl)
|
||||
s = s.encode('utf8', 'ignore')
|
||||
return urllib.quote(s)
|
||||
|
||||
MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
|
||||
def build_url(self, artist, title):
|
||||
return self.URL_PATTERN % (self._encode(artist.title()),
|
||||
self._encode(title.title()))
|
||||
|
||||
|
||||
def fetch_musixmatch(artist, title):
|
||||
url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()),
|
||||
_lw_encode(title.title()))
|
||||
html = fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_between(html, '"lyrics_body":', '"lyrics_language":')
|
||||
return lyrics.strip(',"').replace('\\n', '\n')
|
||||
|
||||
# LyricsWiki.
|
||||
|
||||
LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
|
||||
|
||||
|
||||
def _lw_encode(s):
|
||||
s = re.sub(r'\s+', '_', s)
|
||||
s = s.replace("<", "Less_Than")
|
||||
s = s.replace(">", "Greater_Than")
|
||||
s = s.replace("#", "Number_")
|
||||
s = re.sub(r'[\[\{]', '(', s)
|
||||
s = re.sub(r'[\]\}]', ')', s)
|
||||
return _encode(s)
|
||||
|
||||
|
||||
def fetch_lyricswiki(artist, title):
|
||||
"""Fetch lyrics from LyricsWiki."""
|
||||
url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title))
|
||||
html = fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
|
||||
lyrics = extract_text_in(html, u"<div class='lyricbox'>")
|
||||
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
|
||||
return lyrics
|
||||
|
||||
|
||||
# Lyrics.com.
|
||||
|
||||
LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
|
||||
LYRICSCOM_NOT_FOUND = (
|
||||
'Sorry, we do not have the lyric',
|
||||
'Submit Lyrics',
|
||||
)
|
||||
|
||||
|
||||
def _lc_encode(s):
|
||||
s = re.sub(r'[^\w\s-]', '', s)
|
||||
s = re.sub(r'\s+', '-', s)
|
||||
return _encode(s).lower()
|
||||
|
||||
|
||||
def fetch_lyricscom(artist, title):
|
||||
"""Fetch lyrics from Lyrics.com."""
|
||||
url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist))
|
||||
html = fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_between(html, '<div id="lyrics" class="SCREENONLY" '
|
||||
'itemprop="description">', '</div>')
|
||||
if not lyrics:
|
||||
return
|
||||
for not_found_str in LYRICSCOM_NOT_FOUND:
|
||||
if not_found_str in lyrics:
|
||||
def fetch_url(self, url):
|
||||
"""Retrieve the content at a given URL, or return None if the source
|
||||
is unreachable.
|
||||
"""
|
||||
try:
|
||||
r = requests.get(url, verify=False)
|
||||
except requests.RequestException as exc:
|
||||
self._log.debug(u'lyrics request failed: {0}', exc)
|
||||
return
|
||||
if r.status_code == requests.codes.ok:
|
||||
return r.text
|
||||
else:
|
||||
self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
|
||||
|
||||
parts = lyrics.split('\n---\nLyrics powered by', 1)
|
||||
if parts:
|
||||
return parts[0]
|
||||
def fetch(self, artist, title):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
# Optional Google custom search API backend.
|
||||
|
||||
def slugify(text):
|
||||
"""Normalize a string and remove non-alphanumeric characters.
|
||||
"""
|
||||
text = re.sub(r"[-'_\s]", '_', text)
|
||||
text = re.sub(r"_+", '_', text).strip('_')
|
||||
pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses
|
||||
text = re.sub(pat, '\g<1>', text).strip()
|
||||
try:
|
||||
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
|
||||
text = unicode(re.sub('[-\s]+', ' ', text))
|
||||
except UnicodeDecodeError:
|
||||
log.exception(u"Failing to normalize '{0}'", text)
|
||||
return text
|
||||
class SymbolsReplaced(Backend):
|
||||
@classmethod
|
||||
def _encode(cls, s):
|
||||
s = re.sub(r'\s+', '_', s)
|
||||
s = s.replace("<", "Less_Than")
|
||||
s = s.replace(">", "Greater_Than")
|
||||
s = s.replace("#", "Number_")
|
||||
s = re.sub(r'[\[\{]', '(', s)
|
||||
s = re.sub(r'[\]\}]', ')', s)
|
||||
return super(SymbolsReplaced, cls)._encode(s)
|
||||
|
||||
|
||||
BY_TRANS = ['by', 'par', 'de', 'von']
|
||||
LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
|
||||
class MusiXmatch(SymbolsReplaced):
|
||||
URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s'
|
||||
|
||||
def fetch(self, artist, title):
|
||||
url = self.build_url(artist, title)
|
||||
html = self.fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_between(html,
|
||||
'"lyrics_body":', '"lyrics_language":')
|
||||
return lyrics.strip(',"').replace('\\n', '\n')
|
||||
|
||||
|
||||
def is_page_candidate(urlLink, urlTitle, title, artist):
|
||||
"""Return True if the URL title makes it a good candidate to be a
|
||||
page that contains lyrics of title by artist.
|
||||
"""
|
||||
title = slugify(title.lower())
|
||||
artist = slugify(artist.lower())
|
||||
sitename = re.search(u"//([^/]+)/.*", slugify(urlLink.lower())).group(1)
|
||||
urlTitle = slugify(urlTitle.lower())
|
||||
# Check if URL title contains song title (exact match)
|
||||
if urlTitle.find(title) != -1:
|
||||
return True
|
||||
# or try extracting song title from URL title and check if
|
||||
# they are close enough
|
||||
tokens = [by + '_' + artist for by in BY_TRANS] + \
|
||||
[artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
|
||||
songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
|
||||
songTitle = songTitle.strip('_|')
|
||||
typoRatio = .9
|
||||
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
|
||||
class LyricsWiki(SymbolsReplaced):
|
||||
"""Fetch lyrics from LyricsWiki."""
|
||||
URL_PATTERN = 'http://lyrics.wikia.com/%s:%s'
|
||||
|
||||
def fetch(self, artist, title):
|
||||
url = self.build_url(artist, title)
|
||||
html = self.fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_in(html, u"<div class='lyricbox'>")
|
||||
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
|
||||
return lyrics
|
||||
|
||||
|
||||
class LyricsCom(Backend):
|
||||
"""Fetch lyrics from Lyrics.com."""
|
||||
URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html'
|
||||
NOT_FOUND = (
|
||||
'Sorry, we do not have the lyric',
|
||||
'Submit Lyrics',
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _encode(cls, s):
|
||||
s = re.sub(r'[^\w\s-]', '', s)
|
||||
s = re.sub(r'\s+', '-', s)
|
||||
return super(LyricsCom, cls)._encode(s).lower()
|
||||
|
||||
def fetch(self, artist, title):
|
||||
url = self.build_url(artist, title)
|
||||
html = self.fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
lyrics = extract_text_between(html, '<div id="lyrics" class="SCREENO'
|
||||
'NLY" itemprop="description">', '</div>')
|
||||
if not lyrics:
|
||||
return
|
||||
for not_found_str in self.NOT_FOUND:
|
||||
if not_found_str in lyrics:
|
||||
return
|
||||
|
||||
parts = lyrics.split('\n---\nLyrics powered by', 1)
|
||||
if parts:
|
||||
return parts[0]
|
||||
|
||||
|
||||
def remove_credits(text):
|
||||
|
|
@ -315,36 +274,6 @@ def remove_credits(text):
|
|||
return text
|
||||
|
||||
|
||||
def is_lyrics(text, artist=None):
|
||||
"""Determine whether the text seems to be valid lyrics.
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
badTriggersOcc = []
|
||||
nbLines = text.count('\n')
|
||||
if nbLines <= 1:
|
||||
log.debug(u"Ignoring too short lyrics '{0}'", text)
|
||||
return False
|
||||
elif nbLines < 5:
|
||||
badTriggersOcc.append('too_short')
|
||||
else:
|
||||
# Lyrics look legit, remove credits to avoid being penalized further
|
||||
# down
|
||||
text = remove_credits(text)
|
||||
|
||||
badTriggers = ['lyrics', 'copyright', 'property', 'links']
|
||||
if artist:
|
||||
badTriggersOcc += [artist]
|
||||
|
||||
for item in badTriggers:
|
||||
badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
|
||||
text, re.I))
|
||||
|
||||
if badTriggersOcc:
|
||||
log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
|
||||
return len(badTriggersOcc) < 2
|
||||
|
||||
|
||||
def _scrape_strip_cruft(html, plain_text_out=False):
|
||||
"""Clean up HTML
|
||||
"""
|
||||
|
|
@ -396,50 +325,119 @@ def scrape_lyrics_from_html(html):
|
|||
return soup
|
||||
|
||||
|
||||
def fetch_google(artist, title):
|
||||
"""Fetch lyrics from Google search results.
|
||||
"""
|
||||
query = u"%s %s" % (artist, title)
|
||||
api_key = config['lyrics']['google_API_key'].get(unicode)
|
||||
engine_id = config['lyrics']['google_engine_ID'].get(unicode)
|
||||
url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
|
||||
(api_key, engine_id, urllib.quote(query.encode('utf8')))
|
||||
class Google(Backend):
|
||||
"""Fetch lyrics from Google search results."""
|
||||
def is_lyrics(self, text, artist=None):
|
||||
"""Determine whether the text seems to be valid lyrics.
|
||||
"""
|
||||
if not text:
|
||||
return False
|
||||
badTriggersOcc = []
|
||||
nbLines = text.count('\n')
|
||||
if nbLines <= 1:
|
||||
self._log.debug(u"Ignoring too short lyrics '{0}'", text)
|
||||
return False
|
||||
elif nbLines < 5:
|
||||
badTriggersOcc.append('too_short')
|
||||
else:
|
||||
# Lyrics look legit, remove credits to avoid being penalized
|
||||
# further down
|
||||
text = remove_credits(text)
|
||||
|
||||
data = urllib.urlopen(url)
|
||||
data = json.load(data)
|
||||
if 'error' in data:
|
||||
reason = data['error']['errors'][0]['reason']
|
||||
log.debug(u'google lyrics backend error: {0}', reason)
|
||||
return
|
||||
badTriggers = ['lyrics', 'copyright', 'property', 'links']
|
||||
if artist:
|
||||
badTriggersOcc += [artist]
|
||||
|
||||
if 'items' in data.keys():
|
||||
for item in data['items']:
|
||||
urlLink = item['link']
|
||||
urlTitle = item.get('title', u'')
|
||||
if not is_page_candidate(urlLink, urlTitle, title, artist):
|
||||
continue
|
||||
html = fetch_url(urlLink)
|
||||
lyrics = scrape_lyrics_from_html(html)
|
||||
if not lyrics:
|
||||
continue
|
||||
for item in badTriggers:
|
||||
badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
|
||||
text, re.I))
|
||||
|
||||
if is_lyrics(lyrics, artist):
|
||||
log.debug(u'got lyrics from {0}', item['displayLink'])
|
||||
return lyrics
|
||||
if badTriggersOcc:
|
||||
self._log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
|
||||
return len(badTriggersOcc) < 2
|
||||
|
||||
def slugify(self, text):
|
||||
"""Normalize a string and remove non-alphanumeric characters.
|
||||
"""
|
||||
text = re.sub(r"[-'_\s]", '_', text)
|
||||
text = re.sub(r"_+", '_', text).strip('_')
|
||||
pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses
|
||||
text = re.sub(pat, '\g<1>', text).strip()
|
||||
try:
|
||||
text = unicodedata.normalize('NFKD', text).encode('ascii',
|
||||
'ignore')
|
||||
text = unicode(re.sub('[-\s]+', ' ', text))
|
||||
except UnicodeDecodeError:
|
||||
self._log.exception(u"Failing to normalize '{0}'", text)
|
||||
return text
|
||||
|
||||
# Plugin logic.
|
||||
BY_TRANS = ['by', 'par', 'de', 'von']
|
||||
LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte']
|
||||
|
||||
SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
|
||||
SOURCE_BACKENDS = {
|
||||
'google': fetch_google,
|
||||
'lyricwiki': fetch_lyricswiki,
|
||||
'lyrics.com': fetch_lyricscom,
|
||||
'musixmatch': fetch_musixmatch,
|
||||
}
|
||||
def is_page_candidate(self, urlLink, urlTitle, title, artist):
|
||||
"""Return True if the URL title makes it a good candidate to be a
|
||||
page that contains lyrics of title by artist.
|
||||
"""
|
||||
title = self.slugify(title.lower())
|
||||
artist = self.slugify(artist.lower())
|
||||
sitename = re.search(u"//([^/]+)/.*",
|
||||
self.slugify(urlLink.lower())).group(1)
|
||||
urlTitle = self.slugify(urlTitle.lower())
|
||||
# Check if URL title contains song title (exact match)
|
||||
if urlTitle.find(title) != -1:
|
||||
return True
|
||||
# or try extracting song title from URL title and check if
|
||||
# they are close enough
|
||||
tokens = [by + '_' + artist for by in self.BY_TRANS] + \
|
||||
[artist, sitename, sitename.replace('www.', '')] + \
|
||||
self.LYRICS_TRANS
|
||||
songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
|
||||
songTitle = songTitle.strip('_|')
|
||||
typoRatio = .9
|
||||
ratio = difflib.SequenceMatcher(None, songTitle, title).ratio()
|
||||
return ratio >= typoRatio
|
||||
|
||||
def fetch(self, artist, title):
|
||||
query = u"%s %s" % (artist, title)
|
||||
api_key = config['lyrics']['google_API_key'].get(unicode)
|
||||
engine_id = config['lyrics']['google_engine_ID'].get(unicode)
|
||||
url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \
|
||||
(api_key, engine_id, urllib.quote(query.encode('utf8')))
|
||||
|
||||
data = urllib.urlopen(url)
|
||||
data = json.load(data)
|
||||
if 'error' in data:
|
||||
reason = data['error']['errors'][0]['reason']
|
||||
self._log.debug(u'google lyrics backend error: {0}', reason)
|
||||
return
|
||||
|
||||
if 'items' in data.keys():
|
||||
for item in data['items']:
|
||||
urlLink = item['link']
|
||||
urlTitle = item.get('title', u'')
|
||||
if not self.is_page_candidate(urlLink, urlTitle,
|
||||
title, artist):
|
||||
continue
|
||||
html = self.fetch_url(urlLink)
|
||||
lyrics = scrape_lyrics_from_html(html)
|
||||
if not lyrics:
|
||||
continue
|
||||
|
||||
if self.is_lyrics(lyrics, artist):
|
||||
self._log.debug(u'got lyrics from {0}',
|
||||
item['displayLink'])
|
||||
return lyrics
|
||||
|
||||
|
||||
class LyricsPlugin(plugins.BeetsPlugin):
|
||||
SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch']
|
||||
SOURCE_BACKENDS = {
|
||||
'google': Google,
|
||||
'lyricwiki': LyricsWiki,
|
||||
'lyrics.com': LyricsCom,
|
||||
'musixmatch': MusiXmatch,
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
super(LyricsPlugin, self).__init__()
|
||||
self._import_stages = [self.imported]
|
||||
|
|
@ -449,18 +447,18 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
|
||||
'fallback': None,
|
||||
'force': False,
|
||||
'sources': SOURCES,
|
||||
'sources': self.SOURCES,
|
||||
})
|
||||
|
||||
available_sources = list(SOURCES)
|
||||
available_sources = list(self.SOURCES)
|
||||
if not self.config['google_API_key'].get() and \
|
||||
'google' in SOURCES:
|
||||
'google' in self.SOURCES:
|
||||
available_sources.remove('google')
|
||||
self.config['sources'] = plugins.sanitize_choices(
|
||||
self.config['sources'].as_str_seq(), available_sources)
|
||||
self.backends = []
|
||||
for key in self.config['sources'].as_str_seq():
|
||||
self.backends.append(SOURCE_BACKENDS[key])
|
||||
self.backends.append(self.SOURCE_BACKENDS[key](self._log))
|
||||
|
||||
def commands(self):
|
||||
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
|
||||
|
|
@ -499,7 +497,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
lyrics will also be written to the file itself."""
|
||||
# Skip if the item already has lyrics.
|
||||
if not force and item.lyrics:
|
||||
log.info(u'lyrics already present: {0.artist} - {0.title}', item)
|
||||
self._log.info(u'lyrics already present: {0.artist} - {0.title}',
|
||||
item)
|
||||
return
|
||||
|
||||
lyrics = None
|
||||
|
|
@ -511,9 +510,9 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
lyrics = u"\n\n---\n\n".join([l for l in lyrics if l])
|
||||
|
||||
if lyrics:
|
||||
log.info(u'fetched lyrics: {0} - {1}', item.artist, item.title)
|
||||
self._log.info(u'fetched lyrics: {0.artist} - {0.title}', item)
|
||||
else:
|
||||
log.info(u'lyrics not found: {0} - {1}', item.artist, item.title)
|
||||
self._log.info(u'lyrics not found: {0.artist} - {0.title}', item)
|
||||
fallback = self.config['fallback'].get()
|
||||
if fallback:
|
||||
lyrics = fallback
|
||||
|
|
@ -531,7 +530,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
None if no lyrics were found.
|
||||
"""
|
||||
for backend in self.backends:
|
||||
lyrics = backend(artist, title)
|
||||
lyrics = backend.fetch(artist, title)
|
||||
if lyrics:
|
||||
log.debug(u'got lyrics from backend: {0}', backend.__name__)
|
||||
self._log.debug(u'got lyrics from backend: {0}',
|
||||
backend.__class__.__name__)
|
||||
return _scrape_strip_cruft(lyrics, True)
|
||||
|
|
|
|||
Loading…
Reference in a new issue