From 63041736e3a18a0ca7fe4efd1278796434f24aa4 Mon Sep 17 00:00:00 2001 From: Bruno Cauet Date: Tue, 6 Jan 2015 18:09:18 +0100 Subject: [PATCH] Convert lyrics plugin, with OO rewrite of backends --- beetsplug/lyrics.py | 416 ++++++++++++++++++++++---------------------- 1 file changed, 208 insertions(+), 208 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 710cdd2e4..70a1133c2 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -25,15 +25,10 @@ import difflib import itertools from HTMLParser import HTMLParseError -from beets import logging from beets import plugins from beets import config, ui -# Global logger. - -log = logging.getLogger(__name__) - DIV_RE = re.compile(r'<(/?)div>?', re.I) COMMENT_RE = re.compile(r'', re.S) TAG_RE = re.compile(r'<[^>]*>') @@ -56,21 +51,6 @@ URL_CHARACTERS = { # Utilities. -def fetch_url(url): - """Retrieve the content at a given URL, or return None if the source - is unreachable. - """ - try: - r = requests.get(url, verify=False) - except requests.RequestException as exc: - log.debug(u'lyrics request failed: {0}', exc) - return - if r.status_code == requests.codes.ok: - return r.text - else: - log.debug(u'failed to fetch: {0} ({1})', url, r.status_code) - - def unescape(text): """Resolves &#xxx; HTML entities (and some others).""" if isinstance(text, str): @@ -174,131 +154,110 @@ def search_pairs(item): return itertools.product(artists, multi_titles) -def _encode(s): - """Encode the string for inclusion in a URL (common to both - LyricsWiki and Lyrics.com). - """ - if isinstance(s, unicode): - for char, repl in URL_CHARACTERS.items(): - s = s.replace(char, repl) - s = s.encode('utf8', 'ignore') - return urllib.quote(s) +class Backend(object): + def __init__(self, log): + self._log = log -# Musixmatch + @staticmethod + def _encode(s): + """Encode the string for inclusion in a URL""" + if isinstance(s, unicode): + for char, repl in URL_CHARACTERS.items(): + s = s.replace(char, repl) + s = s.encode('utf8', 'ignore') + return urllib.quote(s) -MUSIXMATCH_URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s' + def build_url(self, artist, title): + return self.URL_PATTERN % (self._encode(artist.title()), + self._encode(title.title())) - -def fetch_musixmatch(artist, title): - url = MUSIXMATCH_URL_PATTERN % (_lw_encode(artist.title()), - _lw_encode(title.title())) - html = fetch_url(url) - if not html: - return - lyrics = extract_text_between(html, '"lyrics_body":', '"lyrics_language":') - return lyrics.strip(',"').replace('\\n', '\n') - -# LyricsWiki. - -LYRICSWIKI_URL_PATTERN = 'http://lyrics.wikia.com/%s:%s' - - -def _lw_encode(s): - s = re.sub(r'\s+', '_', s) - s = s.replace("<", "Less_Than") - s = s.replace(">", "Greater_Than") - s = s.replace("#", "Number_") - s = re.sub(r'[\[\{]', '(', s) - s = re.sub(r'[\]\}]', ')', s) - return _encode(s) - - -def fetch_lyricswiki(artist, title): - """Fetch lyrics from LyricsWiki.""" - url = LYRICSWIKI_URL_PATTERN % (_lw_encode(artist), _lw_encode(title)) - html = fetch_url(url) - if not html: - return - - lyrics = extract_text_in(html, u"
") - if lyrics and 'Unfortunately, we are not licensed' not in lyrics: - return lyrics - - -# Lyrics.com. - -LYRICSCOM_URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html' -LYRICSCOM_NOT_FOUND = ( - 'Sorry, we do not have the lyric', - 'Submit Lyrics', -) - - -def _lc_encode(s): - s = re.sub(r'[^\w\s-]', '', s) - s = re.sub(r'\s+', '-', s) - return _encode(s).lower() - - -def fetch_lyricscom(artist, title): - """Fetch lyrics from Lyrics.com.""" - url = LYRICSCOM_URL_PATTERN % (_lc_encode(title), _lc_encode(artist)) - html = fetch_url(url) - if not html: - return - lyrics = extract_text_between(html, '
', '
') - if not lyrics: - return - for not_found_str in LYRICSCOM_NOT_FOUND: - if not_found_str in lyrics: + def fetch_url(self, url): + """Retrieve the content at a given URL, or return None if the source + is unreachable. + """ + try: + r = requests.get(url, verify=False) + except requests.RequestException as exc: + self._log.debug(u'lyrics request failed: {0}', exc) return + if r.status_code == requests.codes.ok: + return r.text + else: + self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code) - parts = lyrics.split('\n---\nLyrics powered by', 1) - if parts: - return parts[0] + def fetch(self, artist, title): + raise NotImplementedError() -# Optional Google custom search API backend. - -def slugify(text): - """Normalize a string and remove non-alphanumeric characters. - """ - text = re.sub(r"[-'_\s]", '_', text) - text = re.sub(r"_+", '_', text).strip('_') - pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses - text = re.sub(pat, '\g<1>', text).strip() - try: - text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore') - text = unicode(re.sub('[-\s]+', ' ', text)) - except UnicodeDecodeError: - log.exception(u"Failing to normalize '{0}'", text) - return text +class SymbolsReplaced(Backend): + @classmethod + def _encode(cls, s): + s = re.sub(r'\s+', '_', s) + s = s.replace("<", "Less_Than") + s = s.replace(">", "Greater_Than") + s = s.replace("#", "Number_") + s = re.sub(r'[\[\{]', '(', s) + s = re.sub(r'[\]\}]', ')', s) + return super(SymbolsReplaced, cls)._encode(s) -BY_TRANS = ['by', 'par', 'de', 'von'] -LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte'] +class MusiXmatch(SymbolsReplaced): + URL_PATTERN = 'https://www.musixmatch.com/lyrics/%s/%s' + + def fetch(self, artist, title): + url = self.build_url(artist, title) + html = self.fetch_url(url) + if not html: + return + lyrics = extract_text_between(html, + '"lyrics_body":', '"lyrics_language":') + return lyrics.strip(',"').replace('\\n', '\n') -def is_page_candidate(urlLink, urlTitle, title, artist): - """Return True if the URL title makes it a good candidate to be a - page that contains lyrics of title by artist. - """ - title = slugify(title.lower()) - artist = slugify(artist.lower()) - sitename = re.search(u"//([^/]+)/.*", slugify(urlLink.lower())).group(1) - urlTitle = slugify(urlTitle.lower()) - # Check if URL title contains song title (exact match) - if urlTitle.find(title) != -1: - return True - # or try extracting song title from URL title and check if - # they are close enough - tokens = [by + '_' + artist for by in BY_TRANS] + \ - [artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS - songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle) - songTitle = songTitle.strip('_|') - typoRatio = .9 - return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio +class LyricsWiki(SymbolsReplaced): + """Fetch lyrics from LyricsWiki.""" + URL_PATTERN = 'http://lyrics.wikia.com/%s:%s' + + def fetch(self, artist, title): + url = self.build_url(artist, title) + html = self.fetch_url(url) + if not html: + return + lyrics = extract_text_in(html, u"
") + if lyrics and 'Unfortunately, we are not licensed' not in lyrics: + return lyrics + + +class LyricsCom(Backend): + """Fetch lyrics from Lyrics.com.""" + URL_PATTERN = 'http://www.lyrics.com/%s-lyrics-%s.html' + NOT_FOUND = ( + 'Sorry, we do not have the lyric', + 'Submit Lyrics', + ) + + @classmethod + def _encode(cls, s): + s = re.sub(r'[^\w\s-]', '', s) + s = re.sub(r'\s+', '-', s) + return super(LyricsCom, cls)._encode(s).lower() + + def fetch(self, artist, title): + url = self.build_url(artist, title) + html = self.fetch_url(url) + if not html: + return + lyrics = extract_text_between(html, '
', '
') + if not lyrics: + return + for not_found_str in self.NOT_FOUND: + if not_found_str in lyrics: + return + + parts = lyrics.split('\n---\nLyrics powered by', 1) + if parts: + return parts[0] def remove_credits(text): @@ -315,36 +274,6 @@ def remove_credits(text): return text -def is_lyrics(text, artist=None): - """Determine whether the text seems to be valid lyrics. - """ - if not text: - return False - badTriggersOcc = [] - nbLines = text.count('\n') - if nbLines <= 1: - log.debug(u"Ignoring too short lyrics '{0}'", text) - return False - elif nbLines < 5: - badTriggersOcc.append('too_short') - else: - # Lyrics look legit, remove credits to avoid being penalized further - # down - text = remove_credits(text) - - badTriggers = ['lyrics', 'copyright', 'property', 'links'] - if artist: - badTriggersOcc += [artist] - - for item in badTriggers: - badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item, - text, re.I)) - - if badTriggersOcc: - log.debug(u'Bad triggers detected: {0}', badTriggersOcc) - return len(badTriggersOcc) < 2 - - def _scrape_strip_cruft(html, plain_text_out=False): """Clean up HTML """ @@ -396,50 +325,119 @@ def scrape_lyrics_from_html(html): return soup -def fetch_google(artist, title): - """Fetch lyrics from Google search results. - """ - query = u"%s %s" % (artist, title) - api_key = config['lyrics']['google_API_key'].get(unicode) - engine_id = config['lyrics']['google_engine_ID'].get(unicode) - url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \ - (api_key, engine_id, urllib.quote(query.encode('utf8'))) +class Google(Backend): + """Fetch lyrics from Google search results.""" + def is_lyrics(self, text, artist=None): + """Determine whether the text seems to be valid lyrics. + """ + if not text: + return False + badTriggersOcc = [] + nbLines = text.count('\n') + if nbLines <= 1: + self._log.debug(u"Ignoring too short lyrics '{0}'", text) + return False + elif nbLines < 5: + badTriggersOcc.append('too_short') + else: + # Lyrics look legit, remove credits to avoid being penalized + # further down + text = remove_credits(text) - data = urllib.urlopen(url) - data = json.load(data) - if 'error' in data: - reason = data['error']['errors'][0]['reason'] - log.debug(u'google lyrics backend error: {0}', reason) - return + badTriggers = ['lyrics', 'copyright', 'property', 'links'] + if artist: + badTriggersOcc += [artist] - if 'items' in data.keys(): - for item in data['items']: - urlLink = item['link'] - urlTitle = item.get('title', u'') - if not is_page_candidate(urlLink, urlTitle, title, artist): - continue - html = fetch_url(urlLink) - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue + for item in badTriggers: + badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item, + text, re.I)) - if is_lyrics(lyrics, artist): - log.debug(u'got lyrics from {0}', item['displayLink']) - return lyrics + if badTriggersOcc: + self._log.debug(u'Bad triggers detected: {0}', badTriggersOcc) + return len(badTriggersOcc) < 2 + def slugify(self, text): + """Normalize a string and remove non-alphanumeric characters. + """ + text = re.sub(r"[-'_\s]", '_', text) + text = re.sub(r"_+", '_', text).strip('_') + pat = "([^,\(]*)\((.*?)\)" # Remove content within parentheses + text = re.sub(pat, '\g<1>', text).strip() + try: + text = unicodedata.normalize('NFKD', text).encode('ascii', + 'ignore') + text = unicode(re.sub('[-\s]+', ' ', text)) + except UnicodeDecodeError: + self._log.exception(u"Failing to normalize '{0}'", text) + return text -# Plugin logic. + BY_TRANS = ['by', 'par', 'de', 'von'] + LYRICS_TRANS = ['lyrics', 'paroles', 'letras', 'liedtexte'] -SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch'] -SOURCE_BACKENDS = { - 'google': fetch_google, - 'lyricwiki': fetch_lyricswiki, - 'lyrics.com': fetch_lyricscom, - 'musixmatch': fetch_musixmatch, -} + def is_page_candidate(self, urlLink, urlTitle, title, artist): + """Return True if the URL title makes it a good candidate to be a + page that contains lyrics of title by artist. + """ + title = self.slugify(title.lower()) + artist = self.slugify(artist.lower()) + sitename = re.search(u"//([^/]+)/.*", + self.slugify(urlLink.lower())).group(1) + urlTitle = self.slugify(urlTitle.lower()) + # Check if URL title contains song title (exact match) + if urlTitle.find(title) != -1: + return True + # or try extracting song title from URL title and check if + # they are close enough + tokens = [by + '_' + artist for by in self.BY_TRANS] + \ + [artist, sitename, sitename.replace('www.', '')] + \ + self.LYRICS_TRANS + songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle) + songTitle = songTitle.strip('_|') + typoRatio = .9 + ratio = difflib.SequenceMatcher(None, songTitle, title).ratio() + return ratio >= typoRatio + + def fetch(self, artist, title): + query = u"%s %s" % (artist, title) + api_key = config['lyrics']['google_API_key'].get(unicode) + engine_id = config['lyrics']['google_engine_ID'].get(unicode) + url = u'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s' % \ + (api_key, engine_id, urllib.quote(query.encode('utf8'))) + + data = urllib.urlopen(url) + data = json.load(data) + if 'error' in data: + reason = data['error']['errors'][0]['reason'] + self._log.debug(u'google lyrics backend error: {0}', reason) + return + + if 'items' in data.keys(): + for item in data['items']: + urlLink = item['link'] + urlTitle = item.get('title', u'') + if not self.is_page_candidate(urlLink, urlTitle, + title, artist): + continue + html = self.fetch_url(urlLink) + lyrics = scrape_lyrics_from_html(html) + if not lyrics: + continue + + if self.is_lyrics(lyrics, artist): + self._log.debug(u'got lyrics from {0}', + item['displayLink']) + return lyrics class LyricsPlugin(plugins.BeetsPlugin): + SOURCES = ['google', 'lyricwiki', 'lyrics.com', 'musixmatch'] + SOURCE_BACKENDS = { + 'google': Google, + 'lyricwiki': LyricsWiki, + 'lyrics.com': LyricsCom, + 'musixmatch': MusiXmatch, + } + def __init__(self): super(LyricsPlugin, self).__init__() self._import_stages = [self.imported] @@ -449,18 +447,18 @@ class LyricsPlugin(plugins.BeetsPlugin): 'google_engine_ID': u'009217259823014548361:lndtuqkycfu', 'fallback': None, 'force': False, - 'sources': SOURCES, + 'sources': self.SOURCES, }) - available_sources = list(SOURCES) + available_sources = list(self.SOURCES) if not self.config['google_API_key'].get() and \ - 'google' in SOURCES: + 'google' in self.SOURCES: available_sources.remove('google') self.config['sources'] = plugins.sanitize_choices( self.config['sources'].as_str_seq(), available_sources) self.backends = [] for key in self.config['sources'].as_str_seq(): - self.backends.append(SOURCE_BACKENDS[key]) + self.backends.append(self.SOURCE_BACKENDS[key](self._log)) def commands(self): cmd = ui.Subcommand('lyrics', help='fetch song lyrics') @@ -499,7 +497,8 @@ class LyricsPlugin(plugins.BeetsPlugin): lyrics will also be written to the file itself.""" # Skip if the item already has lyrics. if not force and item.lyrics: - log.info(u'lyrics already present: {0.artist} - {0.title}', item) + self._log.info(u'lyrics already present: {0.artist} - {0.title}', + item) return lyrics = None @@ -511,9 +510,9 @@ class LyricsPlugin(plugins.BeetsPlugin): lyrics = u"\n\n---\n\n".join([l for l in lyrics if l]) if lyrics: - log.info(u'fetched lyrics: {0} - {1}', item.artist, item.title) + self._log.info(u'fetched lyrics: {0.artist} - {0.title}', item) else: - log.info(u'lyrics not found: {0} - {1}', item.artist, item.title) + self._log.info(u'lyrics not found: {0.artist} - {0.title}', item) fallback = self.config['fallback'].get() if fallback: lyrics = fallback @@ -531,7 +530,8 @@ class LyricsPlugin(plugins.BeetsPlugin): None if no lyrics were found. """ for backend in self.backends: - lyrics = backend(artist, title) + lyrics = backend.fetch(artist, title) if lyrics: - log.debug(u'got lyrics from backend: {0}', backend.__name__) + self._log.debug(u'got lyrics from backend: {0}', + backend.__class__.__name__) return _scrape_strip_cruft(lyrics, True)