Merge pull request #1939 from Kraymer/lyrics_translate

Add lyrics translation using Microsoft Bing API
This commit is contained in:
Fabrice Laporte 2016-04-14 23:51:47 +02:00
commit a282245011
3 changed files with 114 additions and 22 deletions

View file

@ -16,15 +16,15 @@
"""Fetches, embeds, and displays lyrics.
"""
from __future__ import division, absolute_import, print_function
from __future__ import absolute_import, division, print_function
import re
import requests
import json
import unicodedata
import urllib
import difflib
import itertools
import json
import re
import requests
import unicodedata
import urllib
import warnings
from HTMLParser import HTMLParseError
@ -56,7 +56,7 @@ URL_CHARACTERS = {
def unescape(text):
"""Resolves &#xxx; HTML entities (and some others)."""
"""Resolve &#xxx; HTML entities (and some others)."""
if isinstance(text, bytes):
text = text.decode('utf8', 'ignore')
out = text.replace(u' ', u' ')
@ -455,29 +455,29 @@ class Google(Backend):
"""
if not text:
return False
badTriggersOcc = []
nbLines = text.count('\n')
if nbLines <= 1:
bad_triggers_occ = []
nb_lines = text.count('\n')
if nb_lines <= 1:
self._log.debug(u"Ignoring too short lyrics '{0}'", text)
return False
elif nbLines < 5:
badTriggersOcc.append('too_short')
elif nb_lines < 5:
bad_triggers_occ.append('too_short')
else:
# Lyrics look legit, remove credits to avoid being penalized
# further down
text = remove_credits(text)
badTriggers = ['lyrics', 'copyright', 'property', 'links']
bad_triggers = ['lyrics', 'copyright', 'property', 'links']
if artist:
badTriggersOcc += [artist]
bad_triggers_occ += [artist]
for item in badTriggers:
badTriggersOcc += [item] * len(re.findall(r'\W%s\W' % item,
text, re.I))
for item in bad_triggers:
bad_triggers_occ += [item] * len(re.findall(r'\W%s\W' % item,
text, re.I))
if badTriggersOcc:
self._log.debug(u'Bad triggers detected: {0}', badTriggersOcc)
return len(badTriggersOcc) < 2
if bad_triggers_occ:
self._log.debug(u'Bad triggers detected: {0}', bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
"""Normalize a string and remove non-alphanumeric characters.
@ -570,6 +570,9 @@ class LyricsPlugin(plugins.BeetsPlugin):
self.import_stages = [self.imported]
self.config.add({
'auto': True,
'bing_client_secret': None,
'bing_lang_from': [],
'bing_lang_to': None,
'google_API_key': None,
'google_engine_ID': u'009217259823014548361:lndtuqkycfu',
'genius_api_key':
@ -579,6 +582,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
'force': False,
'sources': self.SOURCES,
})
self.config['bing_client_secret'].redact = True
self.config['google_API_key'].redact = True
self.config['google_engine_ID'].redact = True
self.config['genius_api_key'].redact = True
@ -592,6 +596,27 @@ class LyricsPlugin(plugins.BeetsPlugin):
self.backends = [self.SOURCE_BACKENDS[key](self.config, self._log)
for key in self.config['sources'].as_str_seq()]
self.config['bing_lang_from'] = [
x.lower() for x in self.config['bing_lang_from'].as_str_seq()]
self.bing_auth_token = None
def get_bing_access_token(self):
    """Request an OAuth access token for the Bing Translate API.

    Posts the configured ``bing_client_secret`` to the Azure DataMarket
    access-control endpoint using the ``client_credentials`` grant.

    :returns: the value to send in an ``Authorization`` header
        (``"Bearer <token>"``), or ``None`` when the request fails or the
        reply carries no ``access_token``.
    """
    params = {
        'client_id': 'beets',
        'client_secret': self.config['bing_client_secret'],
        'scope': 'http://api.microsofttranslator.com',
        'grant_type': 'client_credentials',
    }
    oauth_url = 'https://datamarket.accesscontrol.windows.net/v2/OAuth2-13'
    try:
        response = requests.post(oauth_url, data=urllib.urlencode(params))
        oauth_token = json.loads(response.content)
    except (requests.RequestException, ValueError) as exc:
        # Translation is an optional extra: a network failure or a
        # non-JSON reply must not abort the whole lyrics fetch.
        self._log.debug(u'Bing Translate API token request failed: {0}',
                        exc)
        return None

    if isinstance(oauth_token, dict) and 'access_token' in oauth_token:
        return "Bearer " + oauth_token['access_token']

    self._log.warning(u'Could not get Bing Translate API access token.'
                      u' Check your "bing_client_secret" password')
    return None
def commands(self):
cmd = ui.Subcommand('lyrics', help='fetch song lyrics')
@ -647,6 +672,16 @@ class LyricsPlugin(plugins.BeetsPlugin):
if lyrics:
self._log.info(u'fetched lyrics: {0}', item)
if self.config['bing_client_secret'].get():
from langdetect import detect
lang_from = detect(lyrics)
if self.config['bing_lang_to'].get() != lang_from and (
not self.config['bing_lang_from'] or (
lang_from in self.config[
'bing_lang_from'].as_str_seq())):
lyrics = self.append_translation(
lyrics, self.config['bing_lang_to'])
else:
self._log.info(u'lyrics not found: {0}', item)
fallback = self.config['fallback'].get()
@ -654,11 +689,10 @@ class LyricsPlugin(plugins.BeetsPlugin):
lyrics = fallback
else:
return
item.lyrics = lyrics
if write:
item.try_write()
print(lyrics)
item.store()
def get_lyrics(self, artist, title):
@ -671,3 +705,30 @@ class LyricsPlugin(plugins.BeetsPlugin):
self._log.debug(u'got lyrics from backend: {0}',
backend.__class__.__name__)
return _scrape_strip_cruft(lyrics, True)
def append_translation(self, text, to_lang):
    """Append a translation to every line of the given lyrics.

    Each distinct non-blank line of ``text`` is sent once to the Microsoft
    Translator HTTP API, joined with ``|``. The reply is split on ``|``
    again and the lyrics are rebuilt as ``<original> / <translation>``
    lines, each terminated by a newline. Blank lines are emitted unchanged.

    :param text: lyrics as newline-separated lines.
    :param to_lang: target language code; it is rendered with ``%s`` before
        being sent as the ``to`` query parameter.
    :returns: the annotated lyrics, or ``text`` unchanged when no access
        token is available, the request fails, or the reply cannot be
        mapped back onto the source lines.
    """
    import xml.etree.ElementTree as ET

    if not self.bing_auth_token:
        self.bing_auth_token = self.get_bing_access_token()
    if not self.bing_auth_token:
        # Without a token nothing can be translated; keep the lyrics as they
        # are instead of handing ``None`` back to the caller.
        return text

    lines = text.split('\n')
    # Translate each distinct line once to limit API request size per song.
    # A list (rather than a set) keeps the request order deterministic.
    unique_lines = []
    seen = set()
    for line in lines:
        if line.strip() and line not in seen:
            seen.add(line)
            unique_lines.append(line)
    if not unique_lines:
        return text

    url = 'http://api.microsofttranslator.com/v2/Http.svc/Translate'
    # Let requests build the query string so that characters such as
    # '&', '#' or '%' inside the lyrics are properly escaped.
    params = {'text': '|'.join(unique_lines), 'to': u'%s' % (to_lang,)}

    # At most two attempts: the second one happens only after an expired
    # token has been renewed, so a misbehaving server cannot make us loop.
    response = None
    for attempt in range(2):
        try:
            response = requests.get(
                url, params=params,
                headers={"Authorization": self.bing_auth_token})
        except requests.RequestException as exc:
            self._log.debug('translation API request failed: {}', exc)
            return text
        if response.status_code == 200:
            break
        self._log.debug('translation API error {}: {}',
                        response.status_code, response.text)
        if attempt or 'token has expired' not in response.text:
            return text
        self.bing_auth_token = self.get_bing_access_token()
        if not self.bing_auth_token:
            return text

    try:
        lines_translated = ET.fromstring(response.text.encode('utf8')).text
    except ET.ParseError as exc:
        self._log.debug('translation API returned invalid XML: {}', exc)
        return text

    translated = (lines_translated or '').split('|')
    if len(translated) != len(unique_lines):
        # The separator did not survive the round trip (e.g. the lyrics
        # themselves contain '|'), so lines cannot be paired reliably.
        self._log.debug('translation API returned {} segments for {} lines',
                        len(translated), len(unique_lines))
        return text

    # Use a translation mapping dict to build resulting lyrics
    translations = dict(zip(unique_lines, translated))
    return ''.join(
        '%s / %s\n' % (line, translations[line])
        if line in translations else line + '\n'
        for line in lines)

View file

@ -10,6 +10,9 @@ New features:
art while copying it.
* :doc:`/plugins/importadded`: A new `preserve_write_mtimes` option
lets you preserve mtime of files after each write.
* :doc:`/plugins/lyrics`: The plugin can now translate the fetched lyrics to a
configured `bing_lang_to` language. Enabling translation requires registering
for a free Microsoft Azure Marketplace account. Thanks to :user:`Kraymer`.
Fixes:

View file

@ -38,6 +38,14 @@ configuration file. The available options are:
- **auto**: Fetch lyrics automatically during import.
Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
(see :ref:`lyrics-translation`).
Default: None.
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
@ -113,3 +121,23 @@ After that, the lyrics plugin will fall back on other declared data sources.
.. _pip: http://www.pip-installer.org/
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
.. _lyrics-translation:
Activate On-the-Fly Translation
-------------------------------
Using the Bing Translation API requires `langdetect`_, which you can install
using `pip`_ by typing::
pip install langdetect
You also need to register for a free Microsoft Azure Marketplace account and
for the `Microsoft Translator API`_. Follow the four-step process; specifically,
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
the ``bing_lang_to`` target `language code`_.
.. _langdetect: https://pypi.python.org/pypi/langdetect
.. _Microsoft Translator API: https://www.microsoft.com/en-us/translator/getstarted.aspx
.. _language code: https://msdn.microsoft.com/en-us/library/hh456380.aspx