mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Resurrect translation functionality
This commit is contained in:
parent
c315487bd2
commit
d7201062a8
5 changed files with 268 additions and 73 deletions
|
|
@ -113,3 +113,23 @@ class GoogleCustomSearchAPI:
|
|||
"""Pagemap data with a single meta tags dict in a list."""
|
||||
|
||||
metatags: list[JSONDict]
|
||||
|
||||
|
||||
class TranslatorAPI:
    """Typed shapes of the JSON payload returned by the translator API."""

    class Language(TypedDict):
        """Language data returned by the translator API."""

        language: str  # language code as reported by the API
        score: float  # presumably the detection confidence -- TODO confirm

    class Translation(TypedDict):
        """Translation data returned by the translator API."""

        text: str  # translated text
        to: str  # presumably the target language code -- TODO confirm

    class Response(TypedDict):
        """Response from the translator API."""

        detectedLanguage: TranslatorAPI.Language
        translations: list[TranslatorAPI.Translation]
|
||||
|
|
|
|||
|
|
@ -40,10 +40,18 @@ from beets import plugins, ui
|
|||
from beets.autotag.hooks import string_dist
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from logging import Logger
|
||||
|
||||
from beets.importer import ImportTask
|
||||
from beets.library import Item
|
||||
|
||||
from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
|
||||
from ._typing import (
|
||||
GeniusAPI,
|
||||
GoogleCustomSearchAPI,
|
||||
JSONDict,
|
||||
LRCLibAPI,
|
||||
TranslatorAPI,
|
||||
)
|
||||
|
||||
USER_AGENT = f"beets/{beets.__version__}"
|
||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||
|
|
@ -252,6 +260,12 @@ class RequestHandler:
|
|||
self.debug("Fetching JSON from {}", url)
|
||||
return r_session.get(url, **kwargs).json()
|
||||
|
||||
def post_json(self, url: str, params: JSONDict | None = None, **kwargs):
    """POST to the given URL and return the decoded JSON response.

    ``url`` and ``params`` are combined by ``format_url``; every other
    keyword argument is forwarded untouched to ``r_session.post``.
    """
    target = self.format_url(url, params)
    self.debug("Posting JSON to {}", target)
    response = r_session.post(target, **kwargs)
    return response.json()
|
||||
|
||||
@contextmanager
|
||||
def handle_request(self) -> Iterator[None]:
|
||||
try:
|
||||
|
|
@ -760,6 +774,97 @@ class Google(SearchBackend):
|
|||
return None
|
||||
|
||||
|
||||
@dataclass
class Translator(RequestHandler):
    """Append translations to lyrics through the translator HTTP API.

    Attributes:
        _log: Logger instance (presumably consumed by the ``RequestHandler``
            logging helpers -- confirm against that class).
        api_key: Key sent in the ``Ocp-Apim-Subscription-Key`` header.
        to_language: Target language code; ``from_config`` upper-cases it so
            it can be compared with the upper-cased detected language.
        from_languages: Source language codes eligible for translation. An
            empty list places no restriction on the source language.
    """

    TRANSLATE_URL = "https://api.cognitive.microsofttranslator.com/translate"
    # Optional leading timestamp, optional spaces, then the remaining text.
    # The unescaped "." accepts any single character between the last two
    # digit pairs, so both "[00:00.00]" and "[00:00:50]" count as timestamps.
    LINE_PARTS_RE = re.compile(r"^(\[\d\d:\d\d.\d\d\]|) *(.*)$")

    _log: Logger
    api_key: str
    to_language: str
    from_languages: list[str]

    @classmethod
    def from_config(
        cls,
        log: Logger,
        api_key: str,
        to_language: str,
        from_languages: list[str] | None = None,
    ) -> Translator:
        """Build a translator from raw config values.

        Language codes are upper-cased; a missing ``from_languages`` becomes
        an empty list.
        """
        return cls(
            log,
            api_key,
            to_language.upper(),
            [x.upper() for x in from_languages or []],
        )

    def get_translations(self, texts: Iterable[str]) -> list[tuple[str, str]]:
        """Return ``(text, translation)`` pairs for the given texts.

        To reduce the translation 'cost', we translate unique texts, and then
        map the translations back to the original texts.

        The unique texts are sent as a single "|"-joined string and the reply
        is split on "|" again, so the pairing relies on the service returning
        the same number of segments in the same order. Texts without a
        matching segment are paired with an empty translation.
        """
        # ``texts`` may be a one-shot iterator and is walked several times
        # below, so materialise it once up front.
        texts = list(texts)
        unique_texts = list(dict.fromkeys(texts))
        if not any(unique_texts):
            # No input, or nothing but empty strings: there is nothing to
            # translate, so do not spend a request on it.
            return [(text, "") for text in texts]

        data: list[TranslatorAPI.Response] = self.post_json(
            self.TRANSLATE_URL,
            headers={"Ocp-Apim-Subscription-Key": self.api_key},
            json=[{"text": "|".join(unique_texts)}],
            params={"api-version": "3.0", "to": self.to_language},
        )

        translations = data[0]["translations"][0]["text"].split("|")
        trans_by_text = dict(zip(unique_texts, translations))
        return [(text, trans_by_text.get(text, "")) for text in texts]

    @classmethod
    def split_line(cls, line: str) -> tuple[str, str]:
        """Split line to (timestamp, text).

        Either part may be empty. ``("", "")`` is returned when the pattern
        does not match the line at all.
        """
        if m := cls.LINE_PARTS_RE.match(line):
            return m[1], m[2]

        return "", ""

    def append_translations(self, lines: Iterable[str]) -> list[str]:
        """Append translations to the given lyrics texts.

        Lines may contain timestamps from LRCLib which need to be temporarily
        removed for the translation. They can take any of these forms:

        - empty
        - ``Text``            - text only
        - ``[00:00:00]``      - timestamp only
        - ``[00:00:00] Text`` - timestamp with text

        Returns one output line per input line, in the same order.
        """
        # split into [(timestamp, text), ...]
        ts_and_text = list(map(self.split_line, lines))
        timestamps = [ts for ts, _ in ts_and_text]
        text_pairs = self.get_translations([ln for _, ln in ts_and_text])

        # only add the separator for non-empty translations
        texts = [" / ".join(filter(None, p)) for p in text_pairs]
        # only add the space between non-empty timestamps and texts
        return [" ".join(filter(None, p)) for p in zip(timestamps, texts)]

    def translate(self, lyrics: str) -> str:
        """Translate the given lyrics to the target language.

        If the lyrics are already in the target language or not in any of
        the source languages (if configured), they are returned as is. They
        are also returned as is when the translation request did not produce
        a result.

        The footer with the source URL is preserved, if present.
        """
        lyrics_language = langdetect.detect(lyrics).upper()
        if lyrics_language == self.to_language or (
            self.from_languages and lyrics_language not in self.from_languages
        ):
            return lyrics

        body, *url = lyrics.split("\n\nSource: ")

        # NOTE(review): ``handle_request`` is defined on RequestHandler and
        # its body is not visible here. If it suppresses request errors, the
        # assignment inside the ``with`` block never happens, so start from a
        # sentinel instead of reading an unbound name afterwards.
        translated_lines: list[str] | None = None
        with self.handle_request():
            translated_lines = self.append_translations(body.splitlines())

        if translated_lines is None:
            return lyrics

        return "\n\nSource: ".join(["\n".join(translated_lines), *url])
|
||||
|
||||
|
||||
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
||||
BACKEND_BY_NAME = {
|
||||
b.name: b for b in [LRCLib, Google, Genius, Tekstowo, MusiXmatch]
|
||||
|
|
@ -776,15 +881,24 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
|||
|
||||
return [self.BACKEND_BY_NAME[c](self.config, self._log) for c in chosen]
|
||||
|
||||
@cached_property
def translator(self) -> Translator | None:
    """Translator built from the ``translate`` config section.

    ``None`` unless both ``api_key`` and ``to_language`` hold a truthy
    value; otherwise the flattened section is handed to
    ``Translator.from_config`` together with the plugin logger.
    """
    section = self.config["translate"]
    if not (section["api_key"].get() and section["to_language"].get()):
        return None

    return Translator.from_config(self._log, **section.flatten())
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.import_stages = [self.imported]
|
||||
self.config.add(
|
||||
{
|
||||
"auto": True,
|
||||
"bing_client_secret": None,
|
||||
"bing_lang_from": [],
|
||||
"bing_lang_to": None,
|
||||
"translate": {
|
||||
"api_key": None,
|
||||
"from_languages": [],
|
||||
"to_language": None,
|
||||
},
|
||||
"dist_thresh": 0.11,
|
||||
"google_API_key": None,
|
||||
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
|
||||
|
|
@ -803,7 +917,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
|||
],
|
||||
}
|
||||
)
|
||||
self.config["bing_client_secret"].redact = True
|
||||
self.config["translate"]["api_key"].redact = True
|
||||
self.config["google_API_key"].redact = True
|
||||
self.config["google_engine_ID"].redact = True
|
||||
self.config["genius_api_key"].redact = True
|
||||
|
|
@ -817,24 +931,6 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
|||
# open yet.
|
||||
self.rest = None
|
||||
|
||||
self.config["bing_lang_from"] = [
|
||||
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
|
||||
]
|
||||
|
||||
@cached_property
|
||||
def bing_access_token(self) -> str | None:
|
||||
params = {
|
||||
"client_id": "beets",
|
||||
"client_secret": self.config["bing_client_secret"],
|
||||
"scope": "https://api.microsofttranslator.com",
|
||||
"grant_type": "client_credentials",
|
||||
}
|
||||
|
||||
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
|
||||
with self.handle_request():
|
||||
r = r_session.post(oauth_url, params=params)
|
||||
return r.json()["access_token"]
|
||||
|
||||
def commands(self):
|
||||
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
|
||||
cmd.parser.add_option(
|
||||
|
|
@ -996,14 +1092,12 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
|||
|
||||
if lyrics:
|
||||
self.info("🟢 Found lyrics: {0}", item)
|
||||
if self.config["bing_client_secret"].get():
|
||||
lang_from = langdetect.detect(lyrics)
|
||||
if self.config["bing_lang_to"].get() != lang_from and (
|
||||
not self.config["bing_lang_from"]
|
||||
or (lang_from in self.config["bing_lang_from"].as_str_seq())
|
||||
):
|
||||
lyrics = self.append_translation(
|
||||
lyrics, self.config["bing_lang_to"]
|
||||
if translator := self.translator:
    initial_lyrics = lyrics
    # ``translate`` hands back the input unchanged when it decides not to
    # translate, so only report a translation when the text differs.
    if (lyrics := translator.translate(lyrics)) != initial_lyrics:
        # The target language is configured as ``translate.to_language``
        # (there is no ``translate_to`` option); the translator built by
        # ``from_config`` already holds it upper-cased.
        self.info("🟢 Added translation to {}", translator.to_language)
|
||||
else:
|
||||
self.info("🔴 Lyrics not found: {}", item)
|
||||
|
|
@ -1027,30 +1121,3 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
|||
return f"{lyrics}\n\nSource: {url}"
|
||||
|
||||
return None
|
||||
|
||||
def append_translation(self, text, to_lang):
|
||||
from xml.etree import ElementTree
|
||||
|
||||
if not (token := self.bing_access_token):
|
||||
self.warn(
|
||||
"Could not get Bing Translate API access token. "
|
||||
"Check your 'bing_client_secret' password."
|
||||
)
|
||||
return text
|
||||
|
||||
# Extract unique lines to limit API request size per song
|
||||
lines = text.split("\n")
|
||||
unique_lines = set(lines)
|
||||
url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
|
||||
with self.handle_request():
|
||||
text = self.fetch_text(
|
||||
url,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
params={"text": "|".join(unique_lines), "to": to_lang},
|
||||
)
|
||||
if translated := ElementTree.fromstring(text.encode("utf-8")).text:
|
||||
# Use a translation mapping dict to build resulting lyrics
|
||||
translations = dict(zip(unique_lines, translated.split("|")))
|
||||
return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
|
||||
|
||||
return text
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ New features:
|
|||
control the maximum allowed distance between the lyrics search result and the
|
||||
tagged item's artist and title. This is useful for preventing false positives
|
||||
when fetching lyrics.
|
||||
* :doc:`plugins/lyrics`: Rewrite lyrics translation functionality to use Azure
|
||||
AI Translator API and add relevant instructions to the documentation.
|
||||
|
||||
Bug fixes:
|
||||
|
||||
|
|
|
|||
|
|
@ -38,9 +38,10 @@ Default configuration:
|
|||
|
||||
lyrics:
|
||||
auto: yes
|
||||
bing_client_secret: null
|
||||
bing_lang_from: []
|
||||
bing_lang_to: null
|
||||
translate:
|
||||
api_key:
|
||||
from_languages: []
|
||||
to_language:
|
||||
dist_thresh: 0.11
|
||||
fallback: null
|
||||
force: no
|
||||
|
|
@ -52,12 +53,14 @@ Default configuration:
|
|||
The available options are:
|
||||
|
||||
- **auto**: Fetch lyrics automatically during import.
|
||||
- **bing_client_secret**: Your Bing Translation application password
|
||||
(see :ref:`lyrics-translation`)
|
||||
- **bing_lang_from**: By default all lyrics with a language other than
|
||||
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
|
||||
of source languages to translate.
|
||||
- **bing_lang_to**: Language to translate lyrics into.
|
||||
- **translate**:
|
||||
|
||||
- **api_key**: API key to access your Azure Translator resource. (see
|
||||
:ref:`lyrics-translation`)
|
||||
- **from_languages**: By default all lyrics with a language other than
|
||||
``to_language`` are translated. Use a list of language codes to restrict
|
||||
them.
|
||||
- **to_language**: Language code to translate lyrics to.
|
||||
- **dist_thresh**: The maximum distance between the artist and title
|
||||
combination of the music file and lyrics candidate to consider them a match.
|
||||
Lower values will make the plugin more strict, higher values will make it
|
||||
|
|
@ -165,10 +168,28 @@ After that, the lyrics plugin will fall back on other declared data sources.
|
|||
Activate On-the-Fly Translation
|
||||
-------------------------------
|
||||
|
||||
You need to register for a Microsoft Azure Marketplace free account and
|
||||
to the `Microsoft Translator API`_. Follow the four steps process, specifically
|
||||
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
|
||||
*Client secret* into your ``bing_client_secret`` configuration, alongside
|
||||
``bing_lang_to`` target ``language code``.
|
||||
We use Azure to optionally translate your lyrics. To set up the integration,
|
||||
follow these steps:
|
||||
|
||||
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
|
||||
1. `Create a Translator resource`_ on Azure.
|
||||
2. `Obtain its API key`_.
|
||||
3. Add the API key to your configuration as ``translate.api_key``.
|
||||
4. Configure your target language using the ``translate.to_language`` option.
|
||||
|
||||
|
||||
For example, with the following configuration:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
lyrics:
|
||||
translate:
|
||||
api_key: YOUR_TRANSLATOR_API_KEY
|
||||
to_language: de
|
||||
|
||||
You should expect lyrics like this::
|
||||
|
||||
Original verse / Ursprünglicher Vers
|
||||
Some other verse / Ein anderer Vers
|
||||
|
||||
.. _create a Translator resource: https://learn.microsoft.com/en-us/azure/ai-services/translator/create-translator-resource
|
||||
.. _obtain its API key: https://learn.microsoft.com/en-us/python/api/overview/azure/ai-translation-text-readme?view=azure-python&preserve-view=true#get-an-api-key
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
import importlib.util
|
||||
import os
|
||||
import re
|
||||
import textwrap
|
||||
from functools import partial
|
||||
from http import HTTPStatus
|
||||
|
||||
|
|
@ -509,3 +510,87 @@ class TestLRCLibLyrics(LyricsBackendTest):
|
|||
lyrics, _ = fetch_lyrics()
|
||||
|
||||
assert lyrics == expected_lyrics
|
||||
|
||||
|
||||
class TestTranslation:
    """Exercise ``lyrics.Translator.translate`` against a mocked endpoint."""

    @pytest.fixture(autouse=True)
    def _patch_bing(self, requests_mock):
        """Register a canned JSON reply for POSTs to ``TRANSLATE_URL``."""

        def callback(request, _):
            # The reply is chosen from the request body. Every reply is one
            # "|"-separated string whose first segment is empty.
            if b"Refrain" in request.body:
                translations = (
                    ""
                    "|[Refrain : Doja Cat]"
                    "|Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir)"  # noqa: E501
                    "|Mon corps ne me laissait pas le cacher (Cachez-le)"
                    "|Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas)"  # noqa: E501
                    "|Chevauchant à travers le tonnerre, la foudre"
                )
            elif b"00:00.00" in request.body:
                # Reply used when the request body still contains a timestamp.
                translations = (
                    ""
                    "|[00:00.00] Quelques paroles synchronisées"
                    "|[00:01.00] Quelques paroles plus synchronisées"
                )
            else:
                translations = (
                    ""
                    "|Quelques paroles synchronisées"
                    "|Quelques paroles plus synchronisées"
                )

            return [
                {
                    "detectedLanguage": {"language": "en", "score": 1.0},
                    "translations": [{"text": translations, "to": "fr"}],
                }
            ]

        requests_mock.post(lyrics.Translator.TRANSLATE_URL, json=callback)

    # Both lyrics literals are passed through ``textwrap.dedent`` in the test
    # body, so their common leading indentation is not part of the comparison.
    @pytest.mark.parametrize(
        "initial_lyrics, expected",
        [
            pytest.param(
                """
                [Refrain: Doja Cat]
                Hard for me to let you go (Let you go, let you go)
                My body wouldn't let me hide it (Hide it)
                No matter what, I wouldn't fold (Wouldn't fold, wouldn't fold)
                Ridin' through the thunder, lightnin'""",
                """
                [Refrain: Doja Cat] / [Refrain : Doja Cat]
                Hard for me to let you go (Let you go, let you go) / Difficile pour moi de te laisser partir (Te laisser partir, te laisser partir)
                My body wouldn't let me hide it (Hide it) / Mon corps ne me laissait pas le cacher (Cachez-le)
                No matter what, I wouldn't fold (Wouldn't fold, wouldn't fold) / Quoi qu’il arrive, je ne plierais pas (Ne plierait pas, ne plierais pas)
                Ridin' through the thunder, lightnin' / Chevauchant à travers le tonnerre, la foudre""",  # noqa: E501
                id="plain",
            ),
            pytest.param(
                """
                [00:00.00] Some synced lyrics
                [00:00:50]
                [00:01.00] Some more synced lyrics

                Source: https://lrclib.net/api/123""",
                """
                [00:00.00] Some synced lyrics / Quelques paroles synchronisées
                [00:00:50]
                [00:01.00] Some more synced lyrics / Quelques paroles plus synchronisées

                Source: https://lrclib.net/api/123""",  # noqa: E501
                id="synced",
            ),
            pytest.param(
                "Quelques paroles",
                "Quelques paroles",
                id="already in the target language",
            ),
        ],
    )
    def test_translate(self, initial_lyrics, expected):
        """Translating the dedented input must yield the dedented expectation."""
        plugin = lyrics.LyricsPlugin()
        # Target language "FR", restricted to lyrics detected as "EN".
        bing = lyrics.Translator(plugin._log, "123", "FR", ["EN"])

        assert bing.translate(
            textwrap.dedent(initial_lyrics)
        ) == textwrap.dedent(expected)
|
||||
|
|
|
|||
Loading…
Reference in a new issue