mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 16:42:42 +01:00
Unite Genius, Tekstowo and Google backends under the same interface
This commit is contained in:
parent
745c5eb9f0
commit
12c5eaae5e
2 changed files with 101 additions and 103 deletions
|
|
@ -27,7 +27,7 @@ from dataclasses import dataclass
|
||||||
from functools import cached_property, partial, total_ordering
|
from functools import cached_property, partial, total_ordering
|
||||||
from html import unescape
|
from html import unescape
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
|
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
|
||||||
from urllib.parse import quote, urlencode, urlparse
|
from urllib.parse import quote, urlencode, urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
@ -41,7 +41,7 @@ if TYPE_CHECKING:
|
||||||
from beets.importer import ImportTask
|
from beets.importer import ImportTask
|
||||||
from beets.library import Item
|
from beets.library import Item
|
||||||
|
|
||||||
from ._typing import GeniusAPI, LRCLibAPI
|
from ._typing import GeniusAPI, JSONDict, LRCLibAPI
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
@ -57,8 +57,6 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
HAS_LANGDETECT = False
|
HAS_LANGDETECT = False
|
||||||
|
|
||||||
JSONDict = dict[str, Any]
|
|
||||||
|
|
||||||
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
||||||
USER_AGENT = f"beets/{beets.__version__}"
|
USER_AGENT = f"beets/{beets.__version__}"
|
||||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||||
|
|
@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend):
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
|
|
||||||
|
class SearchResult(NamedTuple):
|
||||||
|
artist: str
|
||||||
|
title: str
|
||||||
|
url: str
|
||||||
|
|
||||||
|
|
||||||
class SearchBackend(Backend):
|
class SearchBackend(Backend):
|
||||||
REQUIRES_BS = True
|
REQUIRES_BS = True
|
||||||
|
|
||||||
|
|
@ -450,12 +454,12 @@ class SearchBackend(Backend):
|
||||||
return self.config["dist_thresh"].get(float)
|
return self.config["dist_thresh"].get(float)
|
||||||
|
|
||||||
def check_match(
|
def check_match(
|
||||||
self, target_artist: str, target_title: str, artist: str, title: str
|
self, target_artist: str, target_title: str, result: SearchResult
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Check if the given artist and title are 'good enough' match."""
|
"""Check if the given search result is a 'good enough' match."""
|
||||||
max_dist = max(
|
max_dist = max(
|
||||||
string_dist(target_artist, artist),
|
string_dist(target_artist, result.artist),
|
||||||
string_dist(target_title, title),
|
string_dist(target_title, result.title),
|
||||||
)
|
)
|
||||||
|
|
||||||
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
|
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
|
||||||
|
|
@ -466,8 +470,8 @@ class SearchBackend(Backend):
|
||||||
# This may show a matching candidate with some noise in the name
|
# This may show a matching candidate with some noise in the name
|
||||||
self.debug(
|
self.debug(
|
||||||
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
|
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
|
||||||
artist,
|
result.artist,
|
||||||
title,
|
result.title,
|
||||||
target_artist,
|
target_artist,
|
||||||
target_title,
|
target_title,
|
||||||
max_dist,
|
max_dist,
|
||||||
|
|
@ -475,61 +479,59 @@ class SearchBackend(Backend):
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||||
|
"""Search for the given query and yield search results."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||||
|
check_match = partial(self.check_match, artist, title)
|
||||||
|
for candidate in self.search(artist, title):
|
||||||
|
if check_match(candidate):
|
||||||
|
yield candidate
|
||||||
|
|
||||||
|
def fetch(self, artist: str, title: str, *_) -> str | None:
|
||||||
|
"""Fetch lyrics for the given artist and title."""
|
||||||
|
for result in self.get_results(artist, title):
|
||||||
|
if lyrics := self.scrape(self.fetch_text(result.url)):
|
||||||
|
return lyrics
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def scrape(cls, html: str) -> str | None:
|
||||||
|
"""Scrape the lyrics from the given HTML."""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class Genius(SearchBackend):
|
class Genius(SearchBackend):
|
||||||
"""Fetch lyrics from Genius via genius-api.
|
"""Fetch lyrics from Genius via genius-api.
|
||||||
|
|
||||||
Simply adapted from
|
Because genius doesn't allow accessing lyrics via the api, we first query
|
||||||
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
|
the api for a url matching our artist & title, then scrape the HTML text
|
||||||
|
for the JSON data containing the lyrics.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
SEARCH_URL = "https://api.genius.com/search"
|
||||||
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
|
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
|
||||||
remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
|
remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
|
||||||
|
|
||||||
base_url = "https://api.genius.com"
|
|
||||||
search_url = f"{base_url}/search"
|
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def headers(self) -> dict[str, str]:
|
def headers(self) -> dict[str, str]:
|
||||||
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
|
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
|
||||||
|
|
||||||
def fetch(self, artist: str, title: str, *_) -> str | None:
|
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||||
"""Fetch lyrics from genius.com
|
search_data: GeniusAPI.Search = self.fetch_json(
|
||||||
|
self.SEARCH_URL,
|
||||||
Because genius doesn't allow accessing lyrics via the api,
|
params={"q": f"{artist} {title}"},
|
||||||
we first query the api for a url matching our artist & title,
|
|
||||||
then attempt to scrape that url for the lyrics.
|
|
||||||
"""
|
|
||||||
|
|
||||||
data = self.fetch_json(
|
|
||||||
self.search_url,
|
|
||||||
params={"q": f"{artist} {title}".lower()},
|
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
)
|
)
|
||||||
if (url := self.find_lyrics_url(data, artist, title)) and (
|
for r in (hit["result"] for hit in search_data["response"]["hits"]):
|
||||||
lyrics := self.scrape_lyrics(self.fetch_text(url))
|
yield SearchResult(r["artist_names"], r["title"], r["url"])
|
||||||
):
|
|
||||||
return collapse_newlines(lyrics)
|
|
||||||
|
|
||||||
return None
|
@classmethod
|
||||||
|
def scrape(cls, html: str) -> str | None:
|
||||||
def find_lyrics_url(
|
if m := cls.LYRICS_IN_JSON_RE.search(html):
|
||||||
self, data: GeniusAPI.Search, artist: str, title: str
|
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
|
||||||
) -> str | None:
|
|
||||||
"""Find URL to the lyrics of the given artist and title.
|
|
||||||
|
|
||||||
https://docs.genius.com/#search-h2.
|
|
||||||
"""
|
|
||||||
check = partial(self.check_match, artist, title)
|
|
||||||
for result in (hit["result"] for hit in data["response"]["hits"]):
|
|
||||||
if check(result["artist_names"], result["title"]):
|
|
||||||
return result["url"]
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def scrape_lyrics(self, html: str) -> str | None:
|
|
||||||
if m := self.LYRICS_IN_JSON_RE.search(html):
|
|
||||||
html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
|
|
||||||
return get_soup(html_text).get_text().strip()
|
return get_soup(html_text).get_text().strip()
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
@ -551,13 +553,12 @@ class Tekstowo(DirectBackend):
|
||||||
# We are expecting to receive a 404 since we are guessing the URL.
|
# We are expecting to receive a 404 since we are guessing the URL.
|
||||||
# Thus suppress the error so that it does not end up in the logs.
|
# Thus suppress the error so that it does not end up in the logs.
|
||||||
with suppress(NotFoundError):
|
with suppress(NotFoundError):
|
||||||
return self.scrape_lyrics(
|
return self.scrape(self.fetch_text(self.build_url(artist, title)))
|
||||||
self.fetch_text(self.build_url(artist, title))
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def scrape_lyrics(self, html: str) -> str | None:
|
@classmethod
|
||||||
|
def scrape(cls, html: str) -> str | None:
|
||||||
soup = get_soup(html)
|
soup = get_soup(html)
|
||||||
|
|
||||||
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
||||||
|
|
@ -616,16 +617,6 @@ class Google(SearchBackend):
|
||||||
|
|
||||||
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
|
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def scrape_lyrics(html: str) -> str | None:
|
|
||||||
soup = get_soup(html)
|
|
||||||
|
|
||||||
# Get the longest text element (if any).
|
|
||||||
strings = sorted(soup.stripped_strings, key=len, reverse=True)
|
|
||||||
if strings:
|
|
||||||
return strings[0]
|
|
||||||
return None
|
|
||||||
|
|
||||||
def is_lyrics(self, text, artist=None):
|
def is_lyrics(self, text, artist=None):
|
||||||
"""Determine whether the text seems to be valid lyrics."""
|
"""Determine whether the text seems to be valid lyrics."""
|
||||||
if not text:
|
if not text:
|
||||||
|
|
@ -658,17 +649,11 @@ class Google(SearchBackend):
|
||||||
BY_TRANS = ["by", "par", "de", "von"]
|
BY_TRANS = ["by", "par", "de", "von"]
|
||||||
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
|
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
|
||||||
|
|
||||||
def is_page_candidate(
|
def make_search_result(
|
||||||
self, artist: str, title: str, url_link: str, url_title: str
|
self, artist: str, url_link: str, url_title: str
|
||||||
) -> bool:
|
) -> SearchResult:
|
||||||
"""Return True if the URL title makes it a good candidate to be a
|
"""Parse artist and title from the URL title and return a search result."""
|
||||||
page that contains lyrics of title by artist.
|
|
||||||
"""
|
|
||||||
title_slug = slug(title)
|
|
||||||
url_title_slug = slug(url_title)
|
url_title_slug = slug(url_title)
|
||||||
if title_slug in url_title_slug:
|
|
||||||
return True
|
|
||||||
|
|
||||||
artist = slug(artist)
|
artist = slug(artist)
|
||||||
sitename = urlparse(url_link).netloc
|
sitename = urlparse(url_link).netloc
|
||||||
|
|
||||||
|
|
@ -683,33 +668,45 @@ class Google(SearchBackend):
|
||||||
"(%s)" % "|".join(tokens), "", url_title_slug
|
"(%s)" % "|".join(tokens), "", url_title_slug
|
||||||
).strip("-")
|
).strip("-")
|
||||||
|
|
||||||
return self.check_match(artist, title_slug, artist, song_title)
|
return SearchResult(artist, song_title, url_link)
|
||||||
|
|
||||||
def fetch(self, artist: str, title: str, *_) -> str | None:
|
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||||
params = {
|
params = {
|
||||||
"key": self.config["google_API_key"].as_str(),
|
"key": self.config["google_API_key"].as_str(),
|
||||||
"cx": self.config["google_engine_ID"].as_str(),
|
"cx": self.config["google_engine_ID"].as_str(),
|
||||||
"q": f"{artist} {title}",
|
"q": f"{artist} {title}",
|
||||||
}
|
}
|
||||||
|
|
||||||
check_candidate = partial(self.is_page_candidate, artist, title)
|
data = self.fetch_json(self.SEARCH_URL, params=params)
|
||||||
for item in self.fetch_json(self.SEARCH_URL, params=params).get(
|
for item in data.get("items", []):
|
||||||
"items", []
|
yield self.make_search_result(artist, item["link"], item["title"])
|
||||||
):
|
|
||||||
url_link = item["link"]
|
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||||
if not check_candidate(url_link, item.get("title", "")):
|
return super().get_results(artist, slug(title))
|
||||||
continue
|
|
||||||
|
def fetch(self, artist: str, title: str, *_) -> str | None:
|
||||||
|
for result in self.get_results(artist, title):
|
||||||
with self.handle_request():
|
with self.handle_request():
|
||||||
lyrics = self.scrape_lyrics(self.fetch_text(url_link))
|
lyrics = self.scrape(self.fetch_text(result.url))
|
||||||
if not lyrics:
|
if not lyrics:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if self.is_lyrics(lyrics, artist):
|
if self.is_lyrics(lyrics, artist):
|
||||||
self.debug("Got lyrics from {}", item["displayLink"])
|
self.debug(
|
||||||
|
"Got lyrics from {}", urlparse(result.url).netloc
|
||||||
|
)
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def scrape(cls, html: str) -> str | None:
|
||||||
|
# Get the longest text element (if any).
|
||||||
|
if strings := sorted(get_soup(html).stripped_strings, key=len):
|
||||||
|
return strings[-1]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
||||||
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
|
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
|
||||||
|
|
|
||||||
|
|
@ -191,9 +191,9 @@ class TestSearchBackend:
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_check_match(self, backend, target_artist, artist, should_match):
|
def test_check_match(self, backend, target_artist, artist, should_match):
|
||||||
assert (
|
result = lyrics.SearchResult(artist, "", "")
|
||||||
backend.check_match(target_artist, "", artist, "") == should_match
|
|
||||||
)
|
assert backend.check_match(target_artist, "", result) == should_match
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
|
|
@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest):
|
||||||
|
|
||||||
def test_mocked_source_ok(self, backend, lyrics_html):
|
def test_mocked_source_ok(self, backend, lyrics_html):
|
||||||
"""Test that lyrics of the mocked page are correctly scraped"""
|
"""Test that lyrics of the mocked page are correctly scraped"""
|
||||||
result = backend.scrape_lyrics(lyrics_html).lower()
|
result = backend.scrape(lyrics_html).lower()
|
||||||
|
|
||||||
assert result
|
assert result
|
||||||
assert backend.is_lyrics(result)
|
assert backend.is_lyrics(result)
|
||||||
assert PHRASE_BY_TITLE[self.TITLE] in result
|
assert PHRASE_BY_TITLE[self.TITLE] in result
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"url_title, artist, should_be_candidate",
|
"url_title, artist, expected_title",
|
||||||
[
|
[
|
||||||
("John Doe - beets song Lyrics", "John Doe", True),
|
("John Doe - beets song Lyrics", "John Doe", "beets-song"),
|
||||||
("example.com | Beats song by John doe", "John Doe", True),
|
("example.com | Beats song by John doe", "John Doe", "beats-song"),
|
||||||
("example.com | seets bong lyrics by John doe", "John Doe", False),
|
(
|
||||||
("foo", "Sun O)))", False),
|
"example.com | seets bong lyrics by John doe",
|
||||||
|
"John Doe",
|
||||||
|
"seets-bong",
|
||||||
|
),
|
||||||
|
("foo", "Sun O)))", "foo"),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_is_page_candidate(
|
def test_make_search_result(
|
||||||
self, backend, lyrics_html, url_title, artist, should_be_candidate
|
self, backend, url_title, artist, expected_title
|
||||||
):
|
):
|
||||||
result = backend.is_page_candidate(
|
result = backend.make_search_result(
|
||||||
artist,
|
artist, "https://example.com", url_title
|
||||||
self.TITLE,
|
|
||||||
"http://www.example.com/lyrics/beetssong",
|
|
||||||
url_title,
|
|
||||||
)
|
)
|
||||||
assert bool(result) == should_be_candidate
|
assert result.title == expected_title
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"lyrics",
|
"lyrics",
|
||||||
|
|
@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest):
|
||||||
],
|
],
|
||||||
) # fmt: skip
|
) # fmt: skip
|
||||||
def test_scrape(self, backend, lyrics_html, expected_line_count):
|
def test_scrape(self, backend, lyrics_html, expected_line_count):
|
||||||
result = backend.scrape_lyrics(lyrics_html) or ""
|
result = backend.scrape(lyrics_html) or ""
|
||||||
|
|
||||||
assert len(result.splitlines()) == expected_line_count
|
assert len(result.splitlines()) == expected_line_count
|
||||||
|
|
||||||
|
|
@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
|
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
|
||||||
assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
|
assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
|
||||||
|
|
||||||
|
|
||||||
LYRICS_DURATION = 950
|
LYRICS_DURATION = 950
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue