Unite Genius, Tekstowo and Google backends under the same interface

This commit is contained in:
Šarūnas Nejus 2024-10-13 17:04:58 +01:00
parent 745c5eb9f0
commit 12c5eaae5e
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
2 changed files with 101 additions and 103 deletions

View file

@ -27,7 +27,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering from functools import cached_property, partial, total_ordering
from html import unescape from html import unescape
from http import HTTPStatus from http import HTTPStatus
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse from urllib.parse import quote, urlencode, urlparse
import requests import requests
@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask from beets.importer import ImportTask
from beets.library import Item from beets.library import Item
from ._typing import GeniusAPI, LRCLibAPI from ._typing import GeniusAPI, JSONDict, LRCLibAPI
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -57,8 +57,6 @@ try:
except ImportError: except ImportError:
HAS_LANGDETECT = False HAS_LANGDETECT = False
JSONDict = dict[str, Any]
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I) BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}" USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]" INSTRUMENTAL_LYRICS = "[Instrumental]"
@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend):
return lyrics return lyrics
class SearchResult(NamedTuple):
artist: str
title: str
url: str
class SearchBackend(Backend): class SearchBackend(Backend):
REQUIRES_BS = True REQUIRES_BS = True
@ -450,12 +454,12 @@ class SearchBackend(Backend):
return self.config["dist_thresh"].get(float) return self.config["dist_thresh"].get(float)
def check_match( def check_match(
self, target_artist: str, target_title: str, artist: str, title: str self, target_artist: str, target_title: str, result: SearchResult
) -> bool: ) -> bool:
"""Check if the given artist and title are 'good enough' match.""" """Check if the given search result is a 'good enough' match."""
max_dist = max( max_dist = max(
string_dist(target_artist, artist), string_dist(target_artist, result.artist),
string_dist(target_title, title), string_dist(target_title, result.title),
) )
if (max_dist := round(max_dist, 2)) <= self.dist_thresh: if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@ -466,8 +470,8 @@ class SearchBackend(Backend):
# This may show a matching candidate with some noise in the name # This may show a matching candidate with some noise in the name
self.debug( self.debug(
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}", "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
artist, result.artist,
title, result.title,
target_artist, target_artist,
target_title, target_title,
max_dist, max_dist,
@ -475,61 +479,59 @@ class SearchBackend(Backend):
return False return False
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
"""Search for the given query and yield search results."""
raise NotImplementedError
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
check_match = partial(self.check_match, artist, title)
for candidate in self.search(artist, title):
if check_match(candidate):
yield candidate
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title):
if lyrics := self.scrape(self.fetch_text(result.url)):
return lyrics
return None
@classmethod
def scrape(cls, html: str) -> str | None:
"""Scrape the lyrics from the given HTML."""
raise NotImplementedError
class Genius(SearchBackend): class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api. """Fetch lyrics from Genius via genius-api.
Simply adapted from Because genius doesn't allow accessing lyrics via the api, we first query
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ the api for a url matching our artist & title, then scrape the HTML text
for the JSON data containing the lyrics.
""" """
SEARCH_URL = "https://api.genius.com/search"
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")') LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "") remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
base_url = "https://api.genius.com"
search_url = f"{base_url}/search"
@cached_property @cached_property
def headers(self) -> dict[str, str]: def headers(self) -> dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'} return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None: def search(self, artist: str, title: str) -> Iterable[SearchResult]:
"""Fetch lyrics from genius.com search_data: GeniusAPI.Search = self.fetch_json(
self.SEARCH_URL,
Because genius doesn't allow accessing lyrics via the api, params={"q": f"{artist} {title}"},
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
data = self.fetch_json(
self.search_url,
params={"q": f"{artist} {title}".lower()},
headers=self.headers, headers=self.headers,
) )
if (url := self.find_lyrics_url(data, artist, title)) and ( for r in (hit["result"] for hit in search_data["response"]["hits"]):
lyrics := self.scrape_lyrics(self.fetch_text(url)) yield SearchResult(r["artist_names"], r["title"], r["url"])
):
return collapse_newlines(lyrics)
return None @classmethod
def scrape(cls, html: str) -> str | None:
def find_lyrics_url( if m := cls.LYRICS_IN_JSON_RE.search(html):
self, data: GeniusAPI.Search, artist: str, title: str html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
) -> str | None:
"""Find URL to the lyrics of the given artist and title.
https://docs.genius.com/#search-h2.
"""
check = partial(self.check_match, artist, title)
for result in (hit["result"] for hit in data["response"]["hits"]):
if check(result["artist_names"], result["title"]):
return result["url"]
return None
def scrape_lyrics(self, html: str) -> str | None:
if m := self.LYRICS_IN_JSON_RE.search(html):
html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip() return get_soup(html_text).get_text().strip()
return None return None
@ -551,13 +553,12 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL. # We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs. # Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError): with suppress(NotFoundError):
return self.scrape_lyrics( return self.scrape(self.fetch_text(self.build_url(artist, title)))
self.fetch_text(self.build_url(artist, title))
)
return None return None
def scrape_lyrics(self, html: str) -> str | None: @classmethod
def scrape(cls, html: str) -> str | None:
soup = get_soup(html) soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"): if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@ -616,16 +617,6 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1" SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
@staticmethod
def scrape_lyrics(html: str) -> str | None:
soup = get_soup(html)
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
if strings:
return strings[0]
return None
def is_lyrics(self, text, artist=None): def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics.""" """Determine whether the text seems to be valid lyrics."""
if not text: if not text:
@ -658,17 +649,11 @@ class Google(SearchBackend):
BY_TRANS = ["by", "par", "de", "von"] BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
def is_page_candidate( def make_search_result(
self, artist: str, title: str, url_link: str, url_title: str self, artist: str, url_link: str, url_title: str
) -> bool: ) -> SearchResult:
"""Return True if the URL title makes it a good candidate to be a """Parse artist and title from the URL title and return a search result."""
page that contains lyrics of title by artist.
"""
title_slug = slug(title)
url_title_slug = slug(url_title) url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
artist = slug(artist) artist = slug(artist)
sitename = urlparse(url_link).netloc sitename = urlparse(url_link).netloc
@ -683,33 +668,45 @@ class Google(SearchBackend):
"(%s)" % "|".join(tokens), "", url_title_slug "(%s)" % "|".join(tokens), "", url_title_slug
).strip("-") ).strip("-")
return self.check_match(artist, title_slug, artist, song_title) return SearchResult(artist, song_title, url_link)
def fetch(self, artist: str, title: str, *_) -> str | None: def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = { params = {
"key": self.config["google_API_key"].as_str(), "key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(), "cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}", "q": f"{artist} {title}",
} }
check_candidate = partial(self.is_page_candidate, artist, title) data = self.fetch_json(self.SEARCH_URL, params=params)
for item in self.fetch_json(self.SEARCH_URL, params=params).get( for item in data.get("items", []):
"items", [] yield self.make_search_result(artist, item["link"], item["title"])
):
url_link = item["link"] def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
if not check_candidate(url_link, item.get("title", "")): return super().get_results(artist, slug(title))
continue
def fetch(self, artist: str, title: str, *_) -> str | None:
for result in self.get_results(artist, title):
with self.handle_request(): with self.handle_request():
lyrics = self.scrape_lyrics(self.fetch_text(url_link)) lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics: if not lyrics:
continue continue
if self.is_lyrics(lyrics, artist): if self.is_lyrics(lyrics, artist):
self.debug("Got lyrics from {}", item["displayLink"]) self.debug(
"Got lyrics from {}", urlparse(result.url).netloc
)
return lyrics return lyrics
return None return None
@classmethod
def scrape(cls, html: str) -> str | None:
# Get the longest text element (if any).
if strings := sorted(get_soup(html).stripped_strings, key=len):
return strings[-1]
return None
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"] SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]

View file

@ -191,9 +191,9 @@ class TestSearchBackend:
], ],
) )
def test_check_match(self, backend, target_artist, artist, should_match): def test_check_match(self, backend, target_artist, artist, should_match):
assert ( result = lyrics.SearchResult(artist, "", "")
backend.check_match(target_artist, "", artist, "") == should_match
) assert backend.check_match(target_artist, "", result) == should_match
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html): def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped""" """Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape_lyrics(lyrics_html).lower() result = backend.scrape(lyrics_html).lower()
assert result assert result
assert backend.is_lyrics(result) assert backend.is_lyrics(result)
assert PHRASE_BY_TITLE[self.TITLE] in result assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize( @pytest.mark.parametrize(
"url_title, artist, should_be_candidate", "url_title, artist, expected_title",
[ [
("John Doe - beets song Lyrics", "John Doe", True), ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
("example.com | Beats song by John doe", "John Doe", True), ("example.com | Beats song by John doe", "John Doe", "beats-song"),
("example.com | seets bong lyrics by John doe", "John Doe", False), (
("foo", "Sun O)))", False), "example.com | seets bong lyrics by John doe",
"John Doe",
"seets-bong",
),
("foo", "Sun O)))", "foo"),
], ],
) )
def test_is_page_candidate( def test_make_search_result(
self, backend, lyrics_html, url_title, artist, should_be_candidate self, backend, url_title, artist, expected_title
): ):
result = backend.is_page_candidate( result = backend.make_search_result(
artist, artist, "https://example.com", url_title
self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
) )
assert bool(result) == should_be_candidate assert result.title == expected_title
@pytest.mark.parametrize( @pytest.mark.parametrize(
"lyrics", "lyrics",
@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest):
], ],
) # fmt: skip ) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count): def test_scrape(self, backend, lyrics_html, expected_line_count):
result = backend.scrape_lyrics(lyrics_html) or "" result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count assert len(result.splitlines()) == expected_line_count
@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
], ],
) )
def test_scrape(self, backend, lyrics_html, expecting_lyrics): def test_scrape(self, backend, lyrics_html, expecting_lyrics):
assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950 LYRICS_DURATION = 950