From 12c5eaae5e144dcd88e32a0af23701aa346e3d00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sun, 13 Oct 2024 17:04:58 +0100 Subject: [PATCH] Unite Genius, Tekstowo and Google backends under the same interface --- beetsplug/lyrics.py | 165 ++++++++++++++++++------------------ test/plugins/test_lyrics.py | 39 ++++----- 2 files changed, 101 insertions(+), 103 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 12b2d1f37..9424a14b6 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -27,7 +27,7 @@ from dataclasses import dataclass from functools import cached_property, partial, total_ordering from html import unescape from http import HTTPStatus -from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator +from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple from urllib.parse import quote, urlencode, urlparse import requests @@ -41,7 +41,7 @@ if TYPE_CHECKING: from beets.importer import ImportTask from beets.library import Item - from ._typing import GeniusAPI, LRCLibAPI + from ._typing import GeniusAPI, JSONDict, LRCLibAPI try: from bs4 import BeautifulSoup @@ -57,8 +57,6 @@ try: except ImportError: HAS_LANGDETECT = False -JSONDict = dict[str, Any] - BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend): return lyrics +class SearchResult(NamedTuple): + artist: str + title: str + url: str + + class SearchBackend(Backend): REQUIRES_BS = True @@ -450,12 +454,12 @@ class SearchBackend(Backend): return self.config["dist_thresh"].get(float) def check_match( - self, target_artist: str, target_title: str, artist: str, title: str + self, target_artist: str, target_title: str, result: SearchResult ) -> bool: - """Check if the given artist and title are 'good enough' match.""" + """Check if the given search result is a 'good enough' match.""" max_dist = max(
string_dist(target_artist, artist), - string_dist(target_title, title), + string_dist(target_artist, result.artist), + string_dist(target_title, result.title), ) if (max_dist := round(max_dist, 2)) <= self.dist_thresh: @@ -466,8 +470,8 @@ class SearchBackend(Backend): # This may show a matching candidate with some noise in the name self.debug( "({}, {}) does not match ({}, {}) but dist was close: {:.2f}", - artist, - title, + result.artist, + result.title, target_artist, target_title, max_dist, @@ -475,61 +479,59 @@ class SearchBackend(Backend): return False + def search(self, artist: str, title: str) -> Iterable[SearchResult]: + """Search for the given query and yield search results.""" + raise NotImplementedError + + def get_results(self, artist: str, title: str) -> Iterable[SearchResult]: + check_match = partial(self.check_match, artist, title) + for candidate in self.search(artist, title): + if check_match(candidate): + yield candidate + + def fetch(self, artist: str, title: str, *_) -> str | None: + """Fetch lyrics for the given artist and title.""" + for result in self.get_results(artist, title): + if lyrics := self.scrape(self.fetch_text(result.url)): + return lyrics + + return None + + @classmethod + def scrape(cls, html: str) -> str | None: + """Scrape the lyrics from the given HTML.""" + raise NotImplementedError + class Genius(SearchBackend): """Fetch lyrics from Genius via genius-api. - Simply adapted from - bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ + Because genius doesn't allow accessing lyrics via the api, we first query + the api for a url matching our artist & title, then scrape the HTML text + for the JSON data containing the lyrics. """ + SEARCH_URL = "https://api.genius.com/search" LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? 
dict[str, str]: return {"Authorization": f'Bearer {self.config["genius_api_key"]}'} - def fetch(self, artist: str, title: str, *_) -> str | None: - """Fetch lyrics from genius.com - - Because genius doesn't allow accessing lyrics via the api, - we first query the api for a url matching our artist & title, - then attempt to scrape that url for the lyrics. - """ - - data = self.fetch_json( - self.search_url, - params={"q": f"{artist} {title}".lower()}, + def search(self, artist: str, title: str) -> Iterable[SearchResult]: + search_data: GeniusAPI.Search = self.fetch_json( + self.SEARCH_URL, + params={"q": f"{artist} {title}"}, headers=self.headers, ) - if (url := self.find_lyrics_url(data, artist, title)) and ( - lyrics := self.scrape_lyrics(self.fetch_text(url)) - ): - return collapse_newlines(lyrics) + for r in (hit["result"] for hit in search_data["response"]["hits"]): + yield SearchResult(r["artist_names"], r["title"], r["url"]) - return None - - def find_lyrics_url( - self, data: GeniusAPI.Search, artist: str, title: str - ) -> str | None: - """Find URL to the lyrics of the given artist and title. - - https://docs.genius.com/#search-h2. - """ - check = partial(self.check_match, artist, title) - for result in (hit["result"] for hit in data["response"]["hits"]): - if check(result["artist_names"], result["title"]): - return result["url"] - - return None - - def scrape_lyrics(self, html: str) -> str | None: - if m := self.LYRICS_IN_JSON_RE.search(html): - html_text = self.remove_backslash(m[0]).replace(r"\n", "\n") + @classmethod + def scrape(cls, html: str) -> str | None: + if m := cls.LYRICS_IN_JSON_RE.search(html): + html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") return get_soup(html_text).get_text().strip() return None @@ -551,13 +553,12 @@ class Tekstowo(DirectBackend): # We are expecting to receive a 404 since we are guessing the URL. # Thus suppress the error so that it does not end up in the logs. 
with suppress(NotFoundError): - return self.scrape_lyrics( - self.fetch_text(self.build_url(artist, title)) - ) + return self.scrape(self.fetch_text(self.build_url(artist, title))) return None - def scrape_lyrics(self, html: str) -> str | None: + @classmethod + def scrape(cls, html: str) -> str | None: soup = get_soup(html) if lyrics_div := soup.select_one("div.song-text > div.inner-text"): @@ -616,16 +617,6 @@ class Google(SearchBackend): SEARCH_URL = "https://www.googleapis.com/customsearch/v1" - @staticmethod - def scrape_lyrics(html: str) -> str | None: - soup = get_soup(html) - - # Get the longest text element (if any). - strings = sorted(soup.stripped_strings, key=len, reverse=True) - if strings: - return strings[0] - return None - def is_lyrics(self, text, artist=None): """Determine whether the text seems to be valid lyrics.""" if not text: @@ -658,17 +649,11 @@ class Google(SearchBackend): BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] - def is_page_candidate( - self, artist: str, title: str, url_link: str, url_title: str - ) -> bool: - """Return True if the URL title makes it a good candidate to be a - page that contains lyrics of title by artist. 
- """ - title_slug = slug(title) + def make_search_result( + self, artist: str, url_link: str, url_title: str + ) -> SearchResult: + """Parse artist and title from the URL title and return a search result.""" url_title_slug = slug(url_title) - if title_slug in url_title_slug: - return True - artist = slug(artist) sitename = urlparse(url_link).netloc @@ -683,33 +668,45 @@ class Google(SearchBackend): "(%s)" % "|".join(tokens), "", url_title_slug ).strip("-") - return self.check_match(artist, title_slug, artist, song_title) + return SearchResult(artist, song_title, url_link) - def fetch(self, artist: str, title: str, *_) -> str | None: + def search(self, artist: str, title: str) -> Iterable[SearchResult]: params = { "key": self.config["google_API_key"].as_str(), "cx": self.config["google_engine_ID"].as_str(), "q": f"{artist} {title}", } - check_candidate = partial(self.is_page_candidate, artist, title) - for item in self.fetch_json(self.SEARCH_URL, params=params).get( - "items", [] - ): - url_link = item["link"] - if not check_candidate(url_link, item.get("title", "")): - continue + data = self.fetch_json(self.SEARCH_URL, params=params) + for item in data.get("items", []): + yield self.make_search_result(artist, item["link"], item["title"]) + + def get_results(self, artist: str, title: str) -> Iterable[SearchResult]: + return super().get_results(artist, slug(title)) + + def fetch(self, artist: str, title: str, *_) -> str | None: + for result in self.get_results(artist, title): with self.handle_request(): - lyrics = self.scrape_lyrics(self.fetch_text(url_link)) + lyrics = self.scrape(self.fetch_text(result.url)) if not lyrics: continue if self.is_lyrics(lyrics, artist): - self.debug("Got lyrics from {}", item["displayLink"]) + self.debug( + "Got lyrics from {}", urlparse(result.url).netloc + ) return lyrics return None + @classmethod + def scrape(cls, html: str) -> str | None: + # Get the longest text element (if any). 
+ if strings := sorted(get_soup(html).stripped_strings, key=len): + return strings[-1] + + return None + class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"] diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 091776108..8afd80585 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -191,9 +191,9 @@ class TestSearchBackend: ], ) def test_check_match(self, backend, target_artist, artist, should_match): - assert ( - backend.check_match(target_artist, "", artist, "") == should_match - ) + result = lyrics.SearchResult(artist, "", "") + + assert backend.check_match(target_artist, "", result) == should_match @pytest.fixture(scope="module") @@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest): def test_mocked_source_ok(self, backend, lyrics_html): """Test that lyrics of the mocked page are correctly scraped""" - result = backend.scrape_lyrics(lyrics_html).lower() + result = backend.scrape(lyrics_html).lower() assert result assert backend.is_lyrics(result) assert PHRASE_BY_TITLE[self.TITLE] in result @pytest.mark.parametrize( - "url_title, artist, should_be_candidate", + "url_title, artist, expected_title", [ - ("John Doe - beets song Lyrics", "John Doe", True), - ("example.com | Beats song by John doe", "John Doe", True), - ("example.com | seets bong lyrics by John doe", "John Doe", False), - ("foo", "Sun O)))", False), + ("John Doe - beets song Lyrics", "John Doe", "beets-song"), + ("example.com | Beats song by John doe", "John Doe", "beats-song"), + ( + "example.com | seets bong lyrics by John doe", + "John Doe", + "seets-bong", + ), + ("foo", "Sun O)))", "foo"), ], ) - def test_is_page_candidate( - self, backend, lyrics_html, url_title, artist, should_be_candidate + def test_make_search_result( + self, backend, url_title, artist, expected_title ): - result = backend.is_page_candidate( - artist, - self.TITLE, - 
"http://www.example.com/lyrics/beetssong", - url_title, + result = backend.make_search_result( + artist, "https://example.com", url_title ) - assert bool(result) == should_be_candidate + assert result.title == expected_title @pytest.mark.parametrize( "lyrics", @@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest): ], ) # fmt: skip def test_scrape(self, backend, lyrics_html, expected_line_count): - result = backend.scrape_lyrics(lyrics_html) or "" + result = backend.scrape(lyrics_html) or "" assert len(result.splitlines()) == expected_line_count @@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest): ], ) def test_scrape(self, backend, lyrics_html, expecting_lyrics): - assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics + assert bool(backend.scrape(lyrics_html)) == expecting_lyrics LYRICS_DURATION = 950