From 12c5eaae5e144dcd88e32a0af23701aa346e3d00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= <snejus@protonmail.com>
Date: Sun, 13 Oct 2024 17:04:58 +0100
Subject: [PATCH] Unite Genius, Tekstowo and Google backends under the same
 interface

---
 beetsplug/lyrics.py         | 165 ++++++++++++++++++------------------
 test/plugins/test_lyrics.py |  39 ++++-----
 2 files changed, 101 insertions(+), 103 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 12b2d1f37..9424a14b6 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -27,7 +27,7 @@ from dataclasses import dataclass
 from functools import cached_property, partial, total_ordering
 from html import unescape
 from http import HTTPStatus
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
 from urllib.parse import quote, urlencode, urlparse
 
 import requests
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
     from beets.importer import ImportTask
     from beets.library import Item
 
-    from ._typing import GeniusAPI, LRCLibAPI
+    from ._typing import GeniusAPI, JSONDict, LRCLibAPI
 
 try:
     from bs4 import BeautifulSoup
@@ -57,8 +57,6 @@ try:
 except ImportError:
     HAS_LANGDETECT = False
 
-JSONDict = dict[str, Any]
-
 BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
 USER_AGENT = f"beets/{beets.__version__}"
 INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend):
         return lyrics
 
 
+class SearchResult(NamedTuple):
+    artist: str
+    title: str
+    url: str
+
+
 class SearchBackend(Backend):
     REQUIRES_BS = True
 
@@ -450,12 +454,12 @@ class SearchBackend(Backend):
         return self.config["dist_thresh"].get(float)
 
     def check_match(
-        self, target_artist: str, target_title: str, artist: str, title: str
+        self, target_artist: str, target_title: str, result: SearchResult
     ) -> bool:
-        """Check if the given artist and title are 'good enough' match."""
+        """Check if the given search result is a 'good enough' match."""
         max_dist = max(
-            string_dist(target_artist, artist),
-            string_dist(target_title, title),
+            string_dist(target_artist, result.artist),
+            string_dist(target_title, result.title),
         )
 
         if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@@ -466,8 +470,8 @@ class SearchBackend(Backend):
             # This may show a matching candidate with some noise in the name
             self.debug(
                 "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
-                artist,
-                title,
+                result.artist,
+                result.title,
                 target_artist,
                 target_title,
                 max_dist,
@@ -475,61 +479,59 @@ class SearchBackend(Backend):
 
         return False
 
+    def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+        """Search for the given query and yield search results."""
+        raise NotImplementedError
+
+    def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+        check_match = partial(self.check_match, artist, title)
+        for candidate in self.search(artist, title):
+            if check_match(candidate):
+                yield candidate
+
+    def fetch(self, artist: str, title: str, *_) -> str | None:
+        """Fetch lyrics for the given artist and title."""
+        for result in self.get_results(artist, title):
+            if lyrics := self.scrape(self.fetch_text(result.url)):
+                return lyrics
+
+        return None
+
+    @classmethod
+    def scrape(cls, html: str) -> str | None:
+        """Scrape the lyrics from the given HTML."""
+        raise NotImplementedError
+
 
 class Genius(SearchBackend):
     """Fetch lyrics from Genius via genius-api.
 
-    Simply adapted from
-    bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+    Because Genius doesn't allow accessing lyrics via the API, we first query
+    the API for a URL matching our artist & title, then scrape the HTML text
+    for the JSON data containing the lyrics.
     """
 
+    SEARCH_URL = "https://api.genius.com/search"
     LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
     remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
 
-    base_url = "https://api.genius.com"
-    search_url = f"{base_url}/search"
-
     @cached_property
     def headers(self) -> dict[str, str]:
         return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
 
-    def fetch(self, artist: str, title: str, *_) -> str | None:
-        """Fetch lyrics from genius.com
-
-        Because genius doesn't allow accessing lyrics via the api,
-        we first query the api for a url matching our artist & title,
-        then attempt to scrape that url for the lyrics.
-        """
-
-        data = self.fetch_json(
-            self.search_url,
-            params={"q": f"{artist} {title}".lower()},
+    def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+        search_data: GeniusAPI.Search = self.fetch_json(
+            self.SEARCH_URL,
+            params={"q": f"{artist} {title}"},
             headers=self.headers,
         )
-        if (url := self.find_lyrics_url(data, artist, title)) and (
-            lyrics := self.scrape_lyrics(self.fetch_text(url))
-        ):
-            return collapse_newlines(lyrics)
+        for r in (hit["result"] for hit in search_data["response"]["hits"]):
+            yield SearchResult(r["artist_names"], r["title"], r["url"])
 
-        return None
-
-    def find_lyrics_url(
-        self, data: GeniusAPI.Search, artist: str, title: str
-    ) -> str | None:
-        """Find URL to the lyrics of the given artist and title.
-
-        https://docs.genius.com/#search-h2.
-        """
-        check = partial(self.check_match, artist, title)
-        for result in (hit["result"] for hit in data["response"]["hits"]):
-            if check(result["artist_names"], result["title"]):
-                return result["url"]
-
-        return None
-
-    def scrape_lyrics(self, html: str) -> str | None:
-        if m := self.LYRICS_IN_JSON_RE.search(html):
-            html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+    @classmethod
+    def scrape(cls, html: str) -> str | None:
+        if m := cls.LYRICS_IN_JSON_RE.search(html):
+            html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
             return get_soup(html_text).get_text().strip()
 
         return None
@@ -551,13 +553,12 @@ class Tekstowo(DirectBackend):
         # We are expecting to receive a 404 since we are guessing the URL.
         # Thus suppress the error so that it does not end up in the logs.
         with suppress(NotFoundError):
-            return self.scrape_lyrics(
-                self.fetch_text(self.build_url(artist, title))
-            )
+            return self.scrape(self.fetch_text(self.build_url(artist, title)))
 
         return None
 
-    def scrape_lyrics(self, html: str) -> str | None:
+    @classmethod
+    def scrape(cls, html: str) -> str | None:
         soup = get_soup(html)
 
         if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@@ -616,16 +617,6 @@ class Google(SearchBackend):
 
     SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
 
-    @staticmethod
-    def scrape_lyrics(html: str) -> str | None:
-        soup = get_soup(html)
-
-        # Get the longest text element (if any).
-        strings = sorted(soup.stripped_strings, key=len, reverse=True)
-        if strings:
-            return strings[0]
-        return None
-
     def is_lyrics(self, text, artist=None):
         """Determine whether the text seems to be valid lyrics."""
         if not text:
@@ -658,17 +649,11 @@ class Google(SearchBackend):
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
-    def is_page_candidate(
-        self, artist: str, title: str, url_link: str, url_title: str
-    ) -> bool:
-        """Return True if the URL title makes it a good candidate to be a
-        page that contains lyrics of title by artist.
-        """
-        title_slug = slug(title)
+    def make_search_result(
+        self, artist: str, url_link: str, url_title: str
+    ) -> SearchResult:
+        """Return a search result whose title is extracted from the URL title."""
         url_title_slug = slug(url_title)
-        if title_slug in url_title_slug:
-            return True
-
         artist = slug(artist)
         sitename = urlparse(url_link).netloc
 
@@ -683,33 +668,45 @@ class Google(SearchBackend):
             "(%s)" % "|".join(tokens), "", url_title_slug
         ).strip("-")
 
-        return self.check_match(artist, title_slug, artist, song_title)
+        return SearchResult(artist, song_title, url_link)
 
-    def fetch(self, artist: str, title: str, *_) -> str | None:
+    def search(self, artist: str, title: str) -> Iterable[SearchResult]:
         params = {
             "key": self.config["google_API_key"].as_str(),
             "cx": self.config["google_engine_ID"].as_str(),
             "q": f"{artist} {title}",
         }
 
-        check_candidate = partial(self.is_page_candidate, artist, title)
-        for item in self.fetch_json(self.SEARCH_URL, params=params).get(
-            "items", []
-        ):
-            url_link = item["link"]
-            if not check_candidate(url_link, item.get("title", "")):
-                continue
+        data = self.fetch_json(self.SEARCH_URL, params=params)
+        for item in data.get("items", []):
+            yield self.make_search_result(artist, item["link"], item["title"])
+
+    def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+        return super().get_results(artist, slug(title))
+
+    def fetch(self, artist: str, title: str, *_) -> str | None:
+        for result in self.get_results(artist, title):
             with self.handle_request():
-                lyrics = self.scrape_lyrics(self.fetch_text(url_link))
+                lyrics = self.scrape(self.fetch_text(result.url))
                 if not lyrics:
                     continue
 
                 if self.is_lyrics(lyrics, artist):
-                    self.debug("Got lyrics from {}", item["displayLink"])
+                    self.debug(
+                        "Got lyrics from {}", urlparse(result.url).netloc
+                    )
                     return lyrics
 
         return None
 
+    @classmethod
+    def scrape(cls, html: str) -> str | None:
+        # Get the longest text element (if any).
+        if strings := sorted(get_soup(html).stripped_strings, key=len):
+            return strings[-1]
+
+        return None
+
 
 class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
     SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 091776108..8afd80585 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -191,9 +191,9 @@ class TestSearchBackend:
         ],
     )
     def test_check_match(self, backend, target_artist, artist, should_match):
-        assert (
-            backend.check_match(target_artist, "", artist, "") == should_match
-        )
+        result = lyrics.SearchResult(artist, "", "")
+
+        assert backend.check_match(target_artist, "", result) == should_match
 
 
 @pytest.fixture(scope="module")
@@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest):
 
     def test_mocked_source_ok(self, backend, lyrics_html):
         """Test that lyrics of the mocked page are correctly scraped"""
-        result = backend.scrape_lyrics(lyrics_html).lower()
+        result = backend.scrape(lyrics_html).lower()
 
         assert result
         assert backend.is_lyrics(result)
         assert PHRASE_BY_TITLE[self.TITLE] in result
 
     @pytest.mark.parametrize(
-        "url_title, artist, should_be_candidate",
+        "url_title, artist, expected_title",
         [
-            ("John Doe - beets song Lyrics", "John Doe", True),
-            ("example.com | Beats song by John doe", "John Doe", True),
-            ("example.com | seets bong lyrics by John doe", "John Doe", False),
-            ("foo", "Sun O)))", False),
+            ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
+            ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+            (
+                "example.com | seets bong lyrics by John doe",
+                "John Doe",
+                "seets-bong",
+            ),
+            ("foo", "Sun O)))", "foo"),
         ],
     )
-    def test_is_page_candidate(
-        self, backend, lyrics_html, url_title, artist, should_be_candidate
+    def test_make_search_result(
+        self, backend, url_title, artist, expected_title
     ):
-        result = backend.is_page_candidate(
-            artist,
-            self.TITLE,
-            "http://www.example.com/lyrics/beetssong",
-            url_title,
+        result = backend.make_search_result(
+            artist, "https://example.com", url_title
         )
-        assert bool(result) == should_be_candidate
+        assert result.title == expected_title
 
     @pytest.mark.parametrize(
         "lyrics",
@@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest):
         ],
     )  # fmt: skip
     def test_scrape(self, backend, lyrics_html, expected_line_count):
-        result = backend.scrape_lyrics(lyrics_html) or ""
+        result = backend.scrape(lyrics_html) or ""
 
         assert len(result.splitlines()) == expected_line_count
 
@@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
         ],
     )
     def test_scrape(self, backend, lyrics_html, expecting_lyrics):
-        assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
+        assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
 
 
 LYRICS_DURATION = 950