Genius: refactor and simplify

2026-02-16 12:24:53 +01:00 · 2024-10-09 12:12:09 +01:00 · 2024-10-09 12:12:09 +01:00 · 745c5eb9f0
commit 745c5eb9f0
parent 54fc67b30a
4 changed files with 126 additions and 92 deletions
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import TypedDict
+
+JSONDict = dict[str, Any]
+
+
+class LRCLibAPI:
+    class Item(TypedDict):
+        """Lyrics data item returned by the LRCLib API."""
+
+        id: int
+        name: str
+        trackName: str
+        artistName: str
+        albumName: str
+        duration: float | None
+        instrumental: bool
+        plainLyrics: str
+        syncedLyrics: str | None
+
+
+class GeniusAPI:
+    """Genius API data types.
+
+    This documents *only* the fields that are used in the plugin.
+    :attr:`SearchResult` is an exception, since I thought some of the other
+    fields might be useful in the future.
+    """
+
+    class DateComponents(TypedDict):
+        year: int
+        month: int
+        day: int
+
+    class Artist(TypedDict):
+        api_path: str
+        header_image_url: str
+        id: int
+        image_url: str
+        is_meme_verified: bool
+        is_verified: bool
+        name: str
+        url: str
+
+    class Stats(TypedDict):
+        unreviewed_annotations: int
+        hot: bool
+
+    class SearchResult(TypedDict):
+        annotation_count: int
+        api_path: str
+        artist_names: str
+        full_title: str
+        header_image_thumbnail_url: str
+        header_image_url: str
+        id: int
+        lyrics_owner_id: int
+        lyrics_state: str
+        path: str
+        primary_artist_names: str
+        pyongs_count: int | None
+        relationships_index_url: str
+        release_date_components: GeniusAPI.DateComponents
+        release_date_for_display: str
+        release_date_with_abbreviated_month_for_display: str
+        song_art_image_thumbnail_url: str
+        song_art_image_url: str
+        stats: GeniusAPI.Stats
+        title: str
+        title_with_featured: str
+        url: str
+        featured_artists: list[GeniusAPI.Artist]
+        primary_artist: GeniusAPI.Artist
+        primary_artists: list[GeniusAPI.Artist]
+
+    class SearchHit(TypedDict):
+        result: GeniusAPI.SearchResult
+
+    class SearchResponse(TypedDict):
+        hits: list[GeniusAPI.SearchHit]
+
+    class Search(TypedDict):
+        response: GeniusAPI.SearchResponse
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -31,7 +31,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
 from urllib.parse import quote, urlencode, urlparse

 import requests
-from typing_extensions import TypedDict
 from unidecode import unidecode

 import beets
@ -42,6 +41,8 @@ if TYPE_CHECKING:
    from beets.importer import ImportTask
    from beets.library import Item

+    from ._typing import GeniusAPI, LRCLibAPI
+
 try:
    from bs4 import BeautifulSoup

@ -266,20 +267,6 @@ class Backend(RequestHandler):
        raise NotImplementedError


-class LRCLibItem(TypedDict):
-    """Lyrics data item returned by the LRCLib API."""
-
-    id: int
-    name: str
-    trackName: str
-    artistName: str
-    albumName: str
-    duration: float | None
-    instrumental: bool
-    plainLyrics: str
-    syncedLyrics: str | None
-
-
@dataclass
@total_ordering
 class LRCLyrics:
@ -297,7 +284,9 @@ class LRCLyrics:
        return self.dist < other.dist

    @classmethod
-    def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+    def make(
+        cls, candidate: LRCLibAPI.Item, target_duration: float
+    ) -> LRCLyrics:
        return cls(
            target_duration,
            candidate["duration"] or 0.0,
@ -354,7 +343,7 @@ class LRCLib(Backend):

    def fetch_candidates(
        self, artist: str, title: str, album: str, length: int
-    ) -> Iterator[list[LRCLibItem]]:
+    ) -> Iterator[list[LRCLibAPI.Item]]:
        """Yield lyrics candidates for the given song data.

        I found that the ``/get`` endpoint sometimes returns inaccurate or
@ -494,13 +483,15 @@ class Genius(SearchBackend):
    bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
    """

+    LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
+    remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
+
    base_url = "https://api.genius.com"
    search_url = f"{base_url}/search"

-    def __init__(self, config, log):
-        super().__init__(config, log)
-        self.api_key = config["genius_api_key"].as_str()
-        self.headers = {"Authorization": f"Bearer {self.api_key}"}
+    @cached_property
+    def headers(self) -> dict[str, str]:
+        return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}

    def fetch(self, artist: str, title: str, *_) -> str | None:
        """Fetch lyrics from genius.com
@ -509,85 +500,39 @@ class Genius(SearchBackend):
        we first query the api for a url matching our artist & title,
        then attempt to scrape that url for the lyrics.
        """
-        json = self._search(artist, title)

-        check = partial(self.check_match, artist, title)
-        for hit in json["response"]["hits"]:
-            result = hit["result"]
-            url = hit["result"]["url"]
-            if check(result["primary_artist"]["name"], result["title"]) and (
-                lyrics := self.scrape_lyrics(self.fetch_text(url))
-            ):
-                return collapse_newlines(lyrics)
+        data = self.fetch_json(
+            self.search_url,
+            params={"q": f"{artist} {title}".lower()},
+            headers=self.headers,
+        )
+        if (url := self.find_lyrics_url(data, artist, title)) and (
+            lyrics := self.scrape_lyrics(self.fetch_text(url))
+        ):
+            return collapse_newlines(lyrics)

        return None

-    def _search(self, artist, title):
-        """Searches the genius api for a given artist and title
+    def find_lyrics_url(
+        self, data: GeniusAPI.Search, artist: str, title: str
+    ) -> str | None:
+        """Find URL to the lyrics of the given artist and title.

-        https://docs.genius.com/#search-h2
-
-        :returns: json response
+        https://docs.genius.com/#search-h2.
        """
-        return self.fetch_json(
-            self.search_url,
-            params={"q": f"{title} {artist.lower()}"},
-            headers=self.headers,
-        )
+        check = partial(self.check_match, artist, title)
+        for result in (hit["result"] for hit in data["response"]["hits"]):
+            if check(result["artist_names"], result["title"]):
+                return result["url"]
+
+        return None

    def scrape_lyrics(self, html: str) -> str | None:
-        """Scrape lyrics from a given genius.com html"""
-        soup = get_soup(html)
+        if m := self.LYRICS_IN_JSON_RE.search(html):
+            html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+            return get_soup(html_text).get_text().strip()

-        # Most of the time, the page contains a div with class="lyrics" where
-        # all of the lyrics can be found already correctly formatted
-        # Sometimes, though, it packages the lyrics into separate divs, most
-        # likely for easier ad placement
-
-        lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
-        if not lyrics_divs:
-            self.debug("Received unusual song page html")
-            return self._try_extracting_lyrics_from_non_data_lyrics_container(
-                soup
-            )
-        lyrics = ""
-        for lyrics_div in lyrics_divs:
-            lyrics += lyrics_div.get_text() + "\n\n"
-        while lyrics[-1] == "\n":
-            lyrics = lyrics[:-1]
-        return lyrics
-
-    def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
-        """Extract lyrics from a div without attribute data-lyrics-container
-        This is the second most common layout on genius.com
-        """
-        verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
-        if not verse_div:
-            if soup.find(
-                "div",
-                class_=re.compile("LyricsPlaceholder__Message"),
-                string="This song is an instrumental",
-            ):
-                self.debug("Detected instrumental")
-                return INSTRUMENTAL_LYRICS
-            else:
-                self.debug("Couldn't scrape page using known layouts")
-                return None
-
-        lyrics_div = verse_div.parent
-
-        ads = lyrics_div.find_all(
-            "div", class_=re.compile("InreadAd__Container")
-        )
-        for ad in ads:
-            ad.replace_with("\n")
-
-        footers = lyrics_div.find_all(
-            "div", class_=re.compile("Lyrics__Footer")
-        )
-        for footer in footers:
-            footer.replace_with("")
-        return lyrics_div.get_text()
+        return None


 class Tekstowo(DirectBackend):
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -67,6 +67,9 @@ Bug fixes:
 * :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
  to match lyrics when there is a slight variation in the artist name.
  :bug:`4791`
+* :doc:`plugins/lyrics`: Fix plugin crash when ``genius`` backend returns empty
+  lyrics.
+  :bug:`5583`

 For packagers:

--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -379,7 +379,7 @@ class TestGeniusLyrics(LyricsBackendTest):
    @pytest.mark.parametrize(
        "file_name, expected_line_count",
        [
-            ("geniuscom/2pacalleyezonmelyrics", 134),
+            ("geniuscom/2pacalleyezonmelyrics", 131),
            ("geniuscom/Ttngchinchillalyrics", 29),
            ("geniuscom/sample", 0),  # see https://github.com/beetbox/beets/issues/3535
        ],