From 745c5eb9f058603c0817003d57adc4c80dc7aecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Wed, 9 Oct 2024 12:12:09 +0100 Subject: [PATCH] Genius: refactor and simplify --- beetsplug/_typing.py | 86 ++++++++++++++++++++++++ beetsplug/lyrics.py | 127 ++++++++++-------------------------- docs/changelog.rst | 3 + test/plugins/test_lyrics.py | 2 +- 4 files changed, 126 insertions(+), 92 deletions(-) create mode 100644 beetsplug/_typing.py diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py new file mode 100644 index 000000000..93f4a2a58 --- /dev/null +++ b/beetsplug/_typing.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from typing import Any + +from typing_extensions import TypedDict + +JSONDict = dict[str, Any] + + +class LRCLibAPI: + class Item(TypedDict): + """Lyrics data item returned by the LRCLib API.""" + + id: int + name: str + trackName: str + artistName: str + albumName: str + duration: float | None + instrumental: bool + plainLyrics: str + syncedLyrics: str | None + + +class GeniusAPI: + """Genius API data types. + + This documents *only* the fields that are used in the plugin. + :attr:`SearchResult` is an exception, since I thought some of the other + fields might be useful in the future. + """ + + class DateComponents(TypedDict): + year: int + month: int + day: int + + class Artist(TypedDict): + api_path: str + header_image_url: str + id: int + image_url: str + is_meme_verified: bool + is_verified: bool + name: str + url: str + + class Stats(TypedDict): + unreviewed_annotations: int + hot: bool + + class SearchResult(TypedDict): + annotation_count: int + api_path: str + artist_names: str + full_title: str + header_image_thumbnail_url: str + header_image_url: str + id: int + lyrics_owner_id: int + lyrics_state: str + path: str + primary_artist_names: str + pyongs_count: int | None + relationships_index_url: str + release_date_components: GeniusAPI.DateComponents + release_date_for_display: str + release_date_with_abbreviated_month_for_display: str + song_art_image_thumbnail_url: str + song_art_image_url: str + stats: GeniusAPI.Stats + title: str + title_with_featured: str + url: str + featured_artists: list[GeniusAPI.Artist] + primary_artist: GeniusAPI.Artist + primary_artists: list[GeniusAPI.Artist] + + class SearchHit(TypedDict): + result: GeniusAPI.SearchResult + + class SearchResponse(TypedDict): + hits: list[GeniusAPI.SearchHit] + + class Search(TypedDict): + response: GeniusAPI.SearchResponse diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 7adb1bc7e..12b2d1f37 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -31,7 +31,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator from urllib.parse import quote, urlencode, urlparse import requests -from typing_extensions import TypedDict from unidecode import unidecode import beets @@ -42,6 +41,8 @@ if TYPE_CHECKING: from beets.importer import ImportTask from beets.library import Item + from ._typing import GeniusAPI, LRCLibAPI + try: from bs4 import BeautifulSoup @@ -266,20 +267,6 @@ class Backend(RequestHandler): raise NotImplementedError -class LRCLibItem(TypedDict): - """Lyrics data item returned by the LRCLib API.""" - - id: int - name: str - trackName: str - artistName: str - albumName: str - duration: float | None - instrumental: bool - plainLyrics: str - syncedLyrics: str | None - - @dataclass @total_ordering class LRCLyrics: @@ -297,7 +284,9 @@ class LRCLyrics: return self.dist < other.dist @classmethod - def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics: + def make( + cls, candidate: LRCLibAPI.Item, target_duration: float + ) -> LRCLyrics: return cls( target_duration, candidate["duration"] or 0.0, @@ -354,7 +343,7 @@ class LRCLib(Backend): def fetch_candidates( self, artist: str, title: str, album: str, length: int - ) -> Iterator[list[LRCLibItem]]: + ) -> Iterator[list[LRCLibAPI.Item]]: """Yield lyrics candidates for the given song data. I found that the ``/get`` endpoint sometimes returns inaccurate or @@ -494,13 +483,15 @@ class Genius(SearchBackend): bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ """ + LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]: + return {"Authorization": f'Bearer {self.config["genius_api_key"]}'} def fetch(self, artist: str, title: str, *_) -> str | None: """Fetch lyrics from genius.com @@ -509,85 +500,39 @@ class Genius(SearchBackend): we first query the api for a url matching our artist & title, then attempt to scrape that url for the lyrics. """ - json = self._search(artist, title) - check = partial(self.check_match, artist, title) - for hit in json["response"]["hits"]: - result = hit["result"] - url = hit["result"]["url"] - if check(result["primary_artist"]["name"], result["title"]) and ( - lyrics := self.scrape_lyrics(self.fetch_text(url)) - ): - return collapse_newlines(lyrics) + data = self.fetch_json( + self.search_url, + params={"q": f"{artist} {title}".lower()}, + headers=self.headers, + ) + if (url := self.find_lyrics_url(data, artist, title)) and ( + lyrics := self.scrape_lyrics(self.fetch_text(url)) + ): + return collapse_newlines(lyrics) return None - def _search(self, artist, title): - """Searches the genius api for a given artist and title + def find_lyrics_url( + self, data: GeniusAPI.Search, artist: str, title: str + ) -> str | None: + """Find URL to the lyrics of the given artist and title. - https://docs.genius.com/#search-h2 - - :returns: json response + https://docs.genius.com/#search-h2. """ - return self.fetch_json( - self.search_url, - params={"q": f"{title} {artist.lower()}"}, - headers=self.headers, - ) + check = partial(self.check_match, artist, title) + for result in (hit["result"] for hit in data["response"]["hits"]): + if check(result["artist_names"], result["title"]): + return result["url"] + + return None def scrape_lyrics(self, html: str) -> str | None: - """Scrape lyrics from a given genius.com html""" - soup = get_soup(html) + if m := self.LYRICS_IN_JSON_RE.search(html): + html_text = self.remove_backslash(m[0]).replace(r"\n", "\n") + return get_soup(html_text).get_text().strip() - # Most of the time, the page contains a div with class="lyrics" where - # all of the lyrics can be found already correctly formatted - # Sometimes, though, it packages the lyrics into separate divs, most - # likely for easier ad placement - - lyrics_divs = soup.find_all("div", {"data-lyrics-container": True}) - if not lyrics_divs: - self.debug("Received unusual song page html") - return self._try_extracting_lyrics_from_non_data_lyrics_container( - soup - ) - lyrics = "" - for lyrics_div in lyrics_divs: - lyrics += lyrics_div.get_text() + "\n\n" - while lyrics[-1] == "\n": - lyrics = lyrics[:-1] - return lyrics - - def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): - """Extract lyrics from a div without attribute data-lyrics-container - This is the second most common layout on genius.com - """ - verse_div = soup.find("div", class_=re.compile("Lyrics__Container")) - if not verse_div: - if soup.find( - "div", - class_=re.compile("LyricsPlaceholder__Message"), - string="This song is an instrumental", - ): - self.debug("Detected instrumental") - return INSTRUMENTAL_LYRICS - else: - self.debug("Couldn't scrape page using known layouts") - return None - - lyrics_div = verse_div.parent - - ads = lyrics_div.find_all( - "div", class_=re.compile("InreadAd__Container") - ) - for ad in ads: - ad.replace_with("\n") - - footers = lyrics_div.find_all( - "div", class_=re.compile("Lyrics__Footer") - ) - for footer in footers: - footer.replace_with("") - return lyrics_div.get_text() + return None class Tekstowo(DirectBackend): diff --git a/docs/changelog.rst b/docs/changelog.rst index 25ae5bc10..54d085599 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -67,6 +67,9 @@ Bug fixes: * :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able to match lyrics when there is a slight variation in the artist name. :bug:`4791` +* :doc:`plugins/lyrics`: Fix plugin crash when ``genius`` backend returns empty + lyrics. + :bug:`5583` For packagers: diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 1d264338a..091776108 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -379,7 +379,7 @@ class TestGeniusLyrics(LyricsBackendTest): @pytest.mark.parametrize( "file_name, expected_line_count", [ - ("geniuscom/2pacalleyezonmelyrics", 134), + ("geniuscom/2pacalleyezonmelyrics", 131), ("geniuscom/Ttngchinchillalyrics", 29), ("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535 ],