Genius: refactor and simplify

This commit is contained in:
Šarūnas Nejus 2024-10-09 12:12:09 +01:00
parent 54fc67b30a
commit 745c5eb9f0
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
4 changed files with 126 additions and 92 deletions

86
beetsplug/_typing.py Normal file
View file

@ -0,0 +1,86 @@
from __future__ import annotations
from typing import Any
from typing_extensions import TypedDict
# Alias for a JSON-like mapping: string keys with values of any type.
JSONDict = dict[str, Any]
class LRCLibAPI:
    """LRCLib API data types.

    This class is only a namespace: it groups the typed dictionaries that
    describe LRCLib data and is not meant to be instantiated.
    """

    class Item(TypedDict):
        """Lyrics data item returned by the LRCLib API."""

        # NOTE(review): keys are camelCase, presumably mirroring the JSON keys
        # of the API response one-to-one -- confirm against the LRCLib docs.
        id: int
        name: str
        trackName: str
        artistName: str
        albumName: str
        # Annotated as optional, so consumers must handle ``None`` here.
        duration: float | None
        instrumental: bool
        plainLyrics: str
        # Annotated as optional; presumably ``None`` when no time-synced
        # lyrics are available -- TODO confirm.
        syncedLyrics: str | None
class GeniusAPI:
    """Genius API data types.

    This documents *only* the fields that are used in the plugin.
    :attr:`SearchResult` is an exception, since I thought some of the other
    fields might be useful in the future.

    The nested types reference each other through ``GeniusAPI.<Name>``
    annotations; these stay unevaluated thanks to
    ``from __future__ import annotations``.
    """

    class DateComponents(TypedDict):
        """Year, month and day of a date, see
        :attr:`SearchResult.release_date_components`.
        """

        year: int
        month: int
        day: int

    class Artist(TypedDict):
        """Artist data embedded in a :class:`SearchResult`."""

        api_path: str
        header_image_url: str
        id: int
        image_url: str
        is_meme_verified: bool
        is_verified: bool
        name: str
        url: str

    class Stats(TypedDict):
        """Statistics embedded in a :class:`SearchResult`."""

        unreviewed_annotations: int
        hot: bool

    class SearchResult(TypedDict):
        """Song data found under the ``result`` key of a :class:`SearchHit`."""

        annotation_count: int
        api_path: str
        artist_names: str
        full_title: str
        header_image_thumbnail_url: str
        header_image_url: str
        id: int
        lyrics_owner_id: int
        lyrics_state: str
        path: str
        primary_artist_names: str
        # Annotated as optional, so consumers must handle ``None`` here.
        pyongs_count: int | None
        relationships_index_url: str
        release_date_components: GeniusAPI.DateComponents
        release_date_for_display: str
        release_date_with_abbreviated_month_for_display: str
        song_art_image_thumbnail_url: str
        song_art_image_url: str
        stats: GeniusAPI.Stats
        title: str
        title_with_featured: str
        # NOTE(review): presumably the URL of the song page that holds the
        # lyrics -- confirm against the Genius API docs.
        url: str
        featured_artists: list[GeniusAPI.Artist]
        primary_artist: GeniusAPI.Artist
        primary_artists: list[GeniusAPI.Artist]

    class SearchHit(TypedDict):
        """A single entry of :attr:`SearchResponse.hits`."""

        result: GeniusAPI.SearchResult

    class SearchResponse(TypedDict):
        """Data found under the ``response`` key of :class:`Search`."""

        hits: list[GeniusAPI.SearchHit]

    class Search(TypedDict):
        """Top-level search data, navigated as ``data["response"]["hits"]``."""

        response: GeniusAPI.SearchResponse

View file

@ -31,7 +31,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@ -42,6 +41,8 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
from ._typing import GeniusAPI, LRCLibAPI
try:
from bs4 import BeautifulSoup
@ -266,20 +267,6 @@ class Backend(RequestHandler):
raise NotImplementedError
class LRCLibItem(TypedDict):
"""Lyrics data item returned by the LRCLib API."""
id: int
name: str
trackName: str
artistName: str
albumName: str
duration: float | None
instrumental: bool
plainLyrics: str
syncedLyrics: str | None
@dataclass
@total_ordering
class LRCLyrics:
@ -297,7 +284,9 @@ class LRCLyrics:
return self.dist < other.dist
@classmethod
def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
def make(
cls, candidate: LRCLibAPI.Item, target_duration: float
) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
@ -354,7 +343,7 @@ class LRCLib(Backend):
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
I found that the ``/get`` endpoint sometimes returns inaccurate or
@ -494,13 +483,15 @@ class Genius(SearchBackend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
remove_backslash = partial(re.compile(r"\\(?=[^\\])").sub, "")
base_url = "https://api.genius.com"
search_url = f"{base_url}/search"
def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["genius_api_key"].as_str()
self.headers = {"Authorization": f"Bearer {self.api_key}"}
@cached_property
def headers(self) -> dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@ -509,85 +500,39 @@ class Genius(SearchBackend):
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
json = self._search(artist, title)
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
result = hit["result"]
url = hit["result"]["url"]
if check(result["primary_artist"]["name"], result["title"]) and (
lyrics := self.scrape_lyrics(self.fetch_text(url))
):
return collapse_newlines(lyrics)
data = self.fetch_json(
self.search_url,
params={"q": f"{artist} {title}".lower()},
headers=self.headers,
)
if (url := self.find_lyrics_url(data, artist, title)) and (
lyrics := self.scrape_lyrics(self.fetch_text(url))
):
return collapse_newlines(lyrics)
return None
def _search(self, artist, title):
"""Searches the genius api for a given artist and title
def find_lyrics_url(
self, data: GeniusAPI.Search, artist: str, title: str
) -> str | None:
"""Find URL to the lyrics of the given artist and title.
https://docs.genius.com/#search-h2
:returns: json response
https://docs.genius.com/#search-h2.
"""
return self.fetch_json(
self.search_url,
params={"q": f"{title} {artist.lower()}"},
headers=self.headers,
)
check = partial(self.check_match, artist, title)
for result in (hit["result"] for hit in data["response"]["hits"]):
if check(result["artist_names"], result["title"]):
return result["url"]
return None
def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = get_soup(html)
if m := self.LYRICS_IN_JSON_RE.search(html):
html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
# Sometimes, though, it packages the lyrics into separate divs, most
# likely for easier ad placement
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
if not lyrics_divs:
self.debug("Received unusual song page html")
return self._try_extracting_lyrics_from_non_data_lyrics_container(
soup
)
lyrics = ""
for lyrics_div in lyrics_divs:
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
return lyrics
def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
"""Extract lyrics from a div without attribute data-lyrics-container
This is the second most common layout on genius.com
"""
verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
if not verse_div:
if soup.find(
"div",
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental",
):
self.debug("Detected instrumental")
return INSTRUMENTAL_LYRICS
else:
self.debug("Couldn't scrape page using known layouts")
return None
lyrics_div = verse_div.parent
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
)
for ad in ads:
ad.replace_with("\n")
footers = lyrics_div.find_all(
"div", class_=re.compile("Lyrics__Footer")
)
for footer in footers:
footer.replace_with("")
return lyrics_div.get_text()
return None
class Tekstowo(DirectBackend):

View file

@ -67,6 +67,9 @@ Bug fixes:
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
to match lyrics when there is a slight variation in the artist name.
:bug:`4791`
* :doc:`plugins/lyrics`: Fix plugin crash when ``genius`` backend returns empty
lyrics.
:bug:`5583`
For packagers:

View file

@ -379,7 +379,7 @@ class TestGeniusLyrics(LyricsBackendTest):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
("geniuscom/2pacalleyezonmelyrics", 134),
("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],