mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Google: Refactor and improve
* Type the response data that the Google Custom Search API returns.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
- Handle long URL titles that get truncated (end with ellipsis) for
long searches
- Remove domains starting with 'www'
- Parse the title AND the artist. Previously this would only parse the
title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
This commit is contained in:
parent
12c5eaae5e
commit
c5c4138d66
3 changed files with 141 additions and 130 deletions
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
|
||||
from typing import Any
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
from typing_extensions import NotRequired, TypedDict
|
||||
|
||||
JSONDict = dict[str, Any]
|
||||
|
||||
|
|
@ -84,3 +84,32 @@ class GeniusAPI:
|
|||
|
||||
class Search(TypedDict):
|
||||
response: GeniusAPI.SearchResponse
|
||||
|
||||
|
||||
class GoogleCustomSearchAPI:
|
||||
class Response(TypedDict):
|
||||
"""Search response from the Google Custom Search API.
|
||||
|
||||
If the search returns no results, the :attr:`items` field is not found.
|
||||
"""
|
||||
|
||||
items: NotRequired[list[GoogleCustomSearchAPI.Item]]
|
||||
|
||||
class Item(TypedDict):
|
||||
"""A Google Custom Search API result item.
|
||||
|
||||
:attr:`title` field is shown to the user in the search interface, thus
|
||||
it gets truncated with an ellipsis for longer queries. For most
|
||||
results, the full title is available as ``og:title`` metatag found
|
||||
under the :attr:`pagemap` field. Note neither this metatag nor the
|
||||
``pagemap`` field is guaranteed to be present in the data.
|
||||
"""
|
||||
|
||||
title: str
|
||||
link: str
|
||||
pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
|
||||
|
||||
class Pagemap(TypedDict):
|
||||
"""Pagemap data with a single meta tags dict in a list."""
|
||||
|
||||
metatags: list[JSONDict]
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
|
|||
from html import unescape
|
||||
from http import HTTPStatus
|
||||
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
|
||||
from urllib.parse import quote, urlencode, urlparse
|
||||
from urllib.parse import quote, urlencode
|
||||
|
||||
import requests
|
||||
from unidecode import unidecode
|
||||
|
|
@ -41,7 +41,7 @@ if TYPE_CHECKING:
|
|||
from beets.importer import ImportTask
|
||||
from beets.library import Item
|
||||
|
||||
from ._typing import GeniusAPI, JSONDict, LRCLibAPI
|
||||
from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
|
|
@ -492,7 +492,9 @@ class SearchBackend(Backend):
|
|||
def fetch(self, artist: str, title: str, *_) -> str | None:
|
||||
"""Fetch lyrics for the given artist and title."""
|
||||
for result in self.get_results(artist, title):
|
||||
if lyrics := self.scrape(self.fetch_text(result.url)):
|
||||
if (html := self.fetch_text(result.url)) and (
|
||||
lyrics := self.scrape(html)
|
||||
):
|
||||
return lyrics
|
||||
|
||||
return None
|
||||
|
|
@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
|
|||
return None
|
||||
|
||||
|
||||
def remove_credits(text):
|
||||
"""Remove first/last line of text if it contains the word 'lyrics'
|
||||
eg 'Lyrics by songsdatabase.com'
|
||||
"""
|
||||
textlines = text.split("\n")
|
||||
credits = None
|
||||
for i in (0, -1):
|
||||
if textlines and "lyrics" in textlines[i].lower():
|
||||
credits = textlines.pop(i)
|
||||
if credits:
|
||||
text = "\n".join(textlines)
|
||||
return text
|
||||
|
||||
|
||||
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
|
||||
|
||||
|
||||
|
|
@ -617,87 +605,97 @@ class Google(SearchBackend):
|
|||
|
||||
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
|
||||
|
||||
def is_lyrics(self, text, artist=None):
|
||||
"""Determine whether the text seems to be valid lyrics."""
|
||||
if not text:
|
||||
return False
|
||||
bad_triggers_occ = []
|
||||
nb_lines = text.count("\n")
|
||||
if nb_lines <= 1:
|
||||
self.debug("Ignoring too short lyrics '{}'", text)
|
||||
return False
|
||||
elif nb_lines < 5:
|
||||
bad_triggers_occ.append("too_short")
|
||||
else:
|
||||
# Lyrics look legit, remove credits to avoid being penalized
|
||||
# further down
|
||||
text = remove_credits(text)
|
||||
#: Exclude some letras.mus.br pages which do not contain lyrics.
|
||||
EXCLUDE_PAGES = [
|
||||
"significado.html",
|
||||
"traduccion.html",
|
||||
"traducao.html",
|
||||
"significados.html",
|
||||
]
|
||||
|
||||
bad_triggers = ["lyrics", "copyright", "property", "links"]
|
||||
if artist:
|
||||
bad_triggers += [artist]
|
||||
|
||||
for item in bad_triggers:
|
||||
bad_triggers_occ += [item] * len(
|
||||
re.findall(r"\W%s\W" % item, text, re.I)
|
||||
#: Regular expression to match noise in the URL title.
|
||||
URL_TITLE_NOISE_RE = re.compile(
|
||||
r"""
|
||||
\b
|
||||
(
|
||||
paroles(\ et\ traduction|\ de\ chanson)?
|
||||
| letras?(\ de)?
|
||||
| liedtexte
|
||||
| original\ song\ full\ text\.
|
||||
| official
|
||||
| 20[12]\d\ version
|
||||
| (absolute\ |az)?lyrics(\ complete)?
|
||||
| www\S+
|
||||
| \S+\.(com|net|mus\.br)
|
||||
)
|
||||
([^\w.]|$)
|
||||
""",
|
||||
re.IGNORECASE | re.VERBOSE,
|
||||
)
|
||||
#: Split cleaned up URL title into artist and title parts.
|
||||
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
|
||||
|
||||
if bad_triggers_occ:
|
||||
self.debug("Bad triggers detected: {}", bad_triggers_occ)
|
||||
return len(bad_triggers_occ) < 2
|
||||
def fetch_text(self, *args, **kwargs) -> str:
|
||||
"""Handle an error so that we can continue with the next URL."""
|
||||
with self.handle_request():
|
||||
return super().fetch_text(*args, **kwargs)
|
||||
|
||||
BY_TRANS = ["by", "par", "de", "von"]
|
||||
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
|
||||
@staticmethod
|
||||
def get_part_dist(artist: str, title: str, part: str) -> float:
|
||||
"""Return the distance between the given part and the artist and title.
|
||||
|
||||
A number between -1 and 1 is returned, where -1 means the part is
|
||||
closer to the artist and 1 means it is closer to the title.
|
||||
"""
|
||||
return string_dist(artist, part) - string_dist(title, part)
|
||||
|
||||
@classmethod
|
||||
def make_search_result(
|
||||
self, artist: str, url_link: str, url_title: str
|
||||
cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
|
||||
) -> SearchResult:
|
||||
"""Parse artist and title from the URL title and return a search result."""
|
||||
url_title_slug = slug(url_title)
|
||||
artist = slug(artist)
|
||||
sitename = urlparse(url_link).netloc
|
||||
|
||||
# or try extracting song title from URL title and check if
|
||||
# they are close enough
|
||||
tokens = (
|
||||
[by + "-" + artist for by in self.BY_TRANS]
|
||||
+ [artist, sitename, sitename.replace("www.", "")]
|
||||
+ self.LYRICS_TRANS
|
||||
url_title = (
|
||||
# get full title from metatags if available
|
||||
item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
|
||||
# default to the display title
|
||||
or item["title"]
|
||||
)
|
||||
song_title = re.sub(
|
||||
"(%s)" % "|".join(tokens), "", url_title_slug
|
||||
).strip("-")
|
||||
clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
|
||||
# split it into parts which may be part of the artist or the title
|
||||
# `dict.fromkeys` removes duplicates keeping the order
|
||||
parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
|
||||
|
||||
return SearchResult(artist, song_title, url_link)
|
||||
if len(parts) == 1:
|
||||
part = parts[0]
|
||||
if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
|
||||
# artist and title may not have a separator
|
||||
result_title = m[1]
|
||||
result_artist = part.replace(m[0], "")
|
||||
else:
|
||||
# assume that this is the title
|
||||
result_artist, result_title = "", parts[0]
|
||||
else:
|
||||
# sort parts by their similarity to the artist
|
||||
parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
|
||||
result_artist, result_title = parts[0], " ".join(parts[1:])
|
||||
|
||||
return SearchResult(result_artist, result_title, item["link"])
|
||||
|
||||
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||
params = {
|
||||
"key": self.config["google_API_key"].as_str(),
|
||||
"cx": self.config["google_engine_ID"].as_str(),
|
||||
"q": f"{artist} {title}",
|
||||
"siteSearch": "www.musixmatch.com",
|
||||
"siteSearchFilter": "e",
|
||||
"excludeTerms": ", ".join(self.EXCLUDE_PAGES),
|
||||
}
|
||||
|
||||
data = self.fetch_json(self.SEARCH_URL, params=params)
|
||||
for item in data.get("items", []):
|
||||
yield self.make_search_result(artist, item["link"], item["title"])
|
||||
|
||||
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
|
||||
return super().get_results(artist, slug(title))
|
||||
|
||||
def fetch(self, artist: str, title: str, *_) -> str | None:
|
||||
for result in self.get_results(artist, title):
|
||||
with self.handle_request():
|
||||
lyrics = self.scrape(self.fetch_text(result.url))
|
||||
if not lyrics:
|
||||
continue
|
||||
|
||||
if self.is_lyrics(lyrics, artist):
|
||||
self.debug(
|
||||
"Got lyrics from {}", urlparse(result.url).netloc
|
||||
data: GoogleCustomSearchAPI.Response = self.fetch_json(
|
||||
self.SEARCH_URL, params=params
|
||||
)
|
||||
return lyrics
|
||||
|
||||
return None
|
||||
for item in data.get("items", []):
|
||||
yield self.make_search_result(artist, title, item)
|
||||
|
||||
@classmethod
|
||||
def scrape(cls, html: str) -> str | None:
|
||||
|
|
|
|||
|
|
@ -101,24 +101,6 @@ class TestLyricsUtils:
|
|||
|
||||
assert list(actual_titles) == [title, *expected_extra_titles]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"initial_lyrics, expected",
|
||||
[
|
||||
("Verse\nLyrics credit in the last line", "Verse"),
|
||||
("Lyrics credit in the first line\nVerse", "Verse"),
|
||||
(
|
||||
"""Verse
|
||||
Lyrics mentioned somewhere in the middle
|
||||
Verse""",
|
||||
"""Verse
|
||||
Lyrics mentioned somewhere in the middle
|
||||
Verse""",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_remove_credits(self, initial_lyrics, expected):
|
||||
assert lyrics.remove_credits(initial_lyrics) == expected
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"initial_text, expected",
|
||||
[
|
||||
|
|
@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
|
|||
class TestGoogleLyrics(LyricsBackendTest):
|
||||
"""Test scraping heuristics on a fake html page."""
|
||||
|
||||
TITLE = "Beets song"
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
def backend_name(self):
|
||||
return "google"
|
||||
|
|
@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
|
|||
def file_name(self):
|
||||
return "examplecom/beetssong"
|
||||
|
||||
@pytest.fixture
|
||||
def search_item(self, url_title, url):
|
||||
return {"title": url_title, "link": url}
|
||||
|
||||
def test_mocked_source_ok(self, backend, lyrics_html):
|
||||
"""Test that lyrics of the mocked page are correctly scraped"""
|
||||
result = backend.scrape(lyrics_html).lower()
|
||||
|
||||
assert result
|
||||
assert backend.is_lyrics(result)
|
||||
assert PHRASE_BY_TITLE[self.TITLE] in result
|
||||
assert PHRASE_BY_TITLE["Beets song"] in result
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"url_title, artist, expected_title",
|
||||
"url_title, expected_artist, expected_title",
|
||||
[
|
||||
("John Doe - beets song Lyrics", "John Doe", "beets-song"),
|
||||
("example.com | Beats song by John doe", "John Doe", "beats-song"),
|
||||
("Artist - beets song Lyrics", "Artist", "beets song"),
|
||||
("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
|
||||
("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
|
||||
("foo", "", "foo"),
|
||||
("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
|
||||
("Letra de Artist - Beets Song", "Artist", "Beets Song"),
|
||||
("Letra de Artist - Beets ...", "Artist", "Beets"),
|
||||
("Artist Beets Song", "Artist", "Beets Song"),
|
||||
("BeetsSong - Artist", "Artist", "BeetsSong"),
|
||||
("Artist - BeetsSong", "Artist", "BeetsSong"),
|
||||
("Beets Song", "", "Beets Song"),
|
||||
("Beets Song Artist", "Artist", "Beets Song"),
|
||||
(
|
||||
"example.com | seets bong lyrics by John doe",
|
||||
"John Doe",
|
||||
"seets-bong",
|
||||
"BeetsSong (feat. Other & Another) - Artist",
|
||||
"Artist",
|
||||
"BeetsSong (feat. Other & Another)",
|
||||
),
|
||||
(
|
||||
(
|
||||
"Beets song lyrics by Artist - original song full text. "
|
||||
"Official Beets song lyrics, 2024 version | LyricsMode.com"
|
||||
),
|
||||
"Artist",
|
||||
"Beets song",
|
||||
),
|
||||
("foo", "Sun O)))", "foo"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("url", ["http://doesntmatter.com"])
|
||||
def test_make_search_result(
|
||||
self, backend, url_title, artist, expected_title
|
||||
self, backend, search_item, expected_artist, expected_title
|
||||
):
|
||||
result = backend.make_search_result(
|
||||
artist, "https://example.com", url_title
|
||||
)
|
||||
assert result.title == expected_title
|
||||
result = backend.make_search_result("Artist", "Beets song", search_item)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"lyrics",
|
||||
[
|
||||
"LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
|
||||
"""All material found on this site is property\n
|
||||
of mywickedsongtext brand""",
|
||||
"""
|
||||
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
|
||||
as they'll be released by $ARTIST, check back soon!
|
||||
In case you have the lyrics to $TITLE and want to send them to us, fill out
|
||||
the following form.
|
||||
""",
|
||||
],
|
||||
)
|
||||
def test_bad_lyrics(self, backend, lyrics):
|
||||
assert not backend.is_lyrics(lyrics)
|
||||
assert result.artist == expected_artist
|
||||
assert result.title == expected_title
|
||||
|
||||
|
||||
class TestGeniusLyrics(LyricsBackendTest):
|
||||
|
|
|
|||
Loading…
Reference in a new issue