Mirror of https://github.com/beetbox/beets.git
Google: Refactor and improve

* Type the response data that the Google Custom Search API returns.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch, as we cannot access their pages.
* Improve parsing of the URL title:
  - Handle long URL titles that get truncated (end with an ellipsis) for
    long queries.
  - Remove domains starting with 'www'.
  - Parse the title AND the artist. Previously this would only parse the
    title and fetch lyrics even when the artist did not match.
* Remove the now-redundant credits cleanup and checks for valid lyrics.
parent 12c5eaae5e
commit c5c4138d66

3 changed files with 141 additions and 130 deletions
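Before the diff itself, here is a minimal, self-contained sketch of the new URL-title parsing described above. The two regular expressions are the ones the diff introduces in the Google backend; the parse_url_title helper and the sample input are purely illustrative (the real logic lives in Google.make_search_result, which additionally matches the resulting parts against the known artist and title).

import re

URL_TITLE_NOISE_RE = re.compile(
    r"""
    \b
    (
        paroles(\ et\ traduction|\ de\ chanson)?
        | letras?(\ de)?
        | liedtexte
        | original\ song\ full\ text\.
        | official
        | 20[12]\d\ version
        | (absolute\ |az)?lyrics(\ complete)?
        | www\S+
        | \S+\.(com|net|mus\.br)
    )
    ([^\w.]|$)
    """,
    re.IGNORECASE | re.VERBOSE,
)
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")


def parse_url_title(url_title: str) -> list[str]:
    """Strip noise from a search result title and split it into candidate parts."""
    clean_title = URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
    # `dict.fromkeys` removes duplicates while keeping the order
    return list(dict.fromkeys(URL_TITLE_PARTS_RE.split(clean_title)))


print(parse_url_title("www.azlyrics.com | Beats song by Artist"))
# -> ['Beats song', 'Artist']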
@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import Any
 
-from typing_extensions import TypedDict
+from typing_extensions import NotRequired, TypedDict
 
 JSONDict = dict[str, Any]
 
@@ -84,3 +84,32 @@ class GeniusAPI:
     class Search(TypedDict):
         response: GeniusAPI.SearchResponse
 
+
+class GoogleCustomSearchAPI:
+    class Response(TypedDict):
+        """Search response from the Google Custom Search API.
+
+        If the search returns no results, the :attr:`items` field is not found.
+        """
+
+        items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+    class Item(TypedDict):
+        """A Google Custom Search API result item.
+
+        :attr:`title` field is shown to the user in the search interface, thus
+        it gets truncated with an ellipsis for longer queries. For most
+        results, the full title is available as ``og:title`` metatag found
+        under the :attr:`pagemap` field. Note neither this metatag nor the
+        ``pagemap`` field is guaranteed to be present in the data.
+        """
+
+        title: str
+        link: str
+        pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+    class Pagemap(TypedDict):
+        """Pagemap data with a single meta tags dict in a list."""
+
+        metatags: list[JSONDict]
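As a quick illustration of the shape these TypedDicts describe, the following hand-built payload mirrors the access pattern the plugin uses further down in Google.make_search_result and Google.search. All titles, links, and metatag values here are invented; in the plugin the data comes from fetch_json against the Custom Search endpoint.

from typing import Any

JSONDict = dict[str, Any]

# Illustrative only: a response in the shape of GoogleCustomSearchAPI.Response.
response: JSONDict = {
    "items": [
        {
            # the display title may be truncated with an ellipsis
            "title": "Artist - Beets Song Lyrics | AZLyrics ...",
            "link": "https://www.azlyrics.com/lyrics/artist/beetssong.html",
            "pagemap": {
                "metatags": [{"og:title": "Artist - Beets Song Lyrics | AZLyrics"}]
            },
        },
        {
            # neither `pagemap` nor `og:title` is guaranteed to be present
            "title": "Beets Song",
            "link": "https://example.com/beets-song",
        },
    ]
}

for item in response.get("items", []):  # `items` is NotRequired
    # prefer the full og:title metatag, falling back to the display title
    full_title = (
        item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
        or item["title"]
    )
    print(full_title, item["link"])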
@@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
 from html import unescape
 from http import HTTPStatus
 from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlencode
 
 import requests
 from unidecode import unidecode
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
     from beets.importer import ImportTask
     from beets.library import Item
 
-    from ._typing import GeniusAPI, JSONDict, LRCLibAPI
+    from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
 
 try:
     from bs4 import BeautifulSoup
@@ -492,7 +492,9 @@ class SearchBackend(Backend):
     def fetch(self, artist: str, title: str, *_) -> str | None:
         """Fetch lyrics for the given artist and title."""
         for result in self.get_results(artist, title):
-            if lyrics := self.scrape(self.fetch_text(result.url)):
+            if (html := self.fetch_text(result.url)) and (
+                lyrics := self.scrape(html)
+            ):
                 return lyrics
 
         return None
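The chained walrus guard above only calls scrape when fetch_text actually returned text. A tiny standalone illustration of why the extra check matters when a backend's fetch_text can come back empty after a handled request error (the functions below are stand-ins, not the plugin's):

def fetch_text(url: str) -> str | None:
    # pretend the HTTP request failed and the error was handled
    return None


def scrape(html: str) -> str | None:
    # would raise if called with None; the guard below prevents that
    return html.strip() or None


def fetch(url: str) -> str | None:
    # mirrors the pattern in SearchBackend.fetch above
    if (html := fetch_text(url)) and (lyrics := scrape(html)):
        return lyrics
    return None


print(fetch("https://example.com"))  # None, and scrape() was never called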
@@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
         return None
 
 
-def remove_credits(text):
-    """Remove first/last line of text if it contains the word 'lyrics'
-
-    eg 'Lyrics by songsdatabase.com'
-    """
-    textlines = text.split("\n")
-    credits = None
-    for i in (0, -1):
-        if textlines and "lyrics" in textlines[i].lower():
-            credits = textlines.pop(i)
-    if credits:
-        text = "\n".join(textlines)
-    return text
-
 collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
 
 
@@ -617,87 +605,97 @@ class Google(SearchBackend):
 
     SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
 
-    def is_lyrics(self, text, artist=None):
-        """Determine whether the text seems to be valid lyrics."""
-        if not text:
-            return False
-        bad_triggers_occ = []
-        nb_lines = text.count("\n")
-        if nb_lines <= 1:
-            self.debug("Ignoring too short lyrics '{}'", text)
-            return False
-        elif nb_lines < 5:
-            bad_triggers_occ.append("too_short")
-        else:
-            # Lyrics look legit, remove credits to avoid being penalized
-            # further down
-            text = remove_credits(text)
-
-        bad_triggers = ["lyrics", "copyright", "property", "links"]
-        if artist:
-            bad_triggers += [artist]
-
-        for item in bad_triggers:
-            bad_triggers_occ += [item] * len(
-                re.findall(r"\W%s\W" % item, text, re.I)
-            )
-
-        if bad_triggers_occ:
-            self.debug("Bad triggers detected: {}", bad_triggers_occ)
-        return len(bad_triggers_occ) < 2
-
-    BY_TRANS = ["by", "par", "de", "von"]
-    LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+    #: Exclude some letras.mus.br pages which do not contain lyrics.
+    EXCLUDE_PAGES = [
+        "significado.html",
+        "traduccion.html",
+        "traducao.html",
+        "significados.html",
+    ]
+
+    #: Regular expression to match noise in the URL title.
+    URL_TITLE_NOISE_RE = re.compile(
+        r"""
+        \b
+        (
+            paroles(\ et\ traduction|\ de\ chanson)?
+            | letras?(\ de)?
+            | liedtexte
+            | original\ song\ full\ text\.
+            | official
+            | 20[12]\d\ version
+            | (absolute\ |az)?lyrics(\ complete)?
+            | www\S+
+            | \S+\.(com|net|mus\.br)
+        )
+        ([^\w.]|$)
+        """,
+        re.IGNORECASE | re.VERBOSE,
+    )
+    #: Split cleaned up URL title into artist and title parts.
+    URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
+
+    def fetch_text(self, *args, **kwargs) -> str:
+        """Handle an error so that we can continue with the next URL."""
+        with self.handle_request():
+            return super().fetch_text(*args, **kwargs)
+
+    @staticmethod
+    def get_part_dist(artist: str, title: str, part: str) -> float:
+        """Return the distance between the given part and the artist and title.
+
+        A number between -1 and 1 is returned, where -1 means the part is
+        closer to the artist and 1 means it is closer to the title.
+        """
+        return string_dist(artist, part) - string_dist(title, part)
 
+    @classmethod
     def make_search_result(
-        self, artist: str, url_link: str, url_title: str
+        cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
     ) -> SearchResult:
         """Parse artist and title from the URL title and return a search result."""
-        url_title_slug = slug(url_title)
-        artist = slug(artist)
-        sitename = urlparse(url_link).netloc
-
-        # or try extracting song title from URL title and check if
-        # they are close enough
-        tokens = (
-            [by + "-" + artist for by in self.BY_TRANS]
-            + [artist, sitename, sitename.replace("www.", "")]
-            + self.LYRICS_TRANS
-        )
-        song_title = re.sub(
-            "(%s)" % "|".join(tokens), "", url_title_slug
-        ).strip("-")
-
-        return SearchResult(artist, song_title, url_link)
+        url_title = (
+            # get full title from metatags if available
+            item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+            # default to the display title
+            or item["title"]
+        )
+        clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+        # split it into parts which may be part of the artist or the title
+        # `dict.fromkeys` removes duplicates keeping the order
+        parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
+
+        if len(parts) == 1:
+            part = parts[0]
+            if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+                # artist and title may not have a separator
+                result_title = m[1]
+                result_artist = part.replace(m[0], "")
+            else:
+                # assume that this is the title
+                result_artist, result_title = "", parts[0]
+        else:
+            # sort parts by their similarity to the artist
+            parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+            result_artist, result_title = parts[0], " ".join(parts[1:])
+
+        return SearchResult(result_artist, result_title, item["link"])
 
     def search(self, artist: str, title: str) -> Iterable[SearchResult]:
         params = {
             "key": self.config["google_API_key"].as_str(),
             "cx": self.config["google_engine_ID"].as_str(),
             "q": f"{artist} {title}",
+            "siteSearch": "www.musixmatch.com",
+            "siteSearchFilter": "e",
+            "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
         }
 
-        data = self.fetch_json(self.SEARCH_URL, params=params)
+        data: GoogleCustomSearchAPI.Response = self.fetch_json(
+            self.SEARCH_URL, params=params
+        )
         for item in data.get("items", []):
-            yield self.make_search_result(artist, item["link"], item["title"])
-
-    def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
-        return super().get_results(artist, slug(title))
-
-    def fetch(self, artist: str, title: str, *_) -> str | None:
-        for result in self.get_results(artist, title):
-            with self.handle_request():
-                lyrics = self.scrape(self.fetch_text(result.url))
-            if not lyrics:
-                continue
-
-            if self.is_lyrics(lyrics, artist):
-                self.debug(
-                    "Got lyrics from {}", urlparse(result.url).netloc
-                )
-                return lyrics
-
-        return None
+            yield self.make_search_result(artist, title, item)
 
     @classmethod
     def scrape(cls, html: str) -> str | None:
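To see how the multi-part case resolves, here is a small sketch of the get_part_dist sorting step defined in the hunk above. It assumes string_dist is beets' string distance helper importable from beets.autotag.hooks (the plugin's actual import is not shown in this hunk), where 0.0 means identical strings and larger values mean more dissimilar ones.

from beets.autotag.hooks import string_dist


def get_part_dist(artist: str, title: str, part: str) -> float:
    # negative when `part` is closer to the artist, positive when closer to the title
    return string_dist(artist, part) - string_dist(title, part)


parts = ["Beets Song", "Artist"]
parts.sort(key=lambda p: get_part_dist("Artist", "Beets Song", p))
# The part most similar to the artist sorts first; the remainder becomes the title.
print(parts)  # expected: ['Artist', 'Beets Song']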
@@ -101,24 +101,6 @@ class TestLyricsUtils:
         assert list(actual_titles) == [title, *expected_extra_titles]
 
-    @pytest.mark.parametrize(
-        "initial_lyrics, expected",
-        [
-            ("Verse\nLyrics credit in the last line", "Verse"),
-            ("Lyrics credit in the first line\nVerse", "Verse"),
-            (
-                """Verse
-Lyrics mentioned somewhere in the middle
-Verse""",
-                """Verse
-Lyrics mentioned somewhere in the middle
-Verse""",
-            ),
-        ],
-    )
-    def test_remove_credits(self, initial_lyrics, expected):
-        assert lyrics.remove_credits(initial_lyrics) == expected
-
     @pytest.mark.parametrize(
         "initial_text, expected",
         [
@@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
 class TestGoogleLyrics(LyricsBackendTest):
     """Test scraping heuristics on a fake html page."""
 
-    TITLE = "Beets song"
-
     @pytest.fixture(scope="class")
     def backend_name(self):
         return "google"
@@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
     def file_name(self):
         return "examplecom/beetssong"
 
+    @pytest.fixture
+    def search_item(self, url_title, url):
+        return {"title": url_title, "link": url}
+
     def test_mocked_source_ok(self, backend, lyrics_html):
         """Test that lyrics of the mocked page are correctly scraped"""
         result = backend.scrape(lyrics_html).lower()
 
         assert result
-        assert backend.is_lyrics(result)
-        assert PHRASE_BY_TITLE[self.TITLE] in result
+        assert PHRASE_BY_TITLE["Beets song"] in result
 
     @pytest.mark.parametrize(
-        "url_title, artist, expected_title",
+        "url_title, expected_artist, expected_title",
         [
-            ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
-            ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+            ("Artist - beets song Lyrics", "Artist", "beets song"),
+            ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+            ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+            ("foo", "", "foo"),
+            ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+            ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+            ("Letra de Artist - Beets ...", "Artist", "Beets"),
+            ("Artist Beets Song", "Artist", "Beets Song"),
+            ("BeetsSong - Artist", "Artist", "BeetsSong"),
+            ("Artist - BeetsSong", "Artist", "BeetsSong"),
+            ("Beets Song", "", "Beets Song"),
+            ("Beets Song Artist", "Artist", "Beets Song"),
             (
-                "example.com | seets bong lyrics by John doe",
-                "John Doe",
-                "seets-bong",
+                "BeetsSong (feat. Other & Another) - Artist",
+                "Artist",
+                "BeetsSong (feat. Other & Another)",
+            ),
+            (
+                (
+                    "Beets song lyrics by Artist - original song full text. "
+                    "Official Beets song lyrics, 2024 version | LyricsMode.com"
+                ),
+                "Artist",
+                "Beets song",
             ),
-            ("foo", "Sun O)))", "foo"),
         ],
     )
+    @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
     def test_make_search_result(
-        self, backend, url_title, artist, expected_title
+        self, backend, search_item, expected_artist, expected_title
     ):
-        result = backend.make_search_result(
-            artist, "https://example.com", url_title
-        )
-        assert result.title == expected_title
-
-    @pytest.mark.parametrize(
-        "lyrics",
-        [
-            "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
-            """All material found on this site is property\n
-                     of mywickedsongtext brand""",
-            """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
-        ],
-    )
-    def test_bad_lyrics(self, backend, lyrics):
-        assert not backend.is_lyrics(lyrics)
+        result = backend.make_search_result("Artist", "Beets song", search_item)
+
+        assert result.artist == expected_artist
+        assert result.title == expected_title
 
 
 class TestGeniusLyrics(LyricsBackendTest):