Google: Refactor and improve

* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
  - Handle long URL titles that get truncated (end with ellipsis) for
    long searches
  - Remove domains starting with 'www'
  - Parse the title AND the artist. Previously this would only parse the
    title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
This commit is contained in:
Šarūnas Nejus 2024-10-13 16:36:41 +01:00
parent 12c5eaae5e
commit c5c4138d66
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
3 changed files with 141 additions and 130 deletions

View file

@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Any from typing import Any
from typing_extensions import TypedDict from typing_extensions import NotRequired, TypedDict
JSONDict = dict[str, Any] JSONDict = dict[str, Any]
@ -84,3 +84,32 @@ class GeniusAPI:
class Search(TypedDict): class Search(TypedDict):
response: GeniusAPI.SearchResponse response: GeniusAPI.SearchResponse
class GoogleCustomSearchAPI:
class Response(TypedDict):
"""Search response from the Google Custom Search API.
If the search returns no results, the :attr:`items` field is not present.
"""
items: NotRequired[list[GoogleCustomSearchAPI.Item]]
class Item(TypedDict):
"""A Google Custom Search API result item.
:attr:`title` field is shown to the user in the search interface, thus
it gets truncated with an ellipsis for longer queries. For most
results, the full title is available as ``og:title`` metatag found
under the :attr:`pagemap` field. Note neither this metatag nor the
``pagemap`` field is guaranteed to be present in the data.
"""
title: str
link: str
pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
class Pagemap(TypedDict):
"""Pagemap data with a single meta tags dict in a list."""
metatags: list[JSONDict]

View file

@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
from html import unescape from html import unescape
from http import HTTPStatus from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse from urllib.parse import quote, urlencode
import requests import requests
from unidecode import unidecode from unidecode import unidecode
@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask from beets.importer import ImportTask
from beets.library import Item from beets.library import Item
from ._typing import GeniusAPI, JSONDict, LRCLibAPI from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
try: try:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -492,7 +492,9 @@ class SearchBackend(Backend):
def fetch(self, artist: str, title: str, *_) -> str | None: def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title.""" """Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title): for result in self.get_results(artist, title):
if lyrics := self.scrape(self.fetch_text(result.url)): if (html := self.fetch_text(result.url)) and (
lyrics := self.scrape(html)
):
return lyrics return lyrics
return None return None
@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
return None return None
def remove_credits(text):
"""Remove first/last line of text if it contains the word 'lyrics'
eg 'Lyrics by songsdatabase.com'
"""
textlines = text.split("\n")
credits = None
for i in (0, -1):
if textlines and "lyrics" in textlines[i].lower():
credits = textlines.pop(i)
if credits:
text = "\n".join(textlines)
return text
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n") collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@ -617,87 +605,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1" SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None): #: Exclude some letras.mus.br pages which do not contain lyrics.
"""Determine whether the text seems to be valid lyrics.""" EXCLUDE_PAGES = [
if not text: "significado.html",
return False "traduccion.html",
bad_triggers_occ = [] "traducao.html",
nb_lines = text.count("\n") "significados.html",
if nb_lines <= 1: ]
self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
else:
# Lyrics look legit, remove credits to avoid being penalized
# further down
text = remove_credits(text)
bad_triggers = ["lyrics", "copyright", "property", "links"] #: Regular expression to match noise in the URL title.
if artist: URL_TITLE_NOISE_RE = re.compile(
bad_triggers += [artist] r"""
\b
(
paroles(\ et\ traduction|\ de\ chanson)?
| letras?(\ de)?
| liedtexte
| original\ song\ full\ text\.
| official
| 20[12]\d\ version
| (absolute\ |az)?lyrics(\ complete)?
| www\S+
| \S+\.(com|net|mus\.br)
)
([^\w.]|$)
""",
re.IGNORECASE | re.VERBOSE,
)
#: Split cleaned up URL title into artist and title parts.
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
for item in bad_triggers: def fetch_text(self, *args, **kwargs) -> str:
bad_triggers_occ += [item] * len( """Handle an error so that we can continue with the next URL."""
re.findall(r"\W%s\W" % item, text, re.I) with self.handle_request():
) return super().fetch_text(*args, **kwargs)
if bad_triggers_occ: @staticmethod
self.debug("Bad triggers detected: {}", bad_triggers_occ) def get_part_dist(artist: str, title: str, part: str) -> float:
return len(bad_triggers_occ) < 2 """Return the distance between the given part and the artist and title.
BY_TRANS = ["by", "par", "de", "von"] A number between -1 and 1 is returned, where -1 means the part is
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] closer to the artist and 1 means it is closer to the title.
"""
return string_dist(artist, part) - string_dist(title, part)
@classmethod
def make_search_result( def make_search_result(
self, artist: str, url_link: str, url_title: str cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult: ) -> SearchResult:
"""Parse artist and title from the URL title and return a search result.""" """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title) url_title = (
artist = slug(artist) # get full title from metatags if available
sitename = urlparse(url_link).netloc item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
# default to the display title
# or try extracting song title from URL title and check if or item["title"]
# they are close enough
tokens = (
[by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
) )
song_title = re.sub( clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
"(%s)" % "|".join(tokens), "", url_title_slug # split it into parts which may be part of the artist or the title
).strip("-") # `dict.fromkeys` removes duplicates keeping the order
parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
return SearchResult(artist, song_title, url_link) if len(parts) == 1:
part = parts[0]
if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
# artist and title may not have a separator
result_title = m[1]
result_artist = part.replace(m[0], "")
else:
# assume that this is the title
result_artist, result_title = "", parts[0]
else:
# sort parts by their similarity to the artist
parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
result_artist, result_title = parts[0], " ".join(parts[1:])
return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]: def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = { params = {
"key": self.config["google_API_key"].as_str(), "key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(), "cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}", "q": f"{artist} {title}",
"siteSearch": "www.musixmatch.com",
"siteSearchFilter": "e",
"excludeTerms": ", ".join(self.EXCLUDE_PAGES),
} }
data = self.fetch_json(self.SEARCH_URL, params=params) data: GoogleCustomSearchAPI.Response = self.fetch_json(
self.SEARCH_URL, params=params
)
for item in data.get("items", []): for item in data.get("items", []):
yield self.make_search_result(artist, item["link"], item["title"]) yield self.make_search_result(artist, title, item)
def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
return super().get_results(artist, slug(title))
def fetch(self, artist: str, title: str, *_) -> str | None:
for result in self.get_results(artist, title):
with self.handle_request():
lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
self.debug(
"Got lyrics from {}", urlparse(result.url).netloc
)
return lyrics
return None
@classmethod @classmethod
def scrape(cls, html: str) -> str | None: def scrape(cls, html: str) -> str | None:

View file

@ -101,24 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles] assert list(actual_titles) == [title, *expected_extra_titles]
@pytest.mark.parametrize(
"initial_lyrics, expected",
[
("Verse\nLyrics credit in the last line", "Verse"),
("Lyrics credit in the first line\nVerse", "Verse"),
(
"""Verse
Lyrics mentioned somewhere in the middle
Verse""",
"""Verse
Lyrics mentioned somewhere in the middle
Verse""",
),
],
)
def test_remove_credits(self, initial_lyrics, expected):
assert lyrics.remove_credits(initial_lyrics) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize(
"initial_text, expected", "initial_text, expected",
[ [
@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
class TestGoogleLyrics(LyricsBackendTest): class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page.""" """Test scraping heuristics on a fake html page."""
TITLE = "Beets song"
@pytest.fixture(scope="class") @pytest.fixture(scope="class")
def backend_name(self): def backend_name(self):
return "google" return "google"
@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
def file_name(self): def file_name(self):
return "examplecom/beetssong" return "examplecom/beetssong"
@pytest.fixture
def search_item(self, url_title, url):
return {"title": url_title, "link": url}
def test_mocked_source_ok(self, backend, lyrics_html): def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped""" """Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower() result = backend.scrape(lyrics_html).lower()
assert result assert result
assert backend.is_lyrics(result) assert PHRASE_BY_TITLE["Beets song"] in result
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize( @pytest.mark.parametrize(
"url_title, artist, expected_title", "url_title, expected_artist, expected_title",
[ [
("John Doe - beets song Lyrics", "John Doe", "beets-song"), ("Artist - beets song Lyrics", "Artist", "beets song"),
("example.com | Beats song by John doe", "John Doe", "beats-song"), ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
("foo", "", "foo"),
("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
("Letra de Artist - Beets Song", "Artist", "Beets Song"),
("Letra de Artist - Beets ...", "Artist", "Beets"),
("Artist Beets Song", "Artist", "Beets Song"),
("BeetsSong - Artist", "Artist", "BeetsSong"),
("Artist - BeetsSong", "Artist", "BeetsSong"),
("Beets Song", "", "Beets Song"),
("Beets Song Artist", "Artist", "Beets Song"),
( (
"example.com | seets bong lyrics by John doe", "BeetsSong (feat. Other & Another) - Artist",
"John Doe", "Artist",
"seets-bong", "BeetsSong (feat. Other & Another)",
),
(
(
"Beets song lyrics by Artist - original song full text. "
"Official Beets song lyrics, 2024 version | LyricsMode.com"
),
"Artist",
"Beets song",
), ),
("foo", "Sun O)))", "foo"),
], ],
) )
@pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result( def test_make_search_result(
self, backend, url_title, artist, expected_title self, backend, search_item, expected_artist, expected_title
): ):
result = backend.make_search_result( result = backend.make_search_result("Artist", "Beets song", search_item)
artist, "https://example.com", url_title
)
assert result.title == expected_title
@pytest.mark.parametrize( assert result.artist == expected_artist
"lyrics", assert result.title == expected_title
[
"LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
"""All material found on this site is property\n
of mywickedsongtext brand""",
"""
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
as they'll be released by $ARTIST, check back soon!
In case you have the lyrics to $TITLE and want to send them to us, fill out
the following form.
""",
],
)
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
class TestGeniusLyrics(LyricsBackendTest): class TestGeniusLyrics(LyricsBackendTest):