From 2ff57505d8aa5dfd233b2ce4b5f932a87a9cca7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 12 Oct 2024 02:20:37 +0100
Subject: [PATCH 01/25] Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and
Google backends.
- Create a new `SearchBackend` base class with a `check_match` method
  that checks whether a candidate's artist and title match the target.
- Officially adopt the previously undocumented `dist_thresh`
  configuration option and mention it in the docs. It controls the
  maximum allowable distance for matching artist and title names.
These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
---
beetsplug/lyrics.py | 117 +++++++++++++++++++++---------------
docs/changelog.rst | 16 +++--
docs/plugins/lyrics.rst | 6 ++
test/plugins/test_lyrics.py | 40 +++++++++++-
4 files changed, 125 insertions(+), 54 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d1d715ce4..84bc26ea7 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,10 +16,10 @@
from __future__ import annotations
-import difflib
import errno
import itertools
import json
+import math
import os.path
import re
import struct
@@ -30,7 +30,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -38,6 +38,7 @@ from unidecode import unidecode
import beets
from beets import plugins, ui
+from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
@@ -488,15 +489,47 @@ class MusiXmatch(DirectBackend):
return lyrics
-class Genius(Backend):
+class SearchBackend(Backend):
+ REQUIRES_BS = True
+
+ @cached_property
+ def dist_thresh(self) -> float:
+ return self.config["dist_thresh"].get(float)
+
+ def check_match(
+ self, target_artist: str, target_title: str, artist: str, title: str
+ ) -> bool:
+ """Check if the given artist and title are 'good enough' match."""
+ max_dist = max(
+ string_dist(target_artist, artist),
+ string_dist(target_title, title),
+ )
+
+ if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+ return True
+
+ if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+ # log out the candidate that did not make it but was close.
+ # This may show a matching candidate with some noise in the name
+ self._log.debug(
+ "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+ artist,
+ title,
+ target_artist,
+ target_title,
+ max_dist,
+ )
+
+ return False
+
+
+class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
- REQUIRES_BS = True
-
base_url = "https://api.genius.com"
def __init__(self, config, log):
@@ -519,19 +552,15 @@ class Genius(Backend):
self._log.debug("Genius API request returned invalid JSON")
return None
- # find a matching artist in the json
+ check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
- hit_artist = hit["result"]["primary_artist"]["name"]
-
- if slug(hit_artist) == slug(artist):
- html = self.fetch_url(hit["result"]["url"])
+ result = hit["result"]
+ if check(result["primary_artist"]["name"], result["title"]):
+ html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
- self._log.debug(
- "Genius failed to find a matching artist for '{0}'", artist
- )
return None
def _search(self, artist, title):
@@ -727,10 +756,9 @@ def scrape_lyrics_from_html(html):
return None
-class Google(Backend):
+class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
- REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None):
@@ -778,21 +806,20 @@ class Google(Backend):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(self, url_link, url_title, title, artist):
+ def is_page_candidate(
+ self, artist: str, title: str, url_link: str, url_title: str
+ ) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title = self.slugify(title.lower())
- artist = self.slugify(artist.lower())
- sitename = re.search(
- "//([^/]+)/.*", self.slugify(url_link.lower())
- ).group(1)
- url_title = self.slugify(url_title.lower())
-
- # Check if URL title contains song title (exact match)
- if url_title.find(title) != -1:
+ title_slug = self.slugify(title.lower())
+ url_title_slug = self.slugify(url_title.lower())
+ if title_slug in url_title_slug:
return True
+ artist = self.slugify(artist.lower())
+ sitename = urlparse(url_link).netloc
+
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
@@ -801,12 +828,9 @@ class Google(Backend):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+ song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
- song_title = song_title.strip("_|")
- typo_ratio = 0.9
- ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
- return ratio >= typo_ratio
+ return self.check_match(artist, title_slug, artist, song_title)
def fetch(self, artist: str, title: str, *_) -> str | None:
params = {
@@ -828,24 +852,21 @@ class Google(Backend):
self._log.debug("google backend error: {0}", reason)
return None
- if "items" in data.keys():
- for item in data["items"]:
- url_link = item["link"]
- url_title = item.get("title", "")
- if not self.is_page_candidate(
- url_link, url_title, title, artist
- ):
- continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ check_candidate = partial(self.is_page_candidate, artist, title)
+ for item in data.get("items", []):
+ url_link = item["link"]
+ if not check_candidate(url_link, item.get("title", "")):
+ continue
+ html = self.fetch_url(url_link)
+ if not html:
+ continue
+ lyrics = scrape_lyrics_from_html(html)
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
@@ -869,6 +890,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
+ "dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -880,7 +902,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
- "dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 46fa3b64e..25ae5bc10 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,15 @@ been dropped.
New features:
+* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
+ provides more fine-grained control over how pre-populated genre tags are
+ handled. The ``force`` option now behaves in a more conventional manner.
+ :bug:`4982`
+* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
+ control the maximum allowed distance between the lyrics search result and the
+ tagged item's artist and title. This is useful for preventing false positives
+ when fetching lyrics.
+
Bug fixes:
* :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
@@ -55,10 +64,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
-* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
- provides more fine-grained control over how pre-populated genre tags are
- handled. The ``force`` option now behaves in a more conventional manner.
- :bug:`4982`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+ to match lyrics when there is a slight variation in the artist name.
+ :bug:`4791`
For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70..d080b1f94 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+ combination of the music file and lyrics candidate to consider them a match.
+ Lower values will make the plugin more strict, higher values will make it
+ more lenient. This does not apply to the ``lrclib`` backend as it matches
+ durations.
+ Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 47c983770..812399698 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -161,6 +161,42 @@ class TestLyricsUtils:
assert lyrics.slug(text) == expected
+class TestSearchBackend:
+ @pytest.fixture
+ def backend(self, dist_thresh):
+ plugin = lyrics.LyricsPlugin()
+ plugin.config.set({"dist_thresh": dist_thresh})
+ return lyrics.SearchBackend(plugin.config, plugin._log)
+
+ @pytest.mark.parametrize(
+ "dist_thresh, target_artist, artist, should_match",
+ [
+ (0.11, "Target Artist", "Target Artist", True),
+ (0.11, "Target Artist", "Target Artis", True),
+ (0.11, "Target Artist", "Target Arti", False),
+ (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+ (0.11, "beets song", "beats song", True),
+ (0.10, "beets song", "beats song", False),
+ (
+ 0.11,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ False,
+ ),
+ (
+ 0.12,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ True,
+ ),
+ ],
+ )
+ def test_check_match(self, backend, target_artist, artist, should_match):
+ assert (
+ backend.check_match(target_artist, "", artist, "") == should_match
+ )
+
+
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ class TestGoogleLyrics(LyricsBackendTest):
self, backend, lyrics_html, url_title, artist, should_be_candidate
):
result = backend.is_page_candidate(
+ artist,
+ self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
- self.TITLE,
- artist,
)
assert bool(result) == should_be_candidate
From c40db1034acb5e8164fa113860427ace2e7308c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Tue, 27 Aug 2024 13:43:31 +0100
Subject: [PATCH 02/25] Make lyrics plugin documentation slightly more clear
---
beetsplug/lyrics.py | 6 +-
docs/plugins/lyrics.rst | 142 ++++++++++++++++++++--------------------
2 files changed, 76 insertions(+), 72 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 84bc26ea7..2f8998d96 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -893,8 +893,10 @@ class LyricsPlugin(plugins.BeetsPlugin):
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
- "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
- "76V-uFL5jks5dNvcGCdarqFjDhP9c",
+ "genius_api_key": (
+ "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
+ "76V-uFL5jks5dNvcGCdarqFjDhP9c"
+ ),
"fallback": None,
"force": False,
"local": False,
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d080b1f94..f034cf47a 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
=============
The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
.. _Genius.com: https://genius.com/
.. _Tekstowo.pl: https://www.tekstowo.pl/
.. _LRCLIB: https://lrclib.net/
-Fetch Lyrics During Import
---------------------------
+Install
+-------
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+Firstly, enable ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then, install ``beets`` with ``lyrics`` extra
.. code-block:: bash
pip install "beets[lyrics]"
+Fetch Lyrics During Import
+--------------------------
+
When importing new files, beets will now fetch lyrics for files that don't
already have them. The lyrics will be stored in the beets database. If the
``import.write`` config option is on, then the lyrics will also be written to
@@ -29,52 +31,52 @@ the files' tags.
Configuration
-------------
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+ lyrics:
+ auto: yes
+ bing_client_secret: null
+ bing_lang_from: []
+ bing_lang_to: null
+ dist_thresh: 0.11
+ fallback: null
+ force: no
+ google_API_key: null
+ google_engine_ID: 009217259823014548361:lndtuqkycfu
+ sources: [lrclib, google, genius, tekstowo]
+ synced: no
+
+The available options are:
- **auto**: Fetch lyrics automatically during import.
- Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
- (to :ref:`lyrics-translation`)
+ (see :ref:`lyrics-translation`)
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
- Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
- Default: None.
- **dist_thresh**: The maximum distance between the artist and title
combination of the music file and lyrics candidate to consider them a match.
Lower values will make the plugin more strict, higher values will make it
more lenient. This does not apply to the ``lrclib`` backend as it matches
durations.
- Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
- Default: None.
- **force**: By default, beets won't fetch lyrics if the files already have
ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
- Default: ``no``.
- **google_API_key**: Your Google API key (to enable the Google Custom Search
backend).
- Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The `beets custom search engine`_, which gathers an updated list of
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
- to all available sources.
- Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
- ``google`` source will be automatically deactivated if no ``google_API_key``
- is setup.
- The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
- BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
- lyrics:
- fallback: ''
- google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
- google_engine_ID: 009217259823014548361:lndtuqkycfu
+ to all available sources. The ``google`` source will be automatically
+ deactivated if no ``google_API_key`` is setup.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+ Currently ``lrclib`` is the only source that provides them.
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
@@ -89,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
lyrics will be added to the beets database and, if ``import.write`` is on,
embedded into files' metadata.
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Inversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which show lyrics faster without using the network at all.
Rendering Lyrics into Other Formats
-----------------------------------
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
command is run. They are not overwritten on subsequent runs, so you can safely
modify these files to customize the output.
+Sphinx supports various `builders`_, see a few suggestions:
+
+
+.. admonition:: Build an HTML version
+
+ ::
+
+ sphinx-build -b html . _build/html
+
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
+
+ ::
+
+ sphinx-build -b epub3 . _build/epub
+
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+ ::
+
+ sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
.. _Sphinx: https://www.sphinx-doc.org/
.. _reStructuredText: http://docutils.sourceforge.net/rst.html
-
-Sphinx supports various `builders
-`_, but here are a
-few suggestions.
-
- * Build an HTML version::
-
- sphinx-build -b html . _build/html
-
- * Build an ePUB3 formatted file, usable on ebook readers::
-
- sphinx-build -b epub3 . _build/epub
-
- * Build a PDF file, which incidentally also builds a LaTeX file::
-
- sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
-
-.. _activate-google-custom-search:
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
Activate Google Custom Search
------------------------------
You need to `register for a Google API key`_. Set the ``google_API_key``
configuration option to your key.
+
Then add ``google`` to the list of sources in your configuration (or use
default list, which includes it as long as you have an API key).
If you use default ``google_engine_ID``, we recommend limiting the sources to
``google`` as the other sources are already included in the Google results.
-.. _register for a Google API key: https://console.developers.google.com/
-
Optionally, you can `define a custom search engine`_. Get your search engine's
token and use it for your ``google_engine_ID`` configuration option. By
default, beets use a list of sources known to be scrapeable.
-.. _define a custom search engine: https://www.google.com/cse/all
-
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on other declared data sources.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
-
-These backends are enabled by default.
.. _lyrics-translation:
@@ -167,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
to the `Microsoft Translator API`_. Follow the four steps process, specifically
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target ``language code``.
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
From 06eac79c0d5f58c63e0ca23ea1897ad87bd7f585 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 4 Sep 2024 04:15:46 +0100
Subject: [PATCH 03/25] Centralize requests setup with requests.Session
Improve requests performance with requests.Session which uses connection
pooling for repeated requests to the same host.
Additionally, this centralizes request configuration, making sure that
we use the same timeout and provide beets user agent for all requests.
---
beetsplug/lyrics.py | 68 +++++++++++++++++----------------------------
setup.cfg | 4 +--
2 files changed, 28 insertions(+), 44 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2f8998d96..8e9452a2c 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,6 +16,7 @@
from __future__ import annotations
+import atexit
import errno
import itertools
import json
@@ -24,13 +25,12 @@ import os.path
import re
import struct
import unicodedata
-import warnings
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlparse
import requests
from typing_extensions import TypedDict
@@ -106,6 +106,22 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
+class TimeoutSession(requests.Session):
+ def request(self, *args, **kwargs):
+ kwargs.setdefault("timeout", 10)
+ return super().request(*args, **kwargs)
+
+
+r_session = TimeoutSession()
+r_session.headers.update({"User-Agent": USER_AGENT})
+
+
+@atexit.register
+def close_session():
+ """Close the requests session on shut down."""
+ r_session.close()
+
+
# Utilities.
@@ -246,21 +262,7 @@ class Backend:
is unreachable.
"""
try:
- # Disable the InsecureRequestWarning that comes from using
- # `verify=false`.
- # https://github.com/kennethreitz/requests/issues/2214
- # We're not overly worried about the NSA MITMing our lyrics scraper
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- r = requests.get(
- url,
- verify=False,
- headers={
- "User-Agent": USER_AGENT,
- },
- timeout=10,
- **kwargs,
- )
+ r = r_session.get(url)
except requests.RequestException as exc:
self._log.debug("lyrics request failed: {0}", exc)
return
@@ -368,9 +370,7 @@ class LRCLib(Backend):
def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
- kwargs.setdefault("timeout", 10)
- kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
- r = requests.get(*args, **kwargs)
+ r = r_session.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()
@@ -535,10 +535,7 @@ class Genius(SearchBackend):
def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["genius_api_key"].as_str()
- self.headers = {
- "Authorization": "Bearer %s" % self.api_key,
- "User-Agent": USER_AGENT,
- }
+ self.headers = {"Authorization": f"Bearer {self.api_key}"}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -573,18 +570,13 @@ class Genius(SearchBackend):
search_url = self.base_url + "/search"
data = {"q": title + " " + artist.lower()}
try:
- response = requests.get(
- search_url,
- params=data,
- headers=self.headers,
- timeout=10,
- )
+ r = r_session.get(search_url, params=data, headers=self.headers)
except requests.RequestException as exc:
self._log.debug("Genius API request failed: {0}", exc)
return None
try:
- return response.json()
+ return r.json()
except ValueError:
return None
@@ -979,13 +971,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = json.loads(
- requests.post(
- oauth_url,
- data=urlencode(params),
- timeout=10,
- ).content
- )
+ oauth_token = r_session.post(oauth_url, params=params).json()
if "access_token" in oauth_token:
return "Bearer " + oauth_token["access_token"]
else:
@@ -1202,10 +1188,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
"https://api.microsofttranslator.com/v2/Http.svc/"
"Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
)
- r = requests.get(
- url,
- headers={"Authorization ": self.bing_auth_token},
- timeout=10,
+ r = r_session.get(
+ url, headers={"Authorization": self.bing_auth_token}
)
if r.status_code != 200:
self._log.debug(
diff --git a/setup.cfg b/setup.cfg
index 15ca23f65..8e3d7e3b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
precision = 2
skip_empty = true
show_missing = true
-exclude_lines =
- pragma: no cover
+exclude_also =
+ @atexit.register
if TYPE_CHECKING
if typing.TYPE_CHECKING
raise AssertionError
From 283c513c72db7e960e51363607efb8eb0766c823 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:14:15 +0100
Subject: [PATCH 04/25] Centralize request error handling
---
beetsplug/lyrics.py | 228 +++++++++++++++---------------------
test/plugins/test_lyrics.py | 62 ++++++----
2 files changed, 136 insertions(+), 154 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8e9452a2c..5a35519b5 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -19,13 +19,12 @@ from __future__ import annotations
import atexit
import errno
import itertools
-import json
import math
import os.path
import re
import struct
import unicodedata
-from contextlib import suppress
+from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
@@ -108,8 +107,14 @@ class NotFoundError(requests.exceptions.HTTPError):
class TimeoutSession(requests.Session):
def request(self, *args, **kwargs):
+ """Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
- return super().request(*args, **kwargs)
+ r = super().request(*args, **kwargs)
+ if r.status_code == HTTPStatus.NOT_FOUND:
+ raise NotFoundError("HTTP Error: Not Found", response=r)
+ r.raise_for_status()
+
+ return r
r_session = TimeoutSession()
@@ -250,28 +255,36 @@ else:
return None
-class Backend:
+class RequestHandler:
+ _log: beets.logging.Logger
+
+ def fetch_text(self, url: str, **kwargs) -> str:
+ """Return text / HTML data from the given URL."""
+ self._log.debug("Fetching HTML from {}", url)
+ return r_session.get(url, **kwargs).text
+
+ def fetch_json(self, url: str, **kwargs):
+ """Return JSON data from the given URL."""
+ self._log.debug("Fetching JSON from {}", url)
+ return r_session.get(url, **kwargs).json()
+
+ @contextmanager
+ def handle_request(self) -> Iterator[None]:
+ try:
+ yield
+ except requests.JSONDecodeError:
+ self._log.warning("Could not decode response JSON data")
+ except requests.RequestException as exc:
+ self._log.warning("Request error: {}", exc)
+
+
+class Backend(RequestHandler):
REQUIRES_BS = False
def __init__(self, config, log):
self._log = log
self.config = config
- def fetch_url(self, url, **kwargs):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- r = r_session.get(url)
- except requests.RequestException as exc:
- self._log.debug("lyrics request failed: {0}", exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
- return None
-
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
@@ -368,15 +381,6 @@ class LRCLib(Backend):
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
- def fetch_json(self, *args, **kwargs):
- """Wrap the request method to raise an exception on HTTP errors."""
- r = r_session.get(*args, **kwargs)
- if r.status_code == HTTPStatus.NOT_FOUND:
- raise NotFoundError("HTTP Error: Not Found", response=r)
- r.raise_for_status()
-
- return r.json()
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -417,13 +421,7 @@ class LRCLib(Backend):
if item := self.pick_best_match(candidates):
return item.get_text(self.config["synced"])
except StopIteration:
- pass
- except requests.JSONDecodeError:
- self.warn("Could not decode response JSON data")
- except requests.RequestException as exc:
- self.warn("Request error: {}", exc)
-
- return None
+ return None
class DirectBackend(Backend):
@@ -463,9 +461,7 @@ class MusiXmatch(DirectBackend):
def fetch(self, artist: str, title: str, *_) -> str | None:
url = self.build_url(artist, title)
- html = self.fetch_url(url)
- if not html:
- return None
+ html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
self._log.warning(
"we are blocked at MusixMatch: url %s failed" % url
@@ -531,6 +527,7 @@ class Genius(SearchBackend):
"""
base_url = "https://api.genius.com"
+ search_url = f"{base_url}/search"
def __init__(self, config, log):
super().__init__(config, log)
@@ -553,10 +550,9 @@ class Genius(SearchBackend):
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- html = self.fetch_url(result["url"])
- if not html:
- return None
- return self._scrape_lyrics_from_html(html)
+ return self._scrape_lyrics_from_html(
+ self.fetch_text(result["url"])
+ )
return None
@@ -567,29 +563,20 @@ class Genius(SearchBackend):
:returns: json response
"""
- search_url = self.base_url + "/search"
- data = {"q": title + " " + artist.lower()}
- try:
- r = r_session.get(search_url, params=data, headers=self.headers)
- except requests.RequestException as exc:
- self._log.debug("Genius API request failed: {0}", exc)
- return None
-
- try:
- return r.json()
- except ValueError:
- return None
+ return self.fetch_json(
+ self.search_url,
+ params={"q": f"{title} {artist.lower()}"},
+ headers=self.headers,
+ )
def replace_br(self, lyrics_div):
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
- def _scrape_lyrics_from_html(self, html):
+ def _scrape_lyrics_from_html(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = try_parse_html(html)
- if not soup:
- return
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in soup("script")]
@@ -660,10 +647,12 @@ class Tekstowo(DirectBackend):
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist: str, title: str, *_) -> str | None:
- if html := self.fetch_url(self.build_url(artist, title)):
- return self.extract_lyrics(html)
-
- return None
+ # We are expecting to receive a 404 since we are guessing the URL.
+ # Thus suppress the error so that it does not end up in the logs.
+ with suppress(NotFoundError):
+ return self.extract_lyrics(
+ self.fetch_text(self.build_url(artist, title))
+ )
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
@@ -717,7 +706,7 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html):
+def scrape_lyrics_from_html(html: str) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
@@ -737,8 +726,6 @@ def scrape_lyrics_from_html(html):
# extract all long text blocks that are not code
soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
- if not soup:
- return None
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
@@ -831,39 +818,26 @@ class Google(SearchBackend):
"q": f"{artist} {title}",
}
- data = self.fetch_url(self.SEARCH_URL, params=params)
- if not data:
- self._log.debug("google backend returned no data")
- return None
- try:
- data = json.loads(data)
- except ValueError as exc:
- self._log.debug("google backend returned malformed JSON: {}", exc)
- if "error" in data:
- reason = data["error"]["errors"][0]["reason"]
- self._log.debug("google backend error: {0}", reason)
- return None
-
check_candidate = partial(self.is_page_candidate, artist, title)
- for item in data.get("items", []):
+ for item in self.fetch_json(self.SEARCH_URL, params=params).get(
+ "items", []
+ ):
url_link = item["link"]
if not check_candidate(url_link, item.get("title", "")):
continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ with self.handle_request():
+ lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
-class LyricsPlugin(plugins.BeetsPlugin):
+class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
@@ -934,7 +908,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
self.config["bing_lang_from"] = [
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
]
- self.bing_auth_token = None
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
self._log.warning(
@@ -962,7 +935,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
return enabled_sources
- def get_bing_access_token(self):
+ @cached_property
+ def bing_access_token(self) -> str | None:
params = {
"client_id": "beets",
"client_secret": self.config["bing_client_secret"],
@@ -971,14 +945,11 @@ class LyricsPlugin(plugins.BeetsPlugin):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = r_session.post(oauth_url, params=params).json()
- if "access_token" in oauth_token:
- return "Bearer " + oauth_token["access_token"]
- else:
- self._log.warning(
- "Could not get Bing Translate API access token."
- ' Check your "bing_client_secret" password'
- )
+ with self.handle_request():
+ r = r_session.post(oauth_url, params=params)
+ return r.json()["access_token"]
+
+ return None
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
@@ -1167,44 +1138,39 @@ class LyricsPlugin(plugins.BeetsPlugin):
None if no lyrics were found.
"""
for backend in self.backends:
- lyrics = backend.fetch(artist, title, *args)
- if lyrics:
- self._log.debug(
- "got lyrics from backend: {0}", backend.__class__.__name__
- )
- return _scrape_strip_cruft(lyrics, True)
+ with backend.handle_request():
+ if lyrics := backend.fetch(artist, title, *args):
+ self._log.debug(
+ "got lyrics from backend: {0}",
+ backend.__class__.__name__,
+ )
+ return _scrape_strip_cruft(lyrics, True)
return None
def append_translation(self, text, to_lang):
from xml.etree import ElementTree
- if not self.bing_auth_token:
- self.bing_auth_token = self.get_bing_access_token()
- if self.bing_auth_token:
- # Extract unique lines to limit API request size per song
- text_lines = set(text.split("\n"))
- url = (
- "https://api.microsofttranslator.com/v2/Http.svc/"
- "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
+ if not (token := self.bing_access_token):
+ self._log.warning(
+ "Could not get Bing Translate API access token. "
+ "Check your 'bing_client_secret' password."
)
- r = r_session.get(
- url, headers={"Authorization": self.bing_auth_token}
+ return text
+
+ # Extract unique lines to limit API request size per song
+ lines = text.split("\n")
+ unique_lines = set(lines)
+ url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+ with self.handle_request():
+ text = self.fetch_text(
+ url,
+ headers={"Authorization": f"Bearer {token}"},
+ params={"text": "|".join(unique_lines), "to": to_lang},
)
- if r.status_code != 200:
- self._log.debug(
- "translation API error {}: {}", r.status_code, r.text
- )
- if "token has expired" in r.text:
- self.bing_auth_token = None
- return self.append_translation(text, to_lang)
- return text
- lines_translated = ElementTree.fromstring(
- r.text.encode("utf-8")
- ).text
- # Use a translation mapping dict to build resulting lyrics
- translations = dict(zip(text_lines, lines_translated.split("|")))
- result = ""
- for line in text.split("\n"):
- result += "{} / {}\n".format(line, translations[line])
- return result
+ if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+ # Use a translation mapping dict to build resulting lyrics
+ translations = dict(zip(unique_lines, translated.split("|")))
+ return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+ return text
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 812399698..b77054b52 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -202,7 +202,7 @@ def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
-class LyricsBackendTest(PluginMixin):
+class LyricsPluginMixin(PluginMixin):
plugin = "lyrics"
@pytest.fixture
@@ -218,6 +218,42 @@ class LyricsBackendTest(PluginMixin):
return lyrics.LyricsPlugin()
+
+class TestLyricsPlugin(LyricsPluginMixin):
+ @pytest.fixture
+ def backend_name(self):
+ """Return lyrics configuration to test."""
+ return "lrclib"
+
+ @pytest.mark.parametrize(
+ "request_kwargs, expected_log_match",
+ [
+ (
+ {"status_code": HTTPStatus.BAD_GATEWAY},
+ r"lyrics: Request error: 502",
+ ),
+ ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ],
+ )
+ def test_error_handling(
+ self,
+ requests_mock,
+ lyrics_plugin,
+ caplog,
+ request_kwargs,
+ expected_log_match,
+ ):
+ """Errors are logged with the plugin name."""
+ requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
+
+ assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
+ assert caplog.messages
+ last_log = caplog.messages[-1]
+ assert last_log
+ assert re.search(expected_log_match, last_log, re.I)
+
+
+class LyricsBackendTest(LyricsPluginMixin):
@pytest.fixture
def backend(self, lyrics_plugin):
"""Return a lyrics backend instance."""
@@ -399,13 +435,9 @@ class TestLRCLibLyrics(LyricsBackendTest):
return "lrclib"
@pytest.fixture
- def request_kwargs(self, response_data):
- return {"json": response_data}
-
- @pytest.fixture
- def fetch_lyrics(self, backend, requests_mock, request_kwargs):
+ def fetch_lyrics(self, backend, requests_mock, response_data):
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
- requests_mock.get(backend.SEARCH_URL, **request_kwargs)
+ requests_mock.get(backend.SEARCH_URL, json=response_data)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@@ -478,19 +510,3 @@ class TestLRCLibLyrics(LyricsBackendTest):
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
-
- @pytest.mark.parametrize(
- "request_kwargs, expected_log_match",
- [
- (
- {"status_code": HTTPStatus.BAD_GATEWAY},
- r"LRCLib: Request error: 502",
- ),
- ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
- ],
- )
- def test_error(self, caplog, fetch_lyrics, expected_log_match):
- assert fetch_lyrics() is None
- assert caplog.messages
- assert (last_log := caplog.messages[-1])
- assert re.search(expected_log_match, last_log, re.I)
From cb29605bfd84c20725a7036258dba0473266ffd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 07:35:24 +0100
Subject: [PATCH 05/25] Include class name in the log messages
---
beetsplug/lyrics.py | 113 +++++++++++++++++++-----------------
test/plugins/test_lyrics.py | 6 +-
2 files changed, 64 insertions(+), 55 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5a35519b5..db819c513 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,8 +28,8 @@ from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
-from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlparse
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -58,6 +58,8 @@ try:
except ImportError:
HAS_LANGDETECT = False
+JSONDict = dict[str, Any]
+
DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"", re.S)
TAG_RE = re.compile(r"<[^>]*>")
@@ -258,14 +260,37 @@ else:
class RequestHandler:
_log: beets.logging.Logger
- def fetch_text(self, url: str, **kwargs) -> str:
+ def debug(self, message: str, *args) -> None:
+ """Log a debug message with the class name."""
+ self._log.debug(f"{self.__class__.__name__}: {message}", *args)
+
+ def info(self, message: str, *args) -> None:
+ """Log an info message with the class name."""
+ self._log.info(f"{self.__class__.__name__}: {message}", *args)
+
+ def warn(self, message: str, *args) -> None:
+ """Log warning with the class name."""
+ self._log.warning(f"{self.__class__.__name__}: {message}", *args)
+
+ @staticmethod
+ def format_url(url: str, params: JSONDict | None) -> str:
+ if not params:
+ return url
+
+ return f"{url}?{urlencode(params)}"
+
+ def fetch_text(
+ self, url: str, params: JSONDict | None = None, **kwargs
+ ) -> str:
"""Return text / HTML data from the given URL."""
- self._log.debug("Fetching HTML from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching HTML from {}", url)
return r_session.get(url, **kwargs).text
- def fetch_json(self, url: str, **kwargs):
+ def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
- self._log.debug("Fetching JSON from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching JSON from {}", url)
return r_session.get(url, **kwargs).json()
@contextmanager
@@ -273,9 +298,9 @@ class RequestHandler:
try:
yield
except requests.JSONDecodeError:
- self._log.warning("Could not decode response JSON data")
+ self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
- self._log.warning("Request error: {}", exc)
+ self.warn("Request error: {}", exc)
class Backend(RequestHandler):
@@ -377,10 +402,6 @@ class LRCLib(Backend):
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
- def warn(self, message: str, *args) -> None:
- """Log a warning message with the class name."""
- self._log.warning(f"{self.__class__.__name__}: {message}", *args)
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -415,13 +436,12 @@ class LRCLib(Backend):
"""Fetch lyrics text for the given song data."""
evaluate_item = partial(LRCLyrics.make, target_duration=length)
- try:
- for group in self.fetch_candidates(artist, title, album, length):
- candidates = [evaluate_item(item) for item in group]
- if item := self.pick_best_match(candidates):
- return item.get_text(self.config["synced"])
- except StopIteration:
- return None
+ for group in self.fetch_candidates(artist, title, album, length):
+ candidates = [evaluate_item(item) for item in group]
+ if item := self.pick_best_match(candidates):
+ return item.get_text(self.config["synced"])
+
+ return None
class DirectBackend(Backend):
@@ -463,9 +483,7 @@ class MusiXmatch(DirectBackend):
html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
- self._log.warning(
- "we are blocked at MusixMatch: url %s failed" % url
- )
+ self.warn("Failed: Blocked IP address")
return None
html_parts = html.split('
str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
@@ -747,7 +764,7 @@ class Google(SearchBackend):
bad_triggers_occ = []
nb_lines = text.count("\n")
if nb_lines <= 1:
- self._log.debug("Ignoring too short lyrics '{0}'", text)
+ self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
@@ -766,7 +783,7 @@ class Google(SearchBackend):
)
if bad_triggers_occ:
- self._log.debug("Bad triggers detected: {0}", bad_triggers_occ)
+ self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
@@ -779,7 +796,7 @@ class Google(SearchBackend):
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
- self._log.exception("Failing to normalize '{0}'", text)
+ self.debug("Failed to normalize '{}'", text)
return text
BY_TRANS = ["by", "par", "de", "von"]
@@ -831,7 +848,7 @@ class Google(SearchBackend):
continue
if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
+ self.debug("Got lyrics from {}", item["displayLink"])
return lyrics
return None
@@ -900,9 +917,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
# configuration includes `google`. This way, the source
# is silent by default but can be enabled just by
# setting an API key.
- self._log.debug(
- "Disabling google source: " "no API key configured."
- )
+ self.debug("Disabling google source: " "no API key configured.")
sources.remove("google")
self.config["bing_lang_from"] = [
@@ -910,15 +925,14 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
]
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
- self._log.warning(
- "To use bing translations, you need to "
- "install the langdetect module. See the "
- "documentation for further details."
+ self.warn(
+ "To use bing translations, you need to install the langdetect "
+ "module. See the documentation for further details."
)
self.backends = [
- self.SOURCE_BACKENDS[source](self.config, self._log)
- for source in sources
+ self.SOURCE_BACKENDS[s](self.config, self._log.getChild(s))
+ for s in sources
]
def sanitize_bs_sources(self, sources):
@@ -949,8 +963,6 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
r = r_session.post(oauth_url, params=params)
return r.json()["access_token"]
- return None
-
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
cmd.parser.add_option(
@@ -1095,7 +1107,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self._log.info("lyrics already present: {0}", item)
+ self.info("Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1111,7 +1123,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self._log.info("fetched lyrics: {0}", item)
+ self.info("Lyrics found: {}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1122,7 +1134,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics, self.config["bing_lang_to"]
)
else:
- self._log.info("lyrics not found: {0}", item)
+ self.info("Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
@@ -1137,13 +1149,10 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
"""
+ self.info("Fetching lyrics for {} - {}", artist, title)
for backend in self.backends:
with backend.handle_request():
if lyrics := backend.fetch(artist, title, *args):
- self._log.debug(
- "got lyrics from backend: {0}",
- backend.__class__.__name__,
- )
return _scrape_strip_cruft(lyrics, True)
return None
@@ -1152,7 +1161,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
from xml.etree import ElementTree
if not (token := self.bing_access_token):
- self._log.warning(
+ self.warn(
"Could not get Bing Translate API access token. "
"Check your 'bing_client_secret' password."
)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index b77054b52..817eb4af9 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -230,9 +230,9 @@ class TestLyricsPlugin(LyricsPluginMixin):
[
(
{"status_code": HTTPStatus.BAD_GATEWAY},
- r"lyrics: Request error: 502",
+ r"LRCLib: Request error: 502",
),
- ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
],
)
def test_error_handling(
@@ -243,7 +243,7 @@ class TestLyricsPlugin(LyricsPluginMixin):
request_kwargs,
expected_log_match,
):
- """Errors are logged with the plugin name."""
+ """Errors are logged with the backend name."""
requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
From 7c2fb31136d3ecba9e3974dca7bf7729bd0db500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 11 Sep 2024 07:51:43 +0100
Subject: [PATCH 06/25] Leave a single chef in the kitchen
---
beetsplug/lyrics.py | 79 ++++++++++---------------------------
test/plugins/test_lyrics.py | 6 +--
2 files changed, 23 insertions(+), 62 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index db819c513..3d0e09673 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -44,8 +44,7 @@ if TYPE_CHECKING:
from beets.library import Item
try:
- import bs4
- from bs4 import SoupStrainer
+ from bs4 import BeautifulSoup
HAS_BEAUTIFUL_SOUP = True
except ImportError:
@@ -246,17 +245,6 @@ def slug(text):
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
-if HAS_BEAUTIFUL_SOUP:
-
- def try_parse_html(html, **kwargs):
- return bs4.BeautifulSoup(html, "html.parser", **kwargs)
-
-else:
-
- def try_parse_html(html, **kwargs):
- return None
-
-
class RequestHandler:
_log: beets.logging.Logger
@@ -565,9 +553,7 @@ class Genius(SearchBackend):
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- return self._scrape_lyrics_from_html(
- self.fetch_text(result["url"])
- )
+ return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
return None
@@ -584,17 +570,9 @@ class Genius(SearchBackend):
headers=self.headers,
)
- def replace_br(self, lyrics_div):
- for br in lyrics_div.find_all("br"):
- br.replace_with("\n")
-
- def _scrape_lyrics_from_html(self, html: str) -> str | None:
+ def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
-
- soup = try_parse_html(html)
-
- # Remove script tags that they put in the middle of the lyrics.
- [h.extract() for h in soup("script")]
+ soup = get_soup(html)
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
@@ -609,7 +587,6 @@ class Genius(SearchBackend):
)
lyrics = ""
for lyrics_div in lyrics_divs:
- self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
@@ -633,7 +610,6 @@ class Genius(SearchBackend):
return None
lyrics_div = verse_div.parent
- self.replace_br(lyrics_div)
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
@@ -665,17 +641,14 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.extract_lyrics(
+ return self.scrape_lyrics(
self.fetch_text(self.build_url(artist, title))
)
return None
- def extract_lyrics(self, html: str) -> str | None:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- soup = try_parse_html(html)
+ def scrape_lyrics(self, html: str) -> str | None:
+ soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -723,33 +696,11 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html: str) -> str | None:
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
- """
-
- def is_text_notcode(text):
- if not text:
- return False
- length = len(text)
- return (
- length > 20
- and text.count(" ") > length / 25
- and (text.find("{") == -1 or text.find(";") == -1)
- )
-
+def get_soup(html: str) -> BeautifulSoup:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
- # extract all long text blocks that are not code
- soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- else:
- return None
+ return BeautifulSoup(html, "html.parser")
class Google(SearchBackend):
@@ -757,6 +708,16 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+ @staticmethod
+ def scrape_lyrics(html: str) -> str | None:
+ soup = get_soup(html)
+
+ # Get the longest text element (if any).
+ strings = sorted(soup.stripped_strings, key=len, reverse=True)
+ if strings:
+ return strings[0]
+ return None
+
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -843,7 +804,7 @@ class Google(SearchBackend):
if not check_candidate(url_link, item.get("title", "")):
continue
with self.handle_request():
- lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ lyrics = self.scrape_lyrics(self.fetch_text(url_link))
if not lyrics:
continue
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 817eb4af9..d412d318b 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -328,7 +328,7 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
+ result = backend.scrape_lyrics(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
@@ -390,7 +390,7 @@ class TestGeniusLyrics(LyricsBackendTest):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend._scrape_lyrics_from_html(lyrics_html) or ""
+ result = backend.scrape_lyrics(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -411,7 +411,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From dd9f178fffd1cd130bde56fdbaae853aab4ca0c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 19 Oct 2024 03:30:41 +0100
Subject: [PATCH 07/25] Do not try to strip cruft from the parsed lyrics text.
Having removed it I found that only the Genius lyrics changed: it had an
extra new line. Thus I defined a function 'collapse_newlines' which now
gets called for the Genius lyrics.
---
beetsplug/lyrics.py | 24 +++++++++++-------------
test/plugins/test_lyrics.py | 5 ++---
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 3d0e09673..2ec362356 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -59,9 +59,6 @@ except ImportError:
JSONDict = dict[str, Any]
-DIV_RE = re.compile(r"<(/?)div>?", re.I)
-COMMENT_RE = re.compile(r"", re.S)
-TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -552,8 +549,11 @@ class Genius(SearchBackend):
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
result = hit["result"]
- if check(result["primary_artist"]["name"], result["title"]):
- return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
+ url = hit["result"]["url"]
+ if check(result["primary_artist"]["name"], result["title"]) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
@@ -670,7 +670,10 @@ def remove_credits(text):
return text
-def _scrape_strip_cruft(html, plain_text_out=False):
+collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
+
+
+def _scrape_strip_cruft(html: str) -> str:
"""Clean up HTML"""
html = unescape(html)
@@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = re.sub("
two
three"
From f94d2767f9919dd933da0ba870355babd29b371e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH 08/25] Use a single slug implementation
Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify'
method which was a duplicate of 'slug'.
Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
beetsplug/lyrics.py | 33 ++++++++-------------------------
test/plugins/test_lyrics.py | 4 ----
2 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2ec362356..097110e13 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -23,7 +23,6 @@ import math
import os.path
import re
import struct
-import unicodedata
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
@@ -224,7 +223,7 @@ def search_pairs(item):
return itertools.product(artists, multi_titles)
-def slug(text):
+def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@@ -234,10 +233,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
-
- This somewhat duplicates the :func:`Google.slugify` function but
- slugify is not as generic as this one, which can be reused
- elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
@@ -745,19 +740,6 @@ class Google(SearchBackend):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
- def slugify(self, text):
- """Normalize a string and remove non-alphanumeric characters."""
- text = re.sub(r"[-'_\s]", "_", text)
- text = re.sub(r"_+", "_", text).strip("_")
- pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, r"\g<1>", text).strip()
- try:
- text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
- text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
- except UnicodeDecodeError:
- self.debug("Failed to normalize '{}'", text)
- return text
-
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
@@ -767,23 +749,24 @@ class Google(SearchBackend):
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title_slug = self.slugify(title.lower())
- url_title_slug = self.slugify(url_title.lower())
+ title_slug = slug(title)
+ url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
- artist = self.slugify(artist.lower())
+ artist = slug(artist)
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
- [by + "_" + artist for by in self.BY_TRANS]
+ [by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
- tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+ song_title = re.sub(
+ "(%s)" % "|".join(tokens), "", url_title_slug
+ ).strip("-")
return self.check_match(artist, title_slug, artist, song_title)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 9e24d46c2..1d264338a 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -370,10 +370,6 @@ the following form.
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
- def test_slugify(self, backend):
- text = "http://site.com/\xe7afe-au_lait(boisson)"
- assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
class TestGeniusLyrics(LyricsBackendTest):
@pytest.fixture(scope="class")
From 8bdc2c6cf0ee58dad2ae236a8455b1ae06a5d4a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Thu, 19 Sep 2024 20:00:44 +0100
Subject: [PATCH 09/25] lyrics: Add symbols for better visual feedback in the
logs
---
beetsplug/lyrics.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 097110e13..5a1301039 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1049,7 +1049,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self.info("Lyrics already present: {}", item)
+ self.info("🔵 Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1065,7 +1065,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self.info("Lyrics found: {}", item)
+ self.info("🟢 Found lyrics: {0}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1076,7 +1076,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics, self.config["bing_lang_to"]
)
else:
- self.info("Lyrics not found: {}", item)
+ self.info("🔴 Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
From 8a1ce274210fc90aef1865721cfb433e24bcbb77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 27 Sep 2024 22:57:20 +0100
Subject: [PATCH 10/25] lyrics: Do not write item unless lyrics have changed
---
beetsplug/lyrics.py | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5a1301039..5f733b7db 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1077,15 +1077,13 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
)
else:
self.info("🔴 Lyrics not found: {}", item)
- fallback = self.config["fallback"].get()
- if fallback:
- lyrics = fallback
- else:
- return
- item.lyrics = lyrics
- if write:
- item.try_write()
- item.store()
+ lyrics = self.config["fallback"].get()
+
+ if lyrics not in {None, item.lyrics}:
+ item.lyrics = lyrics
+ if write:
+ item.try_write()
+ item.store()
def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
From 55b78249483915993116e6131bd7c6c507dd4d70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 10:33:01 +0100
Subject: [PATCH 11/25] Replace custom unescape implementation by html.unescape
---
beetsplug/lyrics.py | 23 +----------------------
1 file changed, 1 insertion(+), 22 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5f733b7db..8bd7f15d3 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,10 +22,10 @@ import itertools
import math
import os.path
import re
-import struct
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
+from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
@@ -127,27 +127,6 @@ def close_session():
# Utilities.
-def unichar(i):
- try:
- return chr(i)
- except ValueError:
- return struct.pack("i", i).decode("utf-32")
-
-
-def unescape(text):
- """Resolve xx; HTML entities (and some others)."""
- if isinstance(text, bytes):
- text = text.decode("utf-8", "ignore")
- out = text.replace(" ", " ")
-
- def replchar(m):
- num = m.group(1)
- return unichar(int(num))
-
- out = re.sub("(\\d+);", replchar, out)
- return out
-
-
def extract_text_between(html, start_marker, end_marker):
try:
_, html = html.split(start_marker, 1)
From 54fc67b30a8ecaa33fe40790a6f277749c9ebc00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 18:24:22 +0100
Subject: [PATCH 12/25] Remove extract_text_between
---
beetsplug/lyrics.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8bd7f15d3..7adb1bc7e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -127,15 +127,6 @@ def close_session():
# Utilities.
-def extract_text_between(html, start_marker, end_marker):
- try:
- _, html = html.split(start_marker, 1)
- html, _ = html.split(end_marker, 1)
- except ValueError:
- return ""
- return html
-
-
def search_pairs(item):
"""Yield a pairs of artists and titles to search for.
@@ -448,7 +439,7 @@ class MusiXmatch(DirectBackend):
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
for html_part in html_parts:
- lyrics_parts.append(extract_text_between(html_part, ">", "
"))
+ lyrics_parts.append(re.sub(r"^[^>]+>|
.*", "", html_part))
lyrics = "\n".join(lyrics_parts)
lyrics = lyrics.strip(',"').replace("\\n", "\n")
# another odd case: sometimes only that string remains, for
From 745c5eb9f058603c0817003d57adc4c80dc7aecd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 9 Oct 2024 12:12:09 +0100
Subject: [PATCH 13/25] Genius: refactor and simplify
---
beetsplug/_typing.py | 86 ++++++++++++++++++++++++
beetsplug/lyrics.py | 127 ++++++++++--------------------------
docs/changelog.rst | 3 +
test/plugins/test_lyrics.py | 2 +-
4 files changed, 126 insertions(+), 92 deletions(-)
create mode 100644 beetsplug/_typing.py
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 000000000..93f4a2a58
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import TypedDict
+
+JSONDict = dict[str, Any]
+
+
+class LRCLibAPI:
+ class Item(TypedDict):
+ """Lyrics data item returned by the LRCLib API."""
+
+ id: int
+ name: str
+ trackName: str
+ artistName: str
+ albumName: str
+ duration: float | None
+ instrumental: bool
+ plainLyrics: str
+ syncedLyrics: str | None
+
+
+class GeniusAPI:
+ """Genius API data types.
+
+ This documents *only* the fields that are used in the plugin.
+ :attr:`SearchResult` is an exception, since I thought some of the other
+ fields might be useful in the future.
+ """
+
+ class DateComponents(TypedDict):
+ year: int
+ month: int
+ day: int
+
+ class Artist(TypedDict):
+ api_path: str
+ header_image_url: str
+ id: int
+ image_url: str
+ is_meme_verified: bool
+ is_verified: bool
+ name: str
+ url: str
+
+ class Stats(TypedDict):
+ unreviewed_annotations: int
+ hot: bool
+
+ class SearchResult(TypedDict):
+ annotation_count: int
+ api_path: str
+ artist_names: str
+ full_title: str
+ header_image_thumbnail_url: str
+ header_image_url: str
+ id: int
+ lyrics_owner_id: int
+ lyrics_state: str
+ path: str
+ primary_artist_names: str
+ pyongs_count: int | None
+ relationships_index_url: str
+ release_date_components: GeniusAPI.DateComponents
+ release_date_for_display: str
+ release_date_with_abbreviated_month_for_display: str
+ song_art_image_thumbnail_url: str
+ song_art_image_url: str
+ stats: GeniusAPI.Stats
+ title: str
+ title_with_featured: str
+ url: str
+ featured_artists: list[GeniusAPI.Artist]
+ primary_artist: GeniusAPI.Artist
+ primary_artists: list[GeniusAPI.Artist]
+
+ class SearchHit(TypedDict):
+ result: GeniusAPI.SearchResult
+
+ class SearchResponse(TypedDict):
+ hits: list[GeniusAPI.SearchHit]
+
+ class Search(TypedDict):
+ response: GeniusAPI.SearchResponse
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7adb1bc7e..12b2d1f37 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -31,7 +31,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
import requests
-from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@@ -42,6 +41,8 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
+ from ._typing import GeniusAPI, LRCLibAPI
+
try:
from bs4 import BeautifulSoup
@@ -266,20 +267,6 @@ class Backend(RequestHandler):
raise NotImplementedError
-class LRCLibItem(TypedDict):
- """Lyrics data item returned by the LRCLib API."""
-
- id: int
- name: str
- trackName: str
- artistName: str
- albumName: str
- duration: float | None
- instrumental: bool
- plainLyrics: str
- syncedLyrics: str | None
-
-
@dataclass
@total_ordering
class LRCLyrics:
@@ -297,7 +284,9 @@ class LRCLyrics:
return self.dist < other.dist
@classmethod
- def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+ def make(
+ cls, candidate: LRCLibAPI.Item, target_duration: float
+ ) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
@@ -354,7 +343,7 @@ class LRCLib(Backend):
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
- ) -> Iterator[list[LRCLibItem]]:
+ ) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
I found that the ``/get`` endpoint sometimes returns inaccurate or
@@ -494,13 +483,15 @@ class Genius(SearchBackend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
+ LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
+ return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -509,85 +500,39 @@ class Genius(SearchBackend):
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
- json = self._search(artist, title)
- check = partial(self.check_match, artist, title)
- for hit in json["response"]["hits"]:
- result = hit["result"]
- url = hit["result"]["url"]
- if check(result["primary_artist"]["name"], result["title"]) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
+ data = self.fetch_json(
+ self.search_url,
+ params={"q": f"{artist} {title}".lower()},
+ headers=self.headers,
+ )
+ if (url := self.find_lyrics_url(data, artist, title)) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
- def _search(self, artist, title):
- """Searches the genius api for a given artist and title
+ def find_lyrics_url(
+ self, data: GeniusAPI.Search, artist: str, title: str
+ ) -> str | None:
+ """Find URL to the lyrics of the given artist and title.
- https://docs.genius.com/#search-h2
-
- :returns: json response
+ https://docs.genius.com/#search-h2.
"""
- return self.fetch_json(
- self.search_url,
- params={"q": f"{title} {artist.lower()}"},
- headers=self.headers,
- )
+ check = partial(self.check_match, artist, title)
+ for result in (hit["result"] for hit in data["response"]["hits"]):
+ if check(result["artist_names"], result["title"]):
+ return result["url"]
+
+ return None
def scrape_lyrics(self, html: str) -> str | None:
- """Scrape lyrics from a given genius.com html"""
- soup = get_soup(html)
+ if m := self.LYRICS_IN_JSON_RE.search(html):
+ html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ return get_soup(html_text).get_text().strip()
- # Most of the time, the page contains a div with class="lyrics" where
- # all of the lyrics can be found already correctly formatted
- # Sometimes, though, it packages the lyrics into separate divs, most
- # likely for easier ad placement
-
- lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
- if not lyrics_divs:
- self.debug("Received unusual song page html")
- return self._try_extracting_lyrics_from_non_data_lyrics_container(
- soup
- )
- lyrics = ""
- for lyrics_div in lyrics_divs:
- lyrics += lyrics_div.get_text() + "\n\n"
- while lyrics[-1] == "\n":
- lyrics = lyrics[:-1]
- return lyrics
-
- def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
- """Extract lyrics from a div without attribute data-lyrics-container
- This is the second most common layout on genius.com
- """
- verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
- if not verse_div:
- if soup.find(
- "div",
- class_=re.compile("LyricsPlaceholder__Message"),
- string="This song is an instrumental",
- ):
- self.debug("Detected instrumental")
- return INSTRUMENTAL_LYRICS
- else:
- self.debug("Couldn't scrape page using known layouts")
- return None
-
- lyrics_div = verse_div.parent
-
- ads = lyrics_div.find_all(
- "div", class_=re.compile("InreadAd__Container")
- )
- for ad in ads:
- ad.replace_with("\n")
-
- footers = lyrics_div.find_all(
- "div", class_=re.compile("Lyrics__Footer")
- )
- for footer in footers:
- footer.replace_with("")
- return lyrics_div.get_text()
+ return None
class Tekstowo(DirectBackend):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 25ae5bc10..54d085599 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -67,6 +67,9 @@ Bug fixes:
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
to match lyrics when there is a slight variation in the artist name.
:bug:`4791`
+* :doc:`plugins/lyrics`: Fix plugin crash when ``genius`` backend returns empty
+ lyrics.
+ :bug:`5583`
For packagers:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 1d264338a..091776108 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -379,7 +379,7 @@ class TestGeniusLyrics(LyricsBackendTest):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
- ("geniuscom/2pacalleyezonmelyrics", 134),
+ ("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],
From 12c5eaae5e144dcd88e32a0af23701aa346e3d00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 17:04:58 +0100
Subject: [PATCH 14/25] Unite Genius, Tekstowo and Google backends under the
same interface
---
beetsplug/lyrics.py | 165 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 39 ++++-----
2 files changed, 101 insertions(+), 103 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 12b2d1f37..9424a14b6 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -27,7 +27,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse
import requests
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, LRCLibAPI
+ from ._typing import GeniusAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -57,8 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
-JSONDict = dict[str, Any]
-
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend):
return lyrics
+class SearchResult(NamedTuple):
+ artist: str
+ title: str
+ url: str
+
+
class SearchBackend(Backend):
REQUIRES_BS = True
@@ -450,12 +454,12 @@ class SearchBackend(Backend):
return self.config["dist_thresh"].get(float)
def check_match(
- self, target_artist: str, target_title: str, artist: str, title: str
+ self, target_artist: str, target_title: str, result: SearchResult
) -> bool:
- """Check if the given artist and title are 'good enough' match."""
+ """Check if the given search result is a 'good enough' match."""
max_dist = max(
- string_dist(target_artist, artist),
- string_dist(target_title, title),
+ string_dist(target_artist, result.artist),
+ string_dist(target_title, result.title),
)
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@@ -466,8 +470,8 @@ class SearchBackend(Backend):
# This may show a matching candidate with some noise in the name
self.debug(
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
- artist,
- title,
+ result.artist,
+ result.title,
target_artist,
target_title,
max_dist,
@@ -475,61 +479,59 @@ class SearchBackend(Backend):
return False
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ """Search for the given query and yield search results."""
+ raise NotImplementedError
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ check_match = partial(self.check_match, artist, title)
+ for candidate in self.search(artist, title):
+ if check_match(candidate):
+ yield candidate
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ """Fetch lyrics for the given artist and title."""
+ for result in self.get_results(artist, title):
+ if lyrics := self.scrape(self.fetch_text(result.url)):
+ return lyrics
+
+ return None
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ """Scrape the lyrics from the given HTML."""
+ raise NotImplementedError
+
class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
- Simply adapted from
- bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+ Because genius doesn't allow accessing lyrics via the api, we first query
+ the api for a url matching our artist & title, then scrape the HTML text
+ for the JSON data containing the lyrics.
"""
+ SEARCH_URL = "https://api.genius.com/search"
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
- def fetch(self, artist: str, title: str, *_) -> str | None:
- """Fetch lyrics from genius.com
-
- Because genius doesn't allow accessing lyrics via the api,
- we first query the api for a url matching our artist & title,
- then attempt to scrape that url for the lyrics.
- """
-
- data = self.fetch_json(
- self.search_url,
- params={"q": f"{artist} {title}".lower()},
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ search_data: GeniusAPI.Search = self.fetch_json(
+ self.SEARCH_URL,
+ params={"q": f"{artist} {title}"},
headers=self.headers,
)
- if (url := self.find_lyrics_url(data, artist, title)) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
+ for r in (hit["result"] for hit in search_data["response"]["hits"]):
+ yield SearchResult(r["artist_names"], r["title"], r["url"])
- return None
-
- def find_lyrics_url(
- self, data: GeniusAPI.Search, artist: str, title: str
- ) -> str | None:
- """Find URL to the lyrics of the given artist and title.
-
- https://docs.genius.com/#search-h2.
- """
- check = partial(self.check_match, artist, title)
- for result in (hit["result"] for hit in data["response"]["hits"]):
- if check(result["artist_names"], result["title"]):
- return result["url"]
-
- return None
-
- def scrape_lyrics(self, html: str) -> str | None:
- if m := self.LYRICS_IN_JSON_RE.search(html):
- html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ if m := cls.LYRICS_IN_JSON_RE.search(html):
+ html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
return None
@@ -551,13 +553,12 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.scrape_lyrics(
- self.fetch_text(self.build_url(artist, title))
- )
+ return self.scrape(self.fetch_text(self.build_url(artist, title)))
return None
- def scrape_lyrics(self, html: str) -> str | None:
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@@ -616,16 +617,6 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- @staticmethod
- def scrape_lyrics(html: str) -> str | None:
- soup = get_soup(html)
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- return None
-
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -658,17 +649,11 @@ class Google(SearchBackend):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(
- self, artist: str, title: str, url_link: str, url_title: str
- ) -> bool:
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
- """
- title_slug = slug(title)
+ def make_search_result(
+ self, artist: str, url_link: str, url_title: str
+ ) -> SearchResult:
+ """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title)
- if title_slug in url_title_slug:
- return True
-
artist = slug(artist)
sitename = urlparse(url_link).netloc
@@ -683,33 +668,45 @@ class Google(SearchBackend):
"(%s)" % "|".join(tokens), "", url_title_slug
).strip("-")
- return self.check_match(artist, title_slug, artist, song_title)
+ return SearchResult(artist, song_title, url_link)
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
}
- check_candidate = partial(self.is_page_candidate, artist, title)
- for item in self.fetch_json(self.SEARCH_URL, params=params).get(
- "items", []
- ):
- url_link = item["link"]
- if not check_candidate(url_link, item.get("title", "")):
- continue
+ data = self.fetch_json(self.SEARCH_URL, params=params)
+ for item in data.get("items", []):
+ yield self.make_search_result(artist, item["link"], item["title"])
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ return super().get_results(artist, slug(title))
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ for result in self.get_results(artist, title):
with self.handle_request():
- lyrics = self.scrape_lyrics(self.fetch_text(url_link))
+ lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
- self.debug("Got lyrics from {}", item["displayLink"])
+ self.debug(
+ "Got lyrics from {}", urlparse(result.url).netloc
+ )
return lyrics
return None
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ # Get the longest text element (if any).
+ if strings := sorted(get_soup(html).stripped_strings, key=len):
+ return strings[-1]
+
+ return None
+
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 091776108..8afd80585 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -191,9 +191,9 @@ class TestSearchBackend:
],
)
def test_check_match(self, backend, target_artist, artist, should_match):
- assert (
- backend.check_match(target_artist, "", artist, "") == should_match
- )
+ result = lyrics.SearchResult(artist, "", "")
+
+ assert backend.check_match(target_artist, "", result) == should_match
@pytest.fixture(scope="module")
@@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = backend.scrape_lyrics(lyrics_html).lower()
+ result = backend.scrape(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize(
- "url_title, artist, should_be_candidate",
+ "url_title, artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", True),
- ("example.com | Beats song by John doe", "John Doe", True),
- ("example.com | seets bong lyrics by John doe", "John Doe", False),
- ("foo", "Sun O)))", False),
+ ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
+ ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ (
+ "example.com | seets bong lyrics by John doe",
+ "John Doe",
+ "seets-bong",
+ ),
+ ("foo", "Sun O)))", "foo"),
],
)
- def test_is_page_candidate(
- self, backend, lyrics_html, url_title, artist, should_be_candidate
+ def test_make_search_result(
+ self, backend, url_title, artist, expected_title
):
- result = backend.is_page_candidate(
- artist,
- self.TITLE,
- "http://www.example.com/lyrics/beetssong",
- url_title,
+ result = backend.make_search_result(
+ artist, "https://example.com", url_title
)
- assert bool(result) == should_be_candidate
+ assert result.title == expected_title
@pytest.mark.parametrize(
"lyrics",
@@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend.scrape_lyrics(lyrics_html) or ""
+ result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From c5c4138d6638e854a811eee6f98ab6afdba0b82f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:36:41 +0100
Subject: [PATCH 15/25] Google: Refactor and improve
* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
- Handle long URL titles that get truncated (end with ellipsis) for
long searches
- Remove domains starting with 'www'
- Parse the title AND the artist. Previously this would only parse the
title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
---
beetsplug/_typing.py | 31 ++++++-
beetsplug/lyrics.py | 156 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 84 ++++++++-----------
3 files changed, 141 insertions(+), 130 deletions(-)
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
index 93f4a2a58..915ea77e8 100644
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Any
-from typing_extensions import TypedDict
+from typing_extensions import NotRequired, TypedDict
JSONDict = dict[str, Any]
@@ -84,3 +84,32 @@ class GeniusAPI:
class Search(TypedDict):
response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+ class Response(TypedDict):
+ """Search response from the Google Custom Search API.
+
+ If the search returns no results, the :attr:`items` field is not found.
+ """
+
+ items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+ class Item(TypedDict):
+ """A Google Custom Search API result item.
+
+ :attr:`title` field is shown to the user in the search interface, thus
+ it gets truncated with an ellipsis for longer queries. For most
+ results, the full title is available as ``og:title`` metatag found
+ under the :attr:`pagemap` field. Note that neither this metatag nor
+ the ``pagemap`` field is guaranteed to be present in the data.
+ """
+
+ title: str
+ link: str
+ pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+ class Pagemap(TypedDict):
+ """Pagemap data with a single meta tags dict in a list."""
+
+ metatags: list[JSONDict]
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 9424a14b6..0982120f2 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlencode
import requests
from unidecode import unidecode
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, JSONDict, LRCLibAPI
+ from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -492,7 +492,9 @@ class SearchBackend(Backend):
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title):
- if lyrics := self.scrape(self.fetch_text(result.url)):
+ if (html := self.fetch_text(result.url)) and (
+ lyrics := self.scrape(html)
+ ):
return lyrics
return None
@@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
return None
-def remove_credits(text):
- """Remove first/last line of text if it contains the word 'lyrics'
- eg 'Lyrics by songsdatabase.com'
- """
- textlines = text.split("\n")
- credits = None
- for i in (0, -1):
- if textlines and "lyrics" in textlines[i].lower():
- credits = textlines.pop(i)
- if credits:
- text = "\n".join(textlines)
- return text
-
-
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@@ -617,87 +605,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- def is_lyrics(self, text, artist=None):
- """Determine whether the text seems to be valid lyrics."""
- if not text:
- return False
- bad_triggers_occ = []
- nb_lines = text.count("\n")
- if nb_lines <= 1:
- self.debug("Ignoring too short lyrics '{}'", text)
- return False
- elif nb_lines < 5:
- bad_triggers_occ.append("too_short")
- else:
- # Lyrics look legit, remove credits to avoid being penalized
- # further down
- text = remove_credits(text)
+ #: Exclude some letras.mus.br pages which do not contain lyrics.
+ EXCLUDE_PAGES = [
+ "significado.html",
+ "traduccion.html",
+ "traducao.html",
+ "significados.html",
+ ]
- bad_triggers = ["lyrics", "copyright", "property", "links"]
- if artist:
- bad_triggers += [artist]
+ #: Regular expression to match noise in the URL title.
+ URL_TITLE_NOISE_RE = re.compile(
+ r"""
+\b
+(
+ paroles(\ et\ traduction|\ de\ chanson)?
+ | letras?(\ de)?
+ | liedtexte
+ | original\ song\ full\ text\.
+ | official
+ | 20[12]\d\ version
+ | (absolute\ |az)?lyrics(\ complete)?
+ | www\S+
+ | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+ re.IGNORECASE | re.VERBOSE,
+ )
+ #: Split cleaned up URL title into artist and title parts.
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
- for item in bad_triggers:
- bad_triggers_occ += [item] * len(
- re.findall(r"\W%s\W" % item, text, re.I)
- )
+ def fetch_text(self, *args, **kwargs) -> str:
+ """Handle an error so that we can continue with the next URL."""
+ with self.handle_request():
+ return super().fetch_text(*args, **kwargs)
- if bad_triggers_occ:
- self.debug("Bad triggers detected: {}", bad_triggers_occ)
- return len(bad_triggers_occ) < 2
+ @staticmethod
+ def get_part_dist(artist: str, title: str, part: str) -> float:
+ """Return the distance between the given part and the artist and title.
- BY_TRANS = ["by", "par", "de", "von"]
- LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+ A number between -1 and 1 is returned, where -1 means the part is
+ closer to the artist and 1 means it is closer to the title.
+ """
+ return string_dist(artist, part) - string_dist(title, part)
+ @classmethod
def make_search_result(
- self, artist: str, url_link: str, url_title: str
+ cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult:
"""Parse artist and title from the URL title and return a search result."""
- url_title_slug = slug(url_title)
- artist = slug(artist)
- sitename = urlparse(url_link).netloc
-
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = (
- [by + "-" + artist for by in self.BY_TRANS]
- + [artist, sitename, sitename.replace("www.", "")]
- + self.LYRICS_TRANS
+ url_title = (
+ # get full title from metatags if available
+ item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+ # default to the display title
+ or item["title"]
)
- song_title = re.sub(
- "(%s)" % "|".join(tokens), "", url_title_slug
- ).strip("-")
+ clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+ # split it into parts which may be part of the artist or the title
+ # `dict.fromkeys` removes duplicates keeping the order
+ parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
- return SearchResult(artist, song_title, url_link)
+ if len(parts) == 1:
+ part = parts[0]
+ if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+ # artist and title may not have a separator
+ result_title = m[1]
+ result_artist = part.replace(m[0], "")
+ else:
+ # assume that this is the title
+ result_artist, result_title = "", parts[0]
+ else:
+ # sort parts by their similarity to the artist
+ parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+ result_artist, result_title = parts[0], " ".join(parts[1:])
+
+ return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
+ "siteSearch": "www.musixmatch.com",
+ "siteSearchFilter": "e",
+ "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
}
- data = self.fetch_json(self.SEARCH_URL, params=params)
+ data: GoogleCustomSearchAPI.Response = self.fetch_json(
+ self.SEARCH_URL, params=params
+ )
for item in data.get("items", []):
- yield self.make_search_result(artist, item["link"], item["title"])
-
- def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
- return super().get_results(artist, slug(title))
-
- def fetch(self, artist: str, title: str, *_) -> str | None:
- for result in self.get_results(artist, title):
- with self.handle_request():
- lyrics = self.scrape(self.fetch_text(result.url))
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self.debug(
- "Got lyrics from {}", urlparse(result.url).netloc
- )
- return lyrics
-
- return None
+ yield self.make_search_result(artist, title, item)
@classmethod
def scrape(cls, html: str) -> str | None:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 8afd80585..6986e4f06 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,24 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_lyrics, expected",
- [
- ("Verse\nLyrics credit in the last line", "Verse"),
- ("Lyrics credit in the first line\nVerse", "Verse"),
- (
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- ),
- ],
- )
- def test_remove_credits(self, initial_lyrics, expected):
- assert lyrics.remove_credits(initial_lyrics) == expected
-
@pytest.mark.parametrize(
"initial_text, expected",
[
@@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page."""
- TITLE = "Beets song"
-
@pytest.fixture(scope="class")
def backend_name(self):
return "google"
@@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
def file_name(self):
return "examplecom/beetssong"
+ @pytest.fixture
+ def search_item(self, url_title, url):
+ return {"title": url_title, "link": url}
+
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower()
assert result
- assert backend.is_lyrics(result)
- assert PHRASE_BY_TITLE[self.TITLE] in result
+ assert PHRASE_BY_TITLE["Beets song"] in result
@pytest.mark.parametrize(
- "url_title, artist, expected_title",
+ "url_title, expected_artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
- ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ ("Artist - beets song Lyrics", "Artist", "beets song"),
+ ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+ ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+ ("foo", "", "foo"),
+ ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets ...", "Artist", "Beets"),
+ ("Artist Beets Song", "Artist", "Beets Song"),
+ ("BeetsSong - Artist", "Artist", "BeetsSong"),
+ ("Artist - BeetsSong", "Artist", "BeetsSong"),
+ ("Beets Song", "", "Beets Song"),
+ ("Beets Song Artist", "Artist", "Beets Song"),
(
- "example.com | seets bong lyrics by John doe",
- "John Doe",
- "seets-bong",
+ "BeetsSong (feat. Other & Another) - Artist",
+ "Artist",
+ "BeetsSong (feat. Other & Another)",
+ ),
+ (
+ (
+ "Beets song lyrics by Artist - original song full text. "
+ "Official Beets song lyrics, 2024 version | LyricsMode.com"
+ ),
+ "Artist",
+ "Beets song",
),
- ("foo", "Sun O)))", "foo"),
],
)
+ @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result(
- self, backend, url_title, artist, expected_title
+ self, backend, search_item, expected_artist, expected_title
):
- result = backend.make_search_result(
- artist, "https://example.com", url_title
- )
- assert result.title == expected_title
+ result = backend.make_search_result("Artist", "Beets song", search_item)
- @pytest.mark.parametrize(
- "lyrics",
- [
- "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
- """All material found on this site is property\n
- of mywickedsongtext brand""",
- """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
- ],
- )
- def test_bad_lyrics(self, backend, lyrics):
- assert not backend.is_lyrics(lyrics)
+ assert result.artist == expected_artist
+ assert result.title == expected_title
class TestGeniusLyrics(LyricsBackendTest):
From 70554640e579635a72b7292e541a1eb48645f712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH 16/25] Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacocinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
None, allowing `requests` to handle it.
---
beetsplug/lyrics.py | 105 ++++++++++++++++++++++-------------
test/plugins/lyrics_pages.py | 42 ++++++++++++++
test/plugins/test_lyrics.py | 41 +++++++-------
3 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 0982120f2..a19d8c616 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -231,10 +230,16 @@ class RequestHandler:
def fetch_text(
self, url: str, params: JSONDict | None = None, **kwargs
) -> str:
- """Return text / HTML data from the given URL."""
+ """Return text / HTML data from the given URL.
+
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
url = self.format_url(url, params)
self.debug("Fetching HTML from {}", url)
- return r_session.get(url, **kwargs).text
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
@@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
return lyrics
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+ expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+ merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"