From 2ff57505d8aa5dfd233b2ce4b5f932a87a9cca7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 12 Oct 2024 02:20:37 +0100
Subject: [PATCH 01/25] Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and
Google backends.
- Create a new `SearchBackend` base class with a `check_match` method
  that checks whether a candidate's artist and title match the target.
- Officially adopt the previously undocumented `dist_thresh`
  configuration option and mention it in the docs. It controls the
  maximum allowable distance for matching artist and title names.
These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
---
beetsplug/lyrics.py | 117 +++++++++++++++++++++---------------
docs/changelog.rst | 16 +++--
docs/plugins/lyrics.rst | 6 ++
test/plugins/test_lyrics.py | 40 +++++++++++-
4 files changed, 125 insertions(+), 54 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d1d715ce4..84bc26ea7 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,10 +16,10 @@
from __future__ import annotations
-import difflib
import errno
import itertools
import json
+import math
import os.path
import re
import struct
@@ -30,7 +30,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -38,6 +38,7 @@ from unidecode import unidecode
import beets
from beets import plugins, ui
+from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
@@ -488,15 +489,47 @@ class MusiXmatch(DirectBackend):
return lyrics
-class Genius(Backend):
+class SearchBackend(Backend):
+ REQUIRES_BS = True
+
+ @cached_property
+ def dist_thresh(self) -> float:
+ return self.config["dist_thresh"].get(float)
+
+ def check_match(
+ self, target_artist: str, target_title: str, artist: str, title: str
+ ) -> bool:
+ """Check if the given artist and title are 'good enough' match."""
+ max_dist = max(
+ string_dist(target_artist, artist),
+ string_dist(target_title, title),
+ )
+
+ if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+ return True
+
+ if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+ # log out the candidate that did not make it but was close.
+ # This may show a matching candidate with some noise in the name
+ self._log.debug(
+ "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+ artist,
+ title,
+ target_artist,
+ target_title,
+ max_dist,
+ )
+
+ return False
+
+
+class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
- REQUIRES_BS = True
-
base_url = "https://api.genius.com"
def __init__(self, config, log):
@@ -519,19 +552,15 @@ class Genius(Backend):
self._log.debug("Genius API request returned invalid JSON")
return None
- # find a matching artist in the json
+ check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
- hit_artist = hit["result"]["primary_artist"]["name"]
-
- if slug(hit_artist) == slug(artist):
- html = self.fetch_url(hit["result"]["url"])
+ result = hit["result"]
+ if check(result["primary_artist"]["name"], result["title"]):
+ html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
- self._log.debug(
- "Genius failed to find a matching artist for '{0}'", artist
- )
return None
def _search(self, artist, title):
@@ -727,10 +756,9 @@ def scrape_lyrics_from_html(html):
return None
-class Google(Backend):
+class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
- REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None):
@@ -778,21 +806,20 @@ class Google(Backend):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(self, url_link, url_title, title, artist):
+ def is_page_candidate(
+ self, artist: str, title: str, url_link: str, url_title: str
+ ) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title = self.slugify(title.lower())
- artist = self.slugify(artist.lower())
- sitename = re.search(
- "//([^/]+)/.*", self.slugify(url_link.lower())
- ).group(1)
- url_title = self.slugify(url_title.lower())
-
- # Check if URL title contains song title (exact match)
- if url_title.find(title) != -1:
+ title_slug = self.slugify(title.lower())
+ url_title_slug = self.slugify(url_title.lower())
+ if title_slug in url_title_slug:
return True
+ artist = self.slugify(artist.lower())
+ sitename = urlparse(url_link).netloc
+
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
@@ -801,12 +828,9 @@ class Google(Backend):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+ song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
- song_title = song_title.strip("_|")
- typo_ratio = 0.9
- ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
- return ratio >= typo_ratio
+ return self.check_match(artist, title_slug, artist, song_title)
def fetch(self, artist: str, title: str, *_) -> str | None:
params = {
@@ -828,24 +852,21 @@ class Google(Backend):
self._log.debug("google backend error: {0}", reason)
return None
- if "items" in data.keys():
- for item in data["items"]:
- url_link = item["link"]
- url_title = item.get("title", "")
- if not self.is_page_candidate(
- url_link, url_title, title, artist
- ):
- continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ check_candidate = partial(self.is_page_candidate, artist, title)
+ for item in data.get("items", []):
+ url_link = item["link"]
+ if not check_candidate(url_link, item.get("title", "")):
+ continue
+ html = self.fetch_url(url_link)
+ if not html:
+ continue
+ lyrics = scrape_lyrics_from_html(html)
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
@@ -869,6 +890,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
+ "dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -880,7 +902,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
- "dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 46fa3b64e..25ae5bc10 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,15 @@ been dropped.
New features:
+* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
+ provides more fine-grained control over how pre-populated genre tags are
+ handled. The ``force`` option now behaves in a more conventional manner.
+ :bug:`4982`
+* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
+ control the maximum allowed distance between the lyrics search result and the
+ tagged item's artist and title. This is useful for preventing false positives
+ when fetching lyrics.
+
Bug fixes:
* :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
@@ -55,10 +64,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
-* :doc:`plugins/lastgenre`: The new configuration option, ``keep_existing``,
- provides more fine-grained control over how pre-populated genre tags are
- handled. The ``force`` option now behaves in a more conventional manner.
- :bug:`4982`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+ to match lyrics when there is a slight variation in the artist name.
+ :bug:`4791`
For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70..d080b1f94 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+ combination of the music file and lyrics candidate to consider them a match.
+ Lower values will make the plugin more strict, higher values will make it
+ more lenient. This does not apply to the ``lrclib`` backend as it matches
+ durations.
+ Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 47c983770..812399698 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -161,6 +161,42 @@ class TestLyricsUtils:
assert lyrics.slug(text) == expected
+class TestSearchBackend:
+ @pytest.fixture
+ def backend(self, dist_thresh):
+ plugin = lyrics.LyricsPlugin()
+ plugin.config.set({"dist_thresh": dist_thresh})
+ return lyrics.SearchBackend(plugin.config, plugin._log)
+
+ @pytest.mark.parametrize(
+ "dist_thresh, target_artist, artist, should_match",
+ [
+ (0.11, "Target Artist", "Target Artist", True),
+ (0.11, "Target Artist", "Target Artis", True),
+ (0.11, "Target Artist", "Target Arti", False),
+ (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+ (0.11, "beets song", "beats song", True),
+ (0.10, "beets song", "beats song", False),
+ (
+ 0.11,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ False,
+ ),
+ (
+ 0.12,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ True,
+ ),
+ ],
+ )
+ def test_check_match(self, backend, target_artist, artist, should_match):
+ assert (
+ backend.check_match(target_artist, "", artist, "") == should_match
+ )
+
+
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ class TestGoogleLyrics(LyricsBackendTest):
self, backend, lyrics_html, url_title, artist, should_be_candidate
):
result = backend.is_page_candidate(
+ artist,
+ self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
- self.TITLE,
- artist,
)
assert bool(result) == should_be_candidate
From c40db1034acb5e8164fa113860427ace2e7308c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Tue, 27 Aug 2024 13:43:31 +0100
Subject: [PATCH 02/25] Make lyrics plugin documentation slightly more clear
---
beetsplug/lyrics.py | 6 +-
docs/plugins/lyrics.rst | 142 ++++++++++++++++++++--------------------
2 files changed, 76 insertions(+), 72 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 84bc26ea7..2f8998d96 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -893,8 +893,10 @@ class LyricsPlugin(plugins.BeetsPlugin):
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
- "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
- "76V-uFL5jks5dNvcGCdarqFjDhP9c",
+ "genius_api_key": (
+ "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
+ "76V-uFL5jks5dNvcGCdarqFjDhP9c"
+ ),
"fallback": None,
"force": False,
"local": False,
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d080b1f94..f034cf47a 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
=============
The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
.. _Genius.com: https://genius.com/
.. _Tekstowo.pl: https://www.tekstowo.pl/
.. _LRCLIB: https://lrclib.net/
-Fetch Lyrics During Import
---------------------------
+Install
+-------
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+Firstly, enable ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then, install ``beets`` with ``lyrics`` extra
.. code-block:: bash
pip install "beets[lyrics]"
+Fetch Lyrics During Import
+--------------------------
+
When importing new files, beets will now fetch lyrics for files that don't
already have them. The lyrics will be stored in the beets database. If the
``import.write`` config option is on, then the lyrics will also be written to
@@ -29,52 +31,52 @@ the files' tags.
Configuration
-------------
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+ lyrics:
+ auto: yes
+ bing_client_secret: null
+ bing_lang_from: []
+ bing_lang_to: null
+ dist_thresh: 0.11
+ fallback: null
+ force: no
+ google_API_key: null
+ google_engine_ID: 009217259823014548361:lndtuqkycfu
+ sources: [lrclib, google, genius, tekstowo]
+ synced: no
+
+The available options are:
- **auto**: Fetch lyrics automatically during import.
- Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
- (to :ref:`lyrics-translation`)
+ (see :ref:`lyrics-translation`)
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
- Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
- Default: None.
- **dist_thresh**: The maximum distance between the artist and title
combination of the music file and lyrics candidate to consider them a match.
Lower values will make the plugin more strict, higher values will make it
more lenient. This does not apply to the ``lrclib`` backend as it matches
durations.
- Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
- Default: None.
- **force**: By default, beets won't fetch lyrics if the files already have
ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
- Default: ``no``.
- **google_API_key**: Your Google API key (to enable the Google Custom Search
backend).
- Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The `beets custom search engine`_, which gathers an updated list of
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
- to all available sources.
- Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
- ``google`` source will be automatically deactivated if no ``google_API_key``
- is setup.
- The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
- BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
- lyrics:
- fallback: ''
- google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
- google_engine_ID: 009217259823014548361:lndtuqkycfu
+ to all available sources. The ``google`` source will be automatically
+ deactivated if no ``google_API_key`` is setup.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+ Currently ``lrclib`` is the only source that provides them.
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
@@ -89,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
lyrics will be added to the beets database and, if ``import.write`` is on,
embedded into files' metadata.
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Inversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which show lyrics faster without using the network at all.
Rendering Lyrics into Other Formats
-----------------------------------
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
command is run. They are not overwritten on subsequent runs, so you can safely
modify these files to customize the output.
+Sphinx supports various `builders`_, see a few suggestions:
+
+
+.. admonition:: Build an HTML version
+
+ ::
+
+ sphinx-build -b html . _build/html
+
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
+
+ ::
+
+ sphinx-build -b epub3 . _build/epub
+
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+ ::
+
+ sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
.. _Sphinx: https://www.sphinx-doc.org/
.. _reStructuredText: http://docutils.sourceforge.net/rst.html
-
-Sphinx supports various `builders
-`_, but here are a
-few suggestions.
-
- * Build an HTML version::
-
- sphinx-build -b html . _build/html
-
- * Build an ePUB3 formatted file, usable on ebook readers::
-
- sphinx-build -b epub3 . _build/epub
-
- * Build a PDF file, which incidentally also builds a LaTeX file::
-
- sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
-
-.. _activate-google-custom-search:
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
Activate Google Custom Search
------------------------------
You need to `register for a Google API key`_. Set the ``google_API_key``
configuration option to your key.
+
Then add ``google`` to the list of sources in your configuration (or use
default list, which includes it as long as you have an API key).
If you use default ``google_engine_ID``, we recommend limiting the sources to
``google`` as the other sources are already included in the Google results.
-.. _register for a Google API key: https://console.developers.google.com/
-
Optionally, you can `define a custom search engine`_. Get your search engine's
token and use it for your ``google_engine_ID`` configuration option. By
default, beets use a list of sources known to be scrapeable.
-.. _define a custom search engine: https://www.google.com/cse/all
-
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on other declared data sources.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
-
-These backends are enabled by default.
.. _lyrics-translation:
@@ -167,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
to the `Microsoft Translator API`_. Follow the four steps process, specifically
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target ``language code``.
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
From 06eac79c0d5f58c63e0ca23ea1897ad87bd7f585 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 4 Sep 2024 04:15:46 +0100
Subject: [PATCH 03/25] Centralize requests setup with requests.Session
Improve requests performance with requests.Session which uses connection
pooling for repeated requests to the same host.
Additionally, this centralizes request configuration, making sure that
we use the same timeout and provide beets user agent for all requests.
---
beetsplug/lyrics.py | 68 +++++++++++++++++----------------------------
setup.cfg | 4 +--
2 files changed, 28 insertions(+), 44 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2f8998d96..8e9452a2c 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,6 +16,7 @@
from __future__ import annotations
+import atexit
import errno
import itertools
import json
@@ -24,13 +25,12 @@ import os.path
import re
import struct
import unicodedata
-import warnings
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlparse
import requests
from typing_extensions import TypedDict
@@ -106,6 +106,22 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
+class TimeoutSession(requests.Session):
+ def request(self, *args, **kwargs):
+ kwargs.setdefault("timeout", 10)
+ return super().request(*args, **kwargs)
+
+
+r_session = TimeoutSession()
+r_session.headers.update({"User-Agent": USER_AGENT})
+
+
+@atexit.register
+def close_session():
+ """Close the requests session on shut down."""
+ r_session.close()
+
+
# Utilities.
@@ -246,21 +262,7 @@ class Backend:
is unreachable.
"""
try:
- # Disable the InsecureRequestWarning that comes from using
- # `verify=false`.
- # https://github.com/kennethreitz/requests/issues/2214
- # We're not overly worried about the NSA MITMing our lyrics scraper
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- r = requests.get(
- url,
- verify=False,
- headers={
- "User-Agent": USER_AGENT,
- },
- timeout=10,
- **kwargs,
- )
+ r = r_session.get(url)
except requests.RequestException as exc:
self._log.debug("lyrics request failed: {0}", exc)
return
@@ -368,9 +370,7 @@ class LRCLib(Backend):
def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
- kwargs.setdefault("timeout", 10)
- kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
- r = requests.get(*args, **kwargs)
+ r = r_session.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()
@@ -535,10 +535,7 @@ class Genius(SearchBackend):
def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["genius_api_key"].as_str()
- self.headers = {
- "Authorization": "Bearer %s" % self.api_key,
- "User-Agent": USER_AGENT,
- }
+ self.headers = {"Authorization": f"Bearer {self.api_key}"}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -573,18 +570,13 @@ class Genius(SearchBackend):
search_url = self.base_url + "/search"
data = {"q": title + " " + artist.lower()}
try:
- response = requests.get(
- search_url,
- params=data,
- headers=self.headers,
- timeout=10,
- )
+ r = r_session.get(search_url, params=data, headers=self.headers)
except requests.RequestException as exc:
self._log.debug("Genius API request failed: {0}", exc)
return None
try:
- return response.json()
+ return r.json()
except ValueError:
return None
@@ -979,13 +971,7 @@ class LyricsPlugin(plugins.BeetsPlugin):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = json.loads(
- requests.post(
- oauth_url,
- data=urlencode(params),
- timeout=10,
- ).content
- )
+ oauth_token = r_session.post(oauth_url, params=params).json()
if "access_token" in oauth_token:
return "Bearer " + oauth_token["access_token"]
else:
@@ -1202,10 +1188,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
"https://api.microsofttranslator.com/v2/Http.svc/"
"Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
)
- r = requests.get(
- url,
- headers={"Authorization ": self.bing_auth_token},
- timeout=10,
+ r = r_session.get(
+ url, headers={"Authorization": self.bing_auth_token}
)
if r.status_code != 200:
self._log.debug(
diff --git a/setup.cfg b/setup.cfg
index 15ca23f65..8e3d7e3b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
precision = 2
skip_empty = true
show_missing = true
-exclude_lines =
- pragma: no cover
+exclude_also =
+ @atexit.register
if TYPE_CHECKING
if typing.TYPE_CHECKING
raise AssertionError
From 283c513c72db7e960e51363607efb8eb0766c823 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:14:15 +0100
Subject: [PATCH 04/25] Centralize request error handling
---
beetsplug/lyrics.py | 228 +++++++++++++++---------------------
test/plugins/test_lyrics.py | 62 ++++++----
2 files changed, 136 insertions(+), 154 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8e9452a2c..5a35519b5 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -19,13 +19,12 @@ from __future__ import annotations
import atexit
import errno
import itertools
-import json
import math
import os.path
import re
import struct
import unicodedata
-from contextlib import suppress
+from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
@@ -108,8 +107,14 @@ class NotFoundError(requests.exceptions.HTTPError):
class TimeoutSession(requests.Session):
def request(self, *args, **kwargs):
+ """Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
- return super().request(*args, **kwargs)
+ r = super().request(*args, **kwargs)
+ if r.status_code == HTTPStatus.NOT_FOUND:
+ raise NotFoundError("HTTP Error: Not Found", response=r)
+ r.raise_for_status()
+
+ return r
r_session = TimeoutSession()
@@ -250,28 +255,36 @@ else:
return None
-class Backend:
+class RequestHandler:
+ _log: beets.logging.Logger
+
+ def fetch_text(self, url: str, **kwargs) -> str:
+ """Return text / HTML data from the given URL."""
+ self._log.debug("Fetching HTML from {}", url)
+ return r_session.get(url, **kwargs).text
+
+ def fetch_json(self, url: str, **kwargs):
+ """Return JSON data from the given URL."""
+ self._log.debug("Fetching JSON from {}", url)
+ return r_session.get(url, **kwargs).json()
+
+ @contextmanager
+ def handle_request(self) -> Iterator[None]:
+ try:
+ yield
+ except requests.JSONDecodeError:
+ self._log.warning("Could not decode response JSON data")
+ except requests.RequestException as exc:
+ self._log.warning("Request error: {}", exc)
+
+
+class Backend(RequestHandler):
REQUIRES_BS = False
def __init__(self, config, log):
self._log = log
self.config = config
- def fetch_url(self, url, **kwargs):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- r = r_session.get(url)
- except requests.RequestException as exc:
- self._log.debug("lyrics request failed: {0}", exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
- return None
-
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
@@ -368,15 +381,6 @@ class LRCLib(Backend):
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
- def fetch_json(self, *args, **kwargs):
- """Wrap the request method to raise an exception on HTTP errors."""
- r = r_session.get(*args, **kwargs)
- if r.status_code == HTTPStatus.NOT_FOUND:
- raise NotFoundError("HTTP Error: Not Found", response=r)
- r.raise_for_status()
-
- return r.json()
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -417,13 +421,7 @@ class LRCLib(Backend):
if item := self.pick_best_match(candidates):
return item.get_text(self.config["synced"])
except StopIteration:
- pass
- except requests.JSONDecodeError:
- self.warn("Could not decode response JSON data")
- except requests.RequestException as exc:
- self.warn("Request error: {}", exc)
-
- return None
+ return None
class DirectBackend(Backend):
@@ -463,9 +461,7 @@ class MusiXmatch(DirectBackend):
def fetch(self, artist: str, title: str, *_) -> str | None:
url = self.build_url(artist, title)
- html = self.fetch_url(url)
- if not html:
- return None
+ html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
self._log.warning(
"we are blocked at MusixMatch: url %s failed" % url
@@ -531,6 +527,7 @@ class Genius(SearchBackend):
"""
base_url = "https://api.genius.com"
+ search_url = f"{base_url}/search"
def __init__(self, config, log):
super().__init__(config, log)
@@ -553,10 +550,9 @@ class Genius(SearchBackend):
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- html = self.fetch_url(result["url"])
- if not html:
- return None
- return self._scrape_lyrics_from_html(html)
+ return self._scrape_lyrics_from_html(
+ self.fetch_text(result["url"])
+ )
return None
@@ -567,29 +563,20 @@ class Genius(SearchBackend):
:returns: json response
"""
- search_url = self.base_url + "/search"
- data = {"q": title + " " + artist.lower()}
- try:
- r = r_session.get(search_url, params=data, headers=self.headers)
- except requests.RequestException as exc:
- self._log.debug("Genius API request failed: {0}", exc)
- return None
-
- try:
- return r.json()
- except ValueError:
- return None
+ return self.fetch_json(
+ self.search_url,
+ params={"q": f"{title} {artist.lower()}"},
+ headers=self.headers,
+ )
def replace_br(self, lyrics_div):
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
- def _scrape_lyrics_from_html(self, html):
+ def _scrape_lyrics_from_html(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = try_parse_html(html)
- if not soup:
- return
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in soup("script")]
@@ -660,10 +647,12 @@ class Tekstowo(DirectBackend):
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist: str, title: str, *_) -> str | None:
- if html := self.fetch_url(self.build_url(artist, title)):
- return self.extract_lyrics(html)
-
- return None
+ # We are expecting to receive a 404 since we are guessing the URL.
+ # Thus suppress the error so that it does not end up in the logs.
+ with suppress(NotFoundError):
+ return self.extract_lyrics(
+ self.fetch_text(self.build_url(artist, title))
+ )
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
@@ -717,7 +706,7 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html):
+def scrape_lyrics_from_html(html: str) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
@@ -737,8 +726,6 @@ def scrape_lyrics_from_html(html):
# extract all long text blocks that are not code
soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
- if not soup:
- return None
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
@@ -831,39 +818,26 @@ class Google(SearchBackend):
"q": f"{artist} {title}",
}
- data = self.fetch_url(self.SEARCH_URL, params=params)
- if not data:
- self._log.debug("google backend returned no data")
- return None
- try:
- data = json.loads(data)
- except ValueError as exc:
- self._log.debug("google backend returned malformed JSON: {}", exc)
- if "error" in data:
- reason = data["error"]["errors"][0]["reason"]
- self._log.debug("google backend error: {0}", reason)
- return None
-
check_candidate = partial(self.is_page_candidate, artist, title)
- for item in data.get("items", []):
+ for item in self.fetch_json(self.SEARCH_URL, params=params).get(
+ "items", []
+ ):
url_link = item["link"]
if not check_candidate(url_link, item.get("title", "")):
continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ with self.handle_request():
+ lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
-class LyricsPlugin(plugins.BeetsPlugin):
+class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
@@ -934,7 +908,6 @@ class LyricsPlugin(plugins.BeetsPlugin):
self.config["bing_lang_from"] = [
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
]
- self.bing_auth_token = None
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
self._log.warning(
@@ -962,7 +935,8 @@ class LyricsPlugin(plugins.BeetsPlugin):
return enabled_sources
- def get_bing_access_token(self):
+ @cached_property
+ def bing_access_token(self) -> str | None:
params = {
"client_id": "beets",
"client_secret": self.config["bing_client_secret"],
@@ -971,14 +945,11 @@ class LyricsPlugin(plugins.BeetsPlugin):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = r_session.post(oauth_url, params=params).json()
- if "access_token" in oauth_token:
- return "Bearer " + oauth_token["access_token"]
- else:
- self._log.warning(
- "Could not get Bing Translate API access token."
- ' Check your "bing_client_secret" password'
- )
+ with self.handle_request():
+ r = r_session.post(oauth_url, params=params)
+ return r.json()["access_token"]
+
+ return None
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
@@ -1167,44 +1138,39 @@ class LyricsPlugin(plugins.BeetsPlugin):
None if no lyrics were found.
"""
for backend in self.backends:
- lyrics = backend.fetch(artist, title, *args)
- if lyrics:
- self._log.debug(
- "got lyrics from backend: {0}", backend.__class__.__name__
- )
- return _scrape_strip_cruft(lyrics, True)
+ with backend.handle_request():
+ if lyrics := backend.fetch(artist, title, *args):
+ self._log.debug(
+ "got lyrics from backend: {0}",
+ backend.__class__.__name__,
+ )
+ return _scrape_strip_cruft(lyrics, True)
return None
def append_translation(self, text, to_lang):
from xml.etree import ElementTree
- if not self.bing_auth_token:
- self.bing_auth_token = self.get_bing_access_token()
- if self.bing_auth_token:
- # Extract unique lines to limit API request size per song
- text_lines = set(text.split("\n"))
- url = (
- "https://api.microsofttranslator.com/v2/Http.svc/"
- "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
+ if not (token := self.bing_access_token):
+ self._log.warning(
+ "Could not get Bing Translate API access token. "
+ "Check your 'bing_client_secret' password."
)
- r = r_session.get(
- url, headers={"Authorization": self.bing_auth_token}
+ return text
+
+ # Extract unique lines to limit API request size per song
+ lines = text.split("\n")
+ unique_lines = set(lines)
+ url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+ with self.handle_request():
+ text = self.fetch_text(
+ url,
+ headers={"Authorization": f"Bearer {token}"},
+ params={"text": "|".join(unique_lines), "to": to_lang},
)
- if r.status_code != 200:
- self._log.debug(
- "translation API error {}: {}", r.status_code, r.text
- )
- if "token has expired" in r.text:
- self.bing_auth_token = None
- return self.append_translation(text, to_lang)
- return text
- lines_translated = ElementTree.fromstring(
- r.text.encode("utf-8")
- ).text
- # Use a translation mapping dict to build resulting lyrics
- translations = dict(zip(text_lines, lines_translated.split("|")))
- result = ""
- for line in text.split("\n"):
- result += "{} / {}\n".format(line, translations[line])
- return result
+ if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+ # Use a translation mapping dict to build resulting lyrics
+ translations = dict(zip(unique_lines, translated.split("|")))
+ return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+ return text
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 812399698..b77054b52 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -202,7 +202,7 @@ def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
-class LyricsBackendTest(PluginMixin):
+class LyricsPluginMixin(PluginMixin):
plugin = "lyrics"
@pytest.fixture
@@ -218,6 +218,42 @@ class LyricsBackendTest(PluginMixin):
return lyrics.LyricsPlugin()
+
+class TestLyricsPlugin(LyricsPluginMixin):
+ @pytest.fixture
+ def backend_name(self):
+ """Return lyrics configuration to test."""
+ return "lrclib"
+
+ @pytest.mark.parametrize(
+ "request_kwargs, expected_log_match",
+ [
+ (
+ {"status_code": HTTPStatus.BAD_GATEWAY},
+ r"lyrics: Request error: 502",
+ ),
+ ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ],
+ )
+ def test_error_handling(
+ self,
+ requests_mock,
+ lyrics_plugin,
+ caplog,
+ request_kwargs,
+ expected_log_match,
+ ):
+ """Errors are logged with the plugin name."""
+ requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
+
+ assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
+ assert caplog.messages
+ last_log = caplog.messages[-1]
+ assert last_log
+ assert re.search(expected_log_match, last_log, re.I)
+
+
+class LyricsBackendTest(LyricsPluginMixin):
@pytest.fixture
def backend(self, lyrics_plugin):
"""Return a lyrics backend instance."""
@@ -399,13 +435,9 @@ class TestLRCLibLyrics(LyricsBackendTest):
return "lrclib"
@pytest.fixture
- def request_kwargs(self, response_data):
- return {"json": response_data}
-
- @pytest.fixture
- def fetch_lyrics(self, backend, requests_mock, request_kwargs):
+ def fetch_lyrics(self, backend, requests_mock, response_data):
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
- requests_mock.get(backend.SEARCH_URL, **request_kwargs)
+ requests_mock.get(backend.SEARCH_URL, json=response_data)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@@ -478,19 +510,3 @@ class TestLRCLibLyrics(LyricsBackendTest):
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
-
- @pytest.mark.parametrize(
- "request_kwargs, expected_log_match",
- [
- (
- {"status_code": HTTPStatus.BAD_GATEWAY},
- r"LRCLib: Request error: 502",
- ),
- ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
- ],
- )
- def test_error(self, caplog, fetch_lyrics, expected_log_match):
- assert fetch_lyrics() is None
- assert caplog.messages
- assert (last_log := caplog.messages[-1])
- assert re.search(expected_log_match, last_log, re.I)
From cb29605bfd84c20725a7036258dba0473266ffd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 07:35:24 +0100
Subject: [PATCH 05/25] Include class name in the log messages
---
beetsplug/lyrics.py | 113 +++++++++++++++++++-----------------
test/plugins/test_lyrics.py | 6 +-
2 files changed, 64 insertions(+), 55 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5a35519b5..db819c513 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,8 +28,8 @@ from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
-from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlparse
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -58,6 +58,8 @@ try:
except ImportError:
HAS_LANGDETECT = False
+JSONDict = dict[str, Any]
+
DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"", re.S)
TAG_RE = re.compile(r"<[^>]*>")
@@ -258,14 +260,37 @@ else:
class RequestHandler:
_log: beets.logging.Logger
- def fetch_text(self, url: str, **kwargs) -> str:
+ def debug(self, message: str, *args) -> None:
+ """Log a debug message with the class name."""
+ self._log.debug(f"{self.__class__.__name__}: {message}", *args)
+
+ def info(self, message: str, *args) -> None:
+ """Log an info message with the class name."""
+ self._log.info(f"{self.__class__.__name__}: {message}", *args)
+
+ def warn(self, message: str, *args) -> None:
+ """Log warning with the class name."""
+ self._log.warning(f"{self.__class__.__name__}: {message}", *args)
+
+ @staticmethod
+ def format_url(url: str, params: JSONDict | None) -> str:
+ if not params:
+ return url
+
+ return f"{url}?{urlencode(params)}"
+
+ def fetch_text(
+ self, url: str, params: JSONDict | None = None, **kwargs
+ ) -> str:
"""Return text / HTML data from the given URL."""
- self._log.debug("Fetching HTML from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching HTML from {}", url)
return r_session.get(url, **kwargs).text
- def fetch_json(self, url: str, **kwargs):
+ def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
- self._log.debug("Fetching JSON from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching JSON from {}", url)
return r_session.get(url, **kwargs).json()
@contextmanager
@@ -273,9 +298,9 @@ class RequestHandler:
try:
yield
except requests.JSONDecodeError:
- self._log.warning("Could not decode response JSON data")
+ self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
- self._log.warning("Request error: {}", exc)
+ self.warn("Request error: {}", exc)
class Backend(RequestHandler):
@@ -377,10 +402,6 @@ class LRCLib(Backend):
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
- def warn(self, message: str, *args) -> None:
- """Log a warning message with the class name."""
- self._log.warning(f"{self.__class__.__name__}: {message}", *args)
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -415,13 +436,12 @@ class LRCLib(Backend):
"""Fetch lyrics text for the given song data."""
evaluate_item = partial(LRCLyrics.make, target_duration=length)
- try:
- for group in self.fetch_candidates(artist, title, album, length):
- candidates = [evaluate_item(item) for item in group]
- if item := self.pick_best_match(candidates):
- return item.get_text(self.config["synced"])
- except StopIteration:
- return None
+ for group in self.fetch_candidates(artist, title, album, length):
+ candidates = [evaluate_item(item) for item in group]
+ if item := self.pick_best_match(candidates):
+ return item.get_text(self.config["synced"])
+
+ return None
class DirectBackend(Backend):
@@ -463,9 +483,7 @@ class MusiXmatch(DirectBackend):
html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
- self._log.warning(
- "we are blocked at MusixMatch: url %s failed" % url
- )
+ self.warn("Failed: Blocked IP address")
return None
html_parts = html.split('
str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
@@ -747,7 +764,7 @@ class Google(SearchBackend):
bad_triggers_occ = []
nb_lines = text.count("\n")
if nb_lines <= 1:
- self._log.debug("Ignoring too short lyrics '{0}'", text)
+ self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
@@ -766,7 +783,7 @@ class Google(SearchBackend):
)
if bad_triggers_occ:
- self._log.debug("Bad triggers detected: {0}", bad_triggers_occ)
+ self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
@@ -779,7 +796,7 @@ class Google(SearchBackend):
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
- self._log.exception("Failing to normalize '{0}'", text)
+ self.debug("Failed to normalize '{}'", text)
return text
BY_TRANS = ["by", "par", "de", "von"]
@@ -831,7 +848,7 @@ class Google(SearchBackend):
continue
if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
+ self.debug("Got lyrics from {}", item["displayLink"])
return lyrics
return None
@@ -900,9 +917,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
# configuration includes `google`. This way, the source
# is silent by default but can be enabled just by
# setting an API key.
- self._log.debug(
- "Disabling google source: " "no API key configured."
- )
+ self.debug("Disabling google source: " "no API key configured.")
sources.remove("google")
self.config["bing_lang_from"] = [
@@ -910,15 +925,14 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
]
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
- self._log.warning(
- "To use bing translations, you need to "
- "install the langdetect module. See the "
- "documentation for further details."
+ self.warn(
+ "To use bing translations, you need to install the langdetect "
+ "module. See the documentation for further details."
)
self.backends = [
- self.SOURCE_BACKENDS[source](self.config, self._log)
- for source in sources
+ self.SOURCE_BACKENDS[s](self.config, self._log.getChild(s))
+ for s in sources
]
def sanitize_bs_sources(self, sources):
@@ -949,8 +963,6 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
r = r_session.post(oauth_url, params=params)
return r.json()["access_token"]
- return None
-
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
cmd.parser.add_option(
@@ -1095,7 +1107,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self._log.info("lyrics already present: {0}", item)
+ self.info("Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1111,7 +1123,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self._log.info("fetched lyrics: {0}", item)
+ self.info("Lyrics found: {}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1122,7 +1134,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics, self.config["bing_lang_to"]
)
else:
- self._log.info("lyrics not found: {0}", item)
+ self.info("Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
@@ -1137,13 +1149,10 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
"""
+ self.info("Fetching lyrics for {} - {}", artist, title)
for backend in self.backends:
with backend.handle_request():
if lyrics := backend.fetch(artist, title, *args):
- self._log.debug(
- "got lyrics from backend: {0}",
- backend.__class__.__name__,
- )
return _scrape_strip_cruft(lyrics, True)
return None
@@ -1152,7 +1161,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
from xml.etree import ElementTree
if not (token := self.bing_access_token):
- self._log.warning(
+ self.warn(
"Could not get Bing Translate API access token. "
"Check your 'bing_client_secret' password."
)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index b77054b52..817eb4af9 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -230,9 +230,9 @@ class TestLyricsPlugin(LyricsPluginMixin):
[
(
{"status_code": HTTPStatus.BAD_GATEWAY},
- r"lyrics: Request error: 502",
+ r"LRCLib: Request error: 502",
),
- ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
],
)
def test_error_handling(
@@ -243,7 +243,7 @@ class TestLyricsPlugin(LyricsPluginMixin):
request_kwargs,
expected_log_match,
):
- """Errors are logged with the plugin name."""
+ """Errors are logged with the backend name."""
requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
From 7c2fb31136d3ecba9e3974dca7bf7729bd0db500 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 11 Sep 2024 07:51:43 +0100
Subject: [PATCH 06/25] Leave a single chef in the kitchen
---
beetsplug/lyrics.py | 79 ++++++++++---------------------------
test/plugins/test_lyrics.py | 6 +--
2 files changed, 23 insertions(+), 62 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index db819c513..3d0e09673 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -44,8 +44,7 @@ if TYPE_CHECKING:
from beets.library import Item
try:
- import bs4
- from bs4 import SoupStrainer
+ from bs4 import BeautifulSoup
HAS_BEAUTIFUL_SOUP = True
except ImportError:
@@ -246,17 +245,6 @@ def slug(text):
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
-if HAS_BEAUTIFUL_SOUP:
-
- def try_parse_html(html, **kwargs):
- return bs4.BeautifulSoup(html, "html.parser", **kwargs)
-
-else:
-
- def try_parse_html(html, **kwargs):
- return None
-
-
class RequestHandler:
_log: beets.logging.Logger
@@ -565,9 +553,7 @@ class Genius(SearchBackend):
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- return self._scrape_lyrics_from_html(
- self.fetch_text(result["url"])
- )
+ return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
return None
@@ -584,17 +570,9 @@ class Genius(SearchBackend):
headers=self.headers,
)
- def replace_br(self, lyrics_div):
- for br in lyrics_div.find_all("br"):
- br.replace_with("\n")
-
- def _scrape_lyrics_from_html(self, html: str) -> str | None:
+ def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
-
- soup = try_parse_html(html)
-
- # Remove script tags that they put in the middle of the lyrics.
- [h.extract() for h in soup("script")]
+ soup = get_soup(html)
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
@@ -609,7 +587,6 @@ class Genius(SearchBackend):
)
lyrics = ""
for lyrics_div in lyrics_divs:
- self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
@@ -633,7 +610,6 @@ class Genius(SearchBackend):
return None
lyrics_div = verse_div.parent
- self.replace_br(lyrics_div)
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
@@ -665,17 +641,14 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.extract_lyrics(
+ return self.scrape_lyrics(
self.fetch_text(self.build_url(artist, title))
)
return None
- def extract_lyrics(self, html: str) -> str | None:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- soup = try_parse_html(html)
+ def scrape_lyrics(self, html: str) -> str | None:
+ soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -723,33 +696,11 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html: str) -> str | None:
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
- """
-
- def is_text_notcode(text):
- if not text:
- return False
- length = len(text)
- return (
- length > 20
- and text.count(" ") > length / 25
- and (text.find("{") == -1 or text.find(";") == -1)
- )
-
+def get_soup(html: str) -> BeautifulSoup:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
- # extract all long text blocks that are not code
- soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- else:
- return None
+ return BeautifulSoup(html, "html.parser")
class Google(SearchBackend):
@@ -757,6 +708,16 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+ @staticmethod
+ def scrape_lyrics(html: str) -> str | None:
+ soup = get_soup(html)
+
+ # Get the longest text element (if any).
+ strings = sorted(soup.stripped_strings, key=len, reverse=True)
+ if strings:
+ return strings[0]
+ return None
+
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -843,7 +804,7 @@ class Google(SearchBackend):
if not check_candidate(url_link, item.get("title", "")):
continue
with self.handle_request():
- lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ lyrics = self.scrape_lyrics(self.fetch_text(url_link))
if not lyrics:
continue
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 817eb4af9..d412d318b 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -328,7 +328,7 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
+ result = backend.scrape_lyrics(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
@@ -390,7 +390,7 @@ class TestGeniusLyrics(LyricsBackendTest):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend._scrape_lyrics_from_html(lyrics_html) or ""
+ result = backend.scrape_lyrics(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -411,7 +411,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From dd9f178fffd1cd130bde56fdbaae853aab4ca0c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 19 Oct 2024 03:30:41 +0100
Subject: [PATCH 07/25] Do not try to strip cruft from the parsed lyrics text.
Having removed it I found that only the Genius lyrics changed: it had an
extra new line. Thus I defined a function 'collapse_newlines' which now
gets called for the Genius lyrics.
---
beetsplug/lyrics.py | 24 +++++++++++-------------
test/plugins/test_lyrics.py | 5 ++---
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 3d0e09673..2ec362356 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -59,9 +59,6 @@ except ImportError:
JSONDict = dict[str, Any]
-DIV_RE = re.compile(r"<(/?)div>?", re.I)
-COMMENT_RE = re.compile(r"", re.S)
-TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -552,8 +549,11 @@ class Genius(SearchBackend):
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
result = hit["result"]
- if check(result["primary_artist"]["name"], result["title"]):
- return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
+ url = hit["result"]["url"]
+ if check(result["primary_artist"]["name"], result["title"]) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
@@ -670,7 +670,10 @@ def remove_credits(text):
return text
-def _scrape_strip_cruft(html, plain_text_out=False):
+collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
+
+
+def _scrape_strip_cruft(html: str) -> str:
"""Clean up HTML"""
html = unescape(html)
@@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = re.sub("
two
three"
From f94d2767f9919dd933da0ba870355babd29b371e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH 08/25] Use a single slug implementation
Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify'
method which was a duplicate of 'slug'.
Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
beetsplug/lyrics.py | 33 ++++++++-------------------------
test/plugins/test_lyrics.py | 4 ----
2 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2ec362356..097110e13 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -23,7 +23,6 @@ import math
import os.path
import re
import struct
-import unicodedata
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
@@ -224,7 +223,7 @@ def search_pairs(item):
return itertools.product(artists, multi_titles)
-def slug(text):
+def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@@ -234,10 +233,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
-
- This somewhat duplicates the :func:`Google.slugify` function but
- slugify is not as generic as this one, which can be reused
- elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
@@ -745,19 +740,6 @@ class Google(SearchBackend):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
- def slugify(self, text):
- """Normalize a string and remove non-alphanumeric characters."""
- text = re.sub(r"[-'_\s]", "_", text)
- text = re.sub(r"_+", "_", text).strip("_")
- pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, r"\g<1>", text).strip()
- try:
- text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
- text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
- except UnicodeDecodeError:
- self.debug("Failed to normalize '{}'", text)
- return text
-
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
@@ -767,23 +749,24 @@ class Google(SearchBackend):
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title_slug = self.slugify(title.lower())
- url_title_slug = self.slugify(url_title.lower())
+ title_slug = slug(title)
+ url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
- artist = self.slugify(artist.lower())
+ artist = slug(artist)
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
- [by + "_" + artist for by in self.BY_TRANS]
+ [by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
- tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+ song_title = re.sub(
+ "(%s)" % "|".join(tokens), "", url_title_slug
+ ).strip("-")
return self.check_match(artist, title_slug, artist, song_title)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 9e24d46c2..1d264338a 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -370,10 +370,6 @@ the following form.
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
- def test_slugify(self, backend):
- text = "http://site.com/\xe7afe-au_lait(boisson)"
- assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
class TestGeniusLyrics(LyricsBackendTest):
@pytest.fixture(scope="class")
From 8bdc2c6cf0ee58dad2ae236a8455b1ae06a5d4a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Thu, 19 Sep 2024 20:00:44 +0100
Subject: [PATCH 09/25] lyrics: Add symbols for better visual feedback in the
logs
---
beetsplug/lyrics.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 097110e13..5a1301039 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1049,7 +1049,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self.info("Lyrics already present: {}", item)
+ self.info("🔵 Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1065,7 +1065,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self.info("Lyrics found: {}", item)
+ self.info("🟢 Found lyrics: {0}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1076,7 +1076,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
lyrics, self.config["bing_lang_to"]
)
else:
- self.info("Lyrics not found: {}", item)
+ self.info("🔴 Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
From 8a1ce274210fc90aef1865721cfb433e24bcbb77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 27 Sep 2024 22:57:20 +0100
Subject: [PATCH 10/25] lyrics: Do not write item unless lyrics have changed
---
beetsplug/lyrics.py | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5a1301039..5f733b7db 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1077,15 +1077,13 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
)
else:
self.info("🔴 Lyrics not found: {}", item)
- fallback = self.config["fallback"].get()
- if fallback:
- lyrics = fallback
- else:
- return
- item.lyrics = lyrics
- if write:
- item.try_write()
- item.store()
+ lyrics = self.config["fallback"].get()
+
+ if lyrics not in {None, item.lyrics}:
+ item.lyrics = lyrics
+ if write:
+ item.try_write()
+ item.store()
def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
From 55b78249483915993116e6131bd7c6c507dd4d70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 10:33:01 +0100
Subject: [PATCH 11/25] Replace custom unescape implementation by html.unescape
---
beetsplug/lyrics.py | 23 +----------------------
1 file changed, 1 insertion(+), 22 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5f733b7db..8bd7f15d3 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,10 +22,10 @@ import itertools
import math
import os.path
import re
-import struct
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
+from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
@@ -127,27 +127,6 @@ def close_session():
# Utilities.
-def unichar(i):
- try:
- return chr(i)
- except ValueError:
- return struct.pack("i", i).decode("utf-32")
-
-
-def unescape(text):
- """Resolve xx; HTML entities (and some others)."""
- if isinstance(text, bytes):
- text = text.decode("utf-8", "ignore")
- out = text.replace(" ", " ")
-
- def replchar(m):
- num = m.group(1)
- return unichar(int(num))
-
- out = re.sub("(\\d+);", replchar, out)
- return out
-
-
def extract_text_between(html, start_marker, end_marker):
try:
_, html = html.split(start_marker, 1)
From 54fc67b30a8ecaa33fe40790a6f277749c9ebc00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 18:24:22 +0100
Subject: [PATCH 12/25] Remove extract_text_between
---
beetsplug/lyrics.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8bd7f15d3..7adb1bc7e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -127,15 +127,6 @@ def close_session():
# Utilities.
-def extract_text_between(html, start_marker, end_marker):
- try:
- _, html = html.split(start_marker, 1)
- html, _ = html.split(end_marker, 1)
- except ValueError:
- return ""
- return html
-
-
def search_pairs(item):
"""Yield a pairs of artists and titles to search for.
@@ -448,7 +439,7 @@ class MusiXmatch(DirectBackend):
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
for html_part in html_parts:
- lyrics_parts.append(extract_text_between(html_part, ">", "
"))
+ lyrics_parts.append(re.sub(r"^[^>]+>|
.*", "", html_part))
lyrics = "\n".join(lyrics_parts)
lyrics = lyrics.strip(',"').replace("\\n", "\n")
# another odd case: sometimes only that string remains, for
From 745c5eb9f058603c0817003d57adc4c80dc7aecd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 9 Oct 2024 12:12:09 +0100
Subject: [PATCH 13/25] Genius: refactor and simplify
---
beetsplug/_typing.py | 86 ++++++++++++++++++++++++
beetsplug/lyrics.py | 127 ++++++++++--------------------------
docs/changelog.rst | 3 +
test/plugins/test_lyrics.py | 2 +-
4 files changed, 126 insertions(+), 92 deletions(-)
create mode 100644 beetsplug/_typing.py
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 000000000..93f4a2a58
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import TypedDict
+
+JSONDict = dict[str, Any]
+
+
+class LRCLibAPI:
+ class Item(TypedDict):
+ """Lyrics data item returned by the LRCLib API."""
+
+ id: int
+ name: str
+ trackName: str
+ artistName: str
+ albumName: str
+ duration: float | None
+ instrumental: bool
+ plainLyrics: str
+ syncedLyrics: str | None
+
+
+class GeniusAPI:
+ """Genius API data types.
+
+ This documents *only* the fields that are used in the plugin.
+ :attr:`SearchResult` is an exception, since I thought some of the other
+ fields might be useful in the future.
+ """
+
+ class DateComponents(TypedDict):
+ year: int
+ month: int
+ day: int
+
+ class Artist(TypedDict):
+ api_path: str
+ header_image_url: str
+ id: int
+ image_url: str
+ is_meme_verified: bool
+ is_verified: bool
+ name: str
+ url: str
+
+ class Stats(TypedDict):
+ unreviewed_annotations: int
+ hot: bool
+
+ class SearchResult(TypedDict):
+ annotation_count: int
+ api_path: str
+ artist_names: str
+ full_title: str
+ header_image_thumbnail_url: str
+ header_image_url: str
+ id: int
+ lyrics_owner_id: int
+ lyrics_state: str
+ path: str
+ primary_artist_names: str
+ pyongs_count: int | None
+ relationships_index_url: str
+ release_date_components: GeniusAPI.DateComponents
+ release_date_for_display: str
+ release_date_with_abbreviated_month_for_display: str
+ song_art_image_thumbnail_url: str
+ song_art_image_url: str
+ stats: GeniusAPI.Stats
+ title: str
+ title_with_featured: str
+ url: str
+ featured_artists: list[GeniusAPI.Artist]
+ primary_artist: GeniusAPI.Artist
+ primary_artists: list[GeniusAPI.Artist]
+
+ class SearchHit(TypedDict):
+ result: GeniusAPI.SearchResult
+
+ class SearchResponse(TypedDict):
+ hits: list[GeniusAPI.SearchHit]
+
+ class Search(TypedDict):
+ response: GeniusAPI.SearchResponse
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7adb1bc7e..12b2d1f37 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -31,7 +31,6 @@ from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
import requests
-from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@@ -42,6 +41,8 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
+ from ._typing import GeniusAPI, LRCLibAPI
+
try:
from bs4 import BeautifulSoup
@@ -266,20 +267,6 @@ class Backend(RequestHandler):
raise NotImplementedError
-class LRCLibItem(TypedDict):
- """Lyrics data item returned by the LRCLib API."""
-
- id: int
- name: str
- trackName: str
- artistName: str
- albumName: str
- duration: float | None
- instrumental: bool
- plainLyrics: str
- syncedLyrics: str | None
-
-
@dataclass
@total_ordering
class LRCLyrics:
@@ -297,7 +284,9 @@ class LRCLyrics:
return self.dist < other.dist
@classmethod
- def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+ def make(
+ cls, candidate: LRCLibAPI.Item, target_duration: float
+ ) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
@@ -354,7 +343,7 @@ class LRCLib(Backend):
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
- ) -> Iterator[list[LRCLibItem]]:
+ ) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
I found that the ``/get`` endpoint sometimes returns inaccurate or
@@ -494,13 +483,15 @@ class Genius(SearchBackend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
+ LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
+ return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -509,85 +500,39 @@ class Genius(SearchBackend):
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
- json = self._search(artist, title)
- check = partial(self.check_match, artist, title)
- for hit in json["response"]["hits"]:
- result = hit["result"]
- url = hit["result"]["url"]
- if check(result["primary_artist"]["name"], result["title"]) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
+ data = self.fetch_json(
+ self.search_url,
+ params={"q": f"{artist} {title}".lower()},
+ headers=self.headers,
+ )
+ if (url := self.find_lyrics_url(data, artist, title)) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
- def _search(self, artist, title):
- """Searches the genius api for a given artist and title
+ def find_lyrics_url(
+ self, data: GeniusAPI.Search, artist: str, title: str
+ ) -> str | None:
+ """Find URL to the lyrics of the given artist and title.
- https://docs.genius.com/#search-h2
-
- :returns: json response
+ https://docs.genius.com/#search-h2.
"""
- return self.fetch_json(
- self.search_url,
- params={"q": f"{title} {artist.lower()}"},
- headers=self.headers,
- )
+ check = partial(self.check_match, artist, title)
+ for result in (hit["result"] for hit in data["response"]["hits"]):
+ if check(result["artist_names"], result["title"]):
+ return result["url"]
+
+ return None
def scrape_lyrics(self, html: str) -> str | None:
- """Scrape lyrics from a given genius.com html"""
- soup = get_soup(html)
+ if m := self.LYRICS_IN_JSON_RE.search(html):
+ html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ return get_soup(html_text).get_text().strip()
- # Most of the time, the page contains a div with class="lyrics" where
- # all of the lyrics can be found already correctly formatted
- # Sometimes, though, it packages the lyrics into separate divs, most
- # likely for easier ad placement
-
- lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
- if not lyrics_divs:
- self.debug("Received unusual song page html")
- return self._try_extracting_lyrics_from_non_data_lyrics_container(
- soup
- )
- lyrics = ""
- for lyrics_div in lyrics_divs:
- lyrics += lyrics_div.get_text() + "\n\n"
- while lyrics[-1] == "\n":
- lyrics = lyrics[:-1]
- return lyrics
-
- def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
- """Extract lyrics from a div without attribute data-lyrics-container
- This is the second most common layout on genius.com
- """
- verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
- if not verse_div:
- if soup.find(
- "div",
- class_=re.compile("LyricsPlaceholder__Message"),
- string="This song is an instrumental",
- ):
- self.debug("Detected instrumental")
- return INSTRUMENTAL_LYRICS
- else:
- self.debug("Couldn't scrape page using known layouts")
- return None
-
- lyrics_div = verse_div.parent
-
- ads = lyrics_div.find_all(
- "div", class_=re.compile("InreadAd__Container")
- )
- for ad in ads:
- ad.replace_with("\n")
-
- footers = lyrics_div.find_all(
- "div", class_=re.compile("Lyrics__Footer")
- )
- for footer in footers:
- footer.replace_with("")
- return lyrics_div.get_text()
+ return None
class Tekstowo(DirectBackend):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 25ae5bc10..54d085599 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -67,6 +67,9 @@ Bug fixes:
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
to match lyrics when there is a slight variation in the artist name.
:bug:`4791`
+* :doc:`plugins/lyrics`: Fix plugin crash when ``genius`` backend returns empty
+ lyrics.
+ :bug:`5583`
For packagers:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 1d264338a..091776108 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -379,7 +379,7 @@ class TestGeniusLyrics(LyricsBackendTest):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
- ("geniuscom/2pacalleyezonmelyrics", 134),
+ ("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],
From 12c5eaae5e144dcd88e32a0af23701aa346e3d00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 17:04:58 +0100
Subject: [PATCH 14/25] Unite Genius, Tekstowo and Google backends under the
same interface
---
beetsplug/lyrics.py | 165 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 39 ++++-----
2 files changed, 101 insertions(+), 103 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 12b2d1f37..9424a14b6 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -27,7 +27,7 @@ from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse
import requests
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, LRCLibAPI
+ from ._typing import GeniusAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -57,8 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
-JSONDict = dict[str, Any]
-
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -442,6 +440,12 @@ class MusiXmatch(DirectBackend):
return lyrics
+class SearchResult(NamedTuple):
+ artist: str
+ title: str
+ url: str
+
+
class SearchBackend(Backend):
REQUIRES_BS = True
@@ -450,12 +454,12 @@ class SearchBackend(Backend):
return self.config["dist_thresh"].get(float)
def check_match(
- self, target_artist: str, target_title: str, artist: str, title: str
+ self, target_artist: str, target_title: str, result: SearchResult
) -> bool:
- """Check if the given artist and title are 'good enough' match."""
+ """Check if the given search result is a 'good enough' match."""
max_dist = max(
- string_dist(target_artist, artist),
- string_dist(target_title, title),
+ string_dist(target_artist, result.artist),
+ string_dist(target_title, result.title),
)
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@@ -466,8 +470,8 @@ class SearchBackend(Backend):
# This may show a matching candidate with some noise in the name
self.debug(
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
- artist,
- title,
+ result.artist,
+ result.title,
target_artist,
target_title,
max_dist,
@@ -475,61 +479,59 @@ class SearchBackend(Backend):
return False
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ """Search for the given query and yield search results."""
+ raise NotImplementedError
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ check_match = partial(self.check_match, artist, title)
+ for candidate in self.search(artist, title):
+ if check_match(candidate):
+ yield candidate
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ """Fetch lyrics for the given artist and title."""
+ for result in self.get_results(artist, title):
+ if lyrics := self.scrape(self.fetch_text(result.url)):
+ return lyrics
+
+ return None
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ """Scrape the lyrics from the given HTML."""
+ raise NotImplementedError
+
class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
- Simply adapted from
- bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+ Because genius doesn't allow accessing lyrics via the api, we first query
+ the api for a url matching our artist & title, then scrape the HTML text
+ for the JSON data containing the lyrics.
"""
+ SEARCH_URL = "https://api.genius.com/search"
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
- def fetch(self, artist: str, title: str, *_) -> str | None:
- """Fetch lyrics from genius.com
-
- Because genius doesn't allow accessing lyrics via the api,
- we first query the api for a url matching our artist & title,
- then attempt to scrape that url for the lyrics.
- """
-
- data = self.fetch_json(
- self.search_url,
- params={"q": f"{artist} {title}".lower()},
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ search_data: GeniusAPI.Search = self.fetch_json(
+ self.SEARCH_URL,
+ params={"q": f"{artist} {title}"},
headers=self.headers,
)
- if (url := self.find_lyrics_url(data, artist, title)) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
+ for r in (hit["result"] for hit in search_data["response"]["hits"]):
+ yield SearchResult(r["artist_names"], r["title"], r["url"])
- return None
-
- def find_lyrics_url(
- self, data: GeniusAPI.Search, artist: str, title: str
- ) -> str | None:
- """Find URL to the lyrics of the given artist and title.
-
- https://docs.genius.com/#search-h2.
- """
- check = partial(self.check_match, artist, title)
- for result in (hit["result"] for hit in data["response"]["hits"]):
- if check(result["artist_names"], result["title"]):
- return result["url"]
-
- return None
-
- def scrape_lyrics(self, html: str) -> str | None:
- if m := self.LYRICS_IN_JSON_RE.search(html):
- html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ if m := cls.LYRICS_IN_JSON_RE.search(html):
+ html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
return None
@@ -551,13 +553,12 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.scrape_lyrics(
- self.fetch_text(self.build_url(artist, title))
- )
+ return self.scrape(self.fetch_text(self.build_url(artist, title)))
return None
- def scrape_lyrics(self, html: str) -> str | None:
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@@ -616,16 +617,6 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- @staticmethod
- def scrape_lyrics(html: str) -> str | None:
- soup = get_soup(html)
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- return None
-
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -658,17 +649,11 @@ class Google(SearchBackend):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(
- self, artist: str, title: str, url_link: str, url_title: str
- ) -> bool:
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
- """
- title_slug = slug(title)
+ def make_search_result(
+ self, artist: str, url_link: str, url_title: str
+ ) -> SearchResult:
+ """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title)
- if title_slug in url_title_slug:
- return True
-
artist = slug(artist)
sitename = urlparse(url_link).netloc
@@ -683,33 +668,45 @@ class Google(SearchBackend):
"(%s)" % "|".join(tokens), "", url_title_slug
).strip("-")
- return self.check_match(artist, title_slug, artist, song_title)
+ return SearchResult(artist, song_title, url_link)
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
}
- check_candidate = partial(self.is_page_candidate, artist, title)
- for item in self.fetch_json(self.SEARCH_URL, params=params).get(
- "items", []
- ):
- url_link = item["link"]
- if not check_candidate(url_link, item.get("title", "")):
- continue
+ data = self.fetch_json(self.SEARCH_URL, params=params)
+ for item in data.get("items", []):
+ yield self.make_search_result(artist, item["link"], item["title"])
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ return super().get_results(artist, slug(title))
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ for result in self.get_results(artist, title):
with self.handle_request():
- lyrics = self.scrape_lyrics(self.fetch_text(url_link))
+ lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
- self.debug("Got lyrics from {}", item["displayLink"])
+ self.debug(
+ "Got lyrics from {}", urlparse(result.url).netloc
+ )
return lyrics
return None
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ # Get the longest text element (if any).
+ if strings := sorted(get_soup(html).stripped_strings, key=len):
+ return strings[-1]
+
+ return None
+
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 091776108..8afd80585 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -191,9 +191,9 @@ class TestSearchBackend:
],
)
def test_check_match(self, backend, target_artist, artist, should_match):
- assert (
- backend.check_match(target_artist, "", artist, "") == should_match
- )
+ result = lyrics.SearchResult(artist, "", "")
+
+ assert backend.check_match(target_artist, "", result) == should_match
@pytest.fixture(scope="module")
@@ -327,31 +327,32 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = backend.scrape_lyrics(lyrics_html).lower()
+ result = backend.scrape(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize(
- "url_title, artist, should_be_candidate",
+ "url_title, artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", True),
- ("example.com | Beats song by John doe", "John Doe", True),
- ("example.com | seets bong lyrics by John doe", "John Doe", False),
- ("foo", "Sun O)))", False),
+ ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
+ ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ (
+ "example.com | seets bong lyrics by John doe",
+ "John Doe",
+ "seets-bong",
+ ),
+ ("foo", "Sun O)))", "foo"),
],
)
- def test_is_page_candidate(
- self, backend, lyrics_html, url_title, artist, should_be_candidate
+ def test_make_search_result(
+ self, backend, url_title, artist, expected_title
):
- result = backend.is_page_candidate(
- artist,
- self.TITLE,
- "http://www.example.com/lyrics/beetssong",
- url_title,
+ result = backend.make_search_result(
+ artist, "https://example.com", url_title
)
- assert bool(result) == should_be_candidate
+ assert result.title == expected_title
@pytest.mark.parametrize(
"lyrics",
@@ -385,7 +386,7 @@ class TestGeniusLyrics(LyricsBackendTest):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend.scrape_lyrics(lyrics_html) or ""
+ result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -406,7 +407,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From c5c4138d6638e854a811eee6f98ab6afdba0b82f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:36:41 +0100
Subject: [PATCH 15/25] Google: Refactor and improve
* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
- Handle long URL titles that get truncated (end with ellipsis) for
long searches
- Remove domains starting with 'www'
- Parse the title AND the artist. Previously this would only parse the
title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
---
beetsplug/_typing.py | 31 ++++++-
beetsplug/lyrics.py | 156 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 84 ++++++++-----------
3 files changed, 141 insertions(+), 130 deletions(-)
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
index 93f4a2a58..915ea77e8 100644
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@@ -2,7 +2,7 @@ from __future__ import annotations
from typing import Any
-from typing_extensions import TypedDict
+from typing_extensions import NotRequired, TypedDict
JSONDict = dict[str, Any]
@@ -84,3 +84,32 @@ class GeniusAPI:
class Search(TypedDict):
response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+ class Response(TypedDict):
+ """Search response from the Google Custom Search API.
+
+ If the search returns no results, the :attr:`items` field is not found.
+ """
+
+ items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+ class Item(TypedDict):
+ """A Google Custom Search API result item.
+
+ :attr:`title` field is shown to the user in the search interface, thus
+ it gets truncated with an ellipsis for longer queries. For most
+ results, the full title is available as ``og:title`` metatag found
+ under the :attr:`pagemap` field. Note that neither this metatag nor
+ the ``pagemap`` field is guaranteed to be present in the data.
+ """
+
+ title: str
+ link: str
+ pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+ class Pagemap(TypedDict):
+ """Pagemap data with a single meta tags dict in a list."""
+
+ metatags: list[JSONDict]
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 9424a14b6..0982120f2 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,7 +28,7 @@ from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlencode
import requests
from unidecode import unidecode
@@ -41,7 +41,7 @@ if TYPE_CHECKING:
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, JSONDict, LRCLibAPI
+ from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -492,7 +492,9 @@ class SearchBackend(Backend):
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title):
- if lyrics := self.scrape(self.fetch_text(result.url)):
+ if (html := self.fetch_text(result.url)) and (
+ lyrics := self.scrape(html)
+ ):
return lyrics
return None
@@ -567,20 +569,6 @@ class Tekstowo(DirectBackend):
return None
-def remove_credits(text):
- """Remove first/last line of text if it contains the word 'lyrics'
- eg 'Lyrics by songsdatabase.com'
- """
- textlines = text.split("\n")
- credits = None
- for i in (0, -1):
- if textlines and "lyrics" in textlines[i].lower():
- credits = textlines.pop(i)
- if credits:
- text = "\n".join(textlines)
- return text
-
-
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@@ -617,87 +605,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- def is_lyrics(self, text, artist=None):
- """Determine whether the text seems to be valid lyrics."""
- if not text:
- return False
- bad_triggers_occ = []
- nb_lines = text.count("\n")
- if nb_lines <= 1:
- self.debug("Ignoring too short lyrics '{}'", text)
- return False
- elif nb_lines < 5:
- bad_triggers_occ.append("too_short")
- else:
- # Lyrics look legit, remove credits to avoid being penalized
- # further down
- text = remove_credits(text)
+ #: Exclude some letras.mus.br pages which do not contain lyrics.
+ EXCLUDE_PAGES = [
+ "significado.html",
+ "traduccion.html",
+ "traducao.html",
+ "significados.html",
+ ]
- bad_triggers = ["lyrics", "copyright", "property", "links"]
- if artist:
- bad_triggers += [artist]
+ #: Regular expression to match noise in the URL title.
+ URL_TITLE_NOISE_RE = re.compile(
+ r"""
+\b
+(
+ paroles(\ et\ traduction|\ de\ chanson)?
+ | letras?(\ de)?
+ | liedtexte
+ | original\ song\ full\ text\.
+ | official
+ | 20[12]\d\ version
+ | (absolute\ |az)?lyrics(\ complete)?
+ | www\S+
+ | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+ re.IGNORECASE | re.VERBOSE,
+ )
+ #: Split cleaned up URL title into artist and title parts.
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
- for item in bad_triggers:
- bad_triggers_occ += [item] * len(
- re.findall(r"\W%s\W" % item, text, re.I)
- )
+ def fetch_text(self, *args, **kwargs) -> str:
+ """Handle an error so that we can continue with the next URL."""
+ with self.handle_request():
+ return super().fetch_text(*args, **kwargs)
- if bad_triggers_occ:
- self.debug("Bad triggers detected: {}", bad_triggers_occ)
- return len(bad_triggers_occ) < 2
+ @staticmethod
+ def get_part_dist(artist: str, title: str, part: str) -> float:
+ """Return the distance between the given part and the artist and title.
- BY_TRANS = ["by", "par", "de", "von"]
- LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+ A number between -1 and 1 is returned, where -1 means the part is
+ closer to the artist and 1 means it is closer to the title.
+ """
+ return string_dist(artist, part) - string_dist(title, part)
+ @classmethod
def make_search_result(
- self, artist: str, url_link: str, url_title: str
+ cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult:
"""Parse artist and title from the URL title and return a search result."""
- url_title_slug = slug(url_title)
- artist = slug(artist)
- sitename = urlparse(url_link).netloc
-
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = (
- [by + "-" + artist for by in self.BY_TRANS]
- + [artist, sitename, sitename.replace("www.", "")]
- + self.LYRICS_TRANS
+ url_title = (
+ # get full title from metatags if available
+ item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+ # default to the display title
+ or item["title"]
)
- song_title = re.sub(
- "(%s)" % "|".join(tokens), "", url_title_slug
- ).strip("-")
+ clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+ # split it into parts which may be part of the artist or the title
+ # `dict.fromkeys` removes duplicates keeping the order
+ parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
- return SearchResult(artist, song_title, url_link)
+ if len(parts) == 1:
+ part = parts[0]
+ if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+ # artist and title may not have a separator
+ result_title = m[1]
+ result_artist = part.replace(m[0], "")
+ else:
+ # assume that this is the title
+ result_artist, result_title = "", parts[0]
+ else:
+ # sort parts by their similarity to the artist
+ parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+ result_artist, result_title = parts[0], " ".join(parts[1:])
+
+ return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
+ "siteSearch": "www.musixmatch.com",
+ "siteSearchFilter": "e",
+ "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
}
- data = self.fetch_json(self.SEARCH_URL, params=params)
+ data: GoogleCustomSearchAPI.Response = self.fetch_json(
+ self.SEARCH_URL, params=params
+ )
for item in data.get("items", []):
- yield self.make_search_result(artist, item["link"], item["title"])
-
- def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
- return super().get_results(artist, slug(title))
-
- def fetch(self, artist: str, title: str, *_) -> str | None:
- for result in self.get_results(artist, title):
- with self.handle_request():
- lyrics = self.scrape(self.fetch_text(result.url))
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self.debug(
- "Got lyrics from {}", urlparse(result.url).netloc
- )
- return lyrics
-
- return None
+ yield self.make_search_result(artist, title, item)
@classmethod
def scrape(cls, html: str) -> str | None:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 8afd80585..6986e4f06 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,24 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_lyrics, expected",
- [
- ("Verse\nLyrics credit in the last line", "Verse"),
- ("Lyrics credit in the first line\nVerse", "Verse"),
- (
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- ),
- ],
- )
- def test_remove_credits(self, initial_lyrics, expected):
- assert lyrics.remove_credits(initial_lyrics) == expected
-
@pytest.mark.parametrize(
"initial_text, expected",
[
@@ -311,8 +293,6 @@ class TestLyricsSources(LyricsBackendTest):
class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page."""
- TITLE = "Beets song"
-
@pytest.fixture(scope="class")
def backend_name(self):
return "google"
@@ -325,51 +305,55 @@ class TestGoogleLyrics(LyricsBackendTest):
def file_name(self):
return "examplecom/beetssong"
+ @pytest.fixture
+ def search_item(self, url_title, url):
+ return {"title": url_title, "link": url}
+
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower()
assert result
- assert backend.is_lyrics(result)
- assert PHRASE_BY_TITLE[self.TITLE] in result
+ assert PHRASE_BY_TITLE["Beets song"] in result
@pytest.mark.parametrize(
- "url_title, artist, expected_title",
+ "url_title, expected_artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
- ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ ("Artist - beets song Lyrics", "Artist", "beets song"),
+ ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+ ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+ ("foo", "", "foo"),
+ ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets ...", "Artist", "Beets"),
+ ("Artist Beets Song", "Artist", "Beets Song"),
+ ("BeetsSong - Artist", "Artist", "BeetsSong"),
+ ("Artist - BeetsSong", "Artist", "BeetsSong"),
+ ("Beets Song", "", "Beets Song"),
+ ("Beets Song Artist", "Artist", "Beets Song"),
(
- "example.com | seets bong lyrics by John doe",
- "John Doe",
- "seets-bong",
+ "BeetsSong (feat. Other & Another) - Artist",
+ "Artist",
+ "BeetsSong (feat. Other & Another)",
+ ),
+ (
+ (
+ "Beets song lyrics by Artist - original song full text. "
+ "Official Beets song lyrics, 2024 version | LyricsMode.com"
+ ),
+ "Artist",
+ "Beets song",
),
- ("foo", "Sun O)))", "foo"),
],
)
+ @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result(
- self, backend, url_title, artist, expected_title
+ self, backend, search_item, expected_artist, expected_title
):
- result = backend.make_search_result(
- artist, "https://example.com", url_title
- )
- assert result.title == expected_title
+ result = backend.make_search_result("Artist", "Beets song", search_item)
- @pytest.mark.parametrize(
- "lyrics",
- [
- "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
- """All material found on this site is property\n
- of mywickedsongtext brand""",
- """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
- ],
- )
- def test_bad_lyrics(self, backend, lyrics):
- assert not backend.is_lyrics(lyrics)
+ assert result.artist == expected_artist
+ assert result.title == expected_title
class TestGeniusLyrics(LyricsBackendTest):
From 70554640e579635a72b7292e541a1eb48645f712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH 16/25] Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacocinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
None, allowing `requests` to handle it.
---
beetsplug/lyrics.py | 105 ++++++++++++++++++++++-------------
test/plugins/lyrics_pages.py | 42 ++++++++++++++
test/plugins/test_lyrics.py | 41 +++++++-------
3 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 0982120f2..a19d8c616 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -231,10 +230,16 @@ class RequestHandler:
def fetch_text(
self, url: str, params: JSONDict | None = None, **kwargs
) -> str:
- """Return text / HTML data from the given URL."""
+ """Return text / HTML data from the given URL.
+
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
url = self.format_url(url, params)
self.debug("Fetching HTML from {}", url)
- return r_session.get(url, **kwargs).text
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
@@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
return lyrics
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+ expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+ merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"