Update Tekstowo backend to fetch lyrics directly

- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format, where non-alphanumeric characters are replaced with underscores.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to directly parse lyrics without any checks.
2025-12-06 08:39:17 +01:00 · 2024-10-12 02:14:18 +01:00 · 2024-10-12 02:14:18 +01:00 · d3955bac65
commit d3955bac65
parent 9d2b34d457
5 changed files with 24 additions and 1253 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -14,6 +14,8 @@

 """Fetches, embeds, and displays lyrics."""

+from __future__ import annotations
+
 import difflib
 import errno
 import itertools
@ -23,6 +25,7 @@ import re
 import struct
 import unicodedata
 import warnings
+from functools import partial
 from typing import ClassVar
 from urllib.parse import quote, urlencode

@ -47,7 +50,6 @@ except ImportError:

 import beets
 from beets import plugins, ui
-from beets.autotag.hooks import string_dist

 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
@ -288,7 +290,7 @@ class DirectBackend(Backend):
    @classmethod
    def encode(cls, text: str) -> str:
        """Encode the string for inclusion in a URL."""
-        return quote(unidecode(text))
+        raise NotImplementedError

    @classmethod
    def build_url(cls, *args: str) -> str:
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
        for old, new in cls.REPLACEMENTS.items():
            text = re.sub(old, new, text)

-        return super().encode(text)
+        return quote(unidecode(text))

    def fetch(self, artist, title, album=None, length=None):
        url = self.build_url(artist, title)
@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
    """Fetch lyrics from Tekstowo.pl."""

    REQUIRES_BS = True
-    BASE_URL = "http://www.tekstowo.pl"
-    URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
+    URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
+
+    non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
+
+    @classmethod
+    def encode(cls, text: str) -> str:
+        return cls.non_alpha_to_underscore(unidecode(text.lower()))

    def fetch(self, artist, title, album=None, length=None):
-        url = self.build_url(title, artist)
-        search_results = self.fetch_url(url)
-        if not search_results:
-            return None
+        if html := self.fetch_url(self.build_url(artist, title)):
+            return self.extract_lyrics(html)

-        song_page_url = self.parse_search_results(search_results)
-        if not song_page_url:
-            return None
+        return None

-        song_page_html = self.fetch_url(song_page_url)
-        if not song_page_html:
-            return None
-
-        return self.extract_lyrics(song_page_html, artist, title)
-
-    def parse_search_results(self, html):
+    def extract_lyrics(self, html: str) -> str | None:
        html = _scrape_strip_cruft(html)
        html = _scrape_merge_paragraphs(html)

        soup = try_parse_html(html)
-        if not soup:
-            return None

-        content_div = soup.find("div", class_="content")
-        if not content_div:
-            return None
+        if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
+            return lyrics_div.get_text()

-        card_div = content_div.find("div", class_="card")
-        if not card_div:
-            return None
-
-        song_rows = card_div.find_all("div", class_="box-przeboje")
-        if not song_rows:
-            return None
-
-        song_row = song_rows[0]
-        if not song_row:
-            return None
-
-        link = song_row.find("a")
-        if not link:
-            return None
-
-        return self.BASE_URL + link.get("href")
-
-    def extract_lyrics(self, html, artist, title):
-        html = _scrape_strip_cruft(html)
-        html = _scrape_merge_paragraphs(html)
-
-        soup = try_parse_html(html)
-        if not soup:
-            return None
-
-        info_div = soup.find("div", class_="col-auto")
-        if not info_div:
-            return None
-
-        info_elements = info_div.find_all("a")
-        if not info_elements:
-            return None
-
-        html_title = info_elements[-1].get_text()
-        html_artist = info_elements[-2].get_text()
-
-        title_dist = string_dist(html_title, title)
-        artist_dist = string_dist(html_artist, artist)
-
-        thresh = self.config["dist_thresh"].get(float)
-        if title_dist > thresh or artist_dist > thresh:
-            return None
-
-        lyrics_div = soup.select("div.song-text > div.inner-text")
-        if not lyrics_div:
-            return None
-
-        return lyrics_div[0].get_text()
+        return None


 def remove_credits(text):
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -44,6 +44,9 @@ Bug fixes:
 * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
 * Remove single quotes from all SQL queries
  :bug:`4709`
+* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
+  since recent updates to their website made it unsearchable.
+  :bug:`5456`

 For packagers:

--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
        """Ensure we are able to scrape a page with lyrics"""
        url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
        mock = MockFetchUrl()
-        assert (
-            tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
-            is not None
-        )
+        assert tekstowo.extract_lyrics(mock(url))

    def test_no_lyrics(self):
        """Ensure we don't crash when the scraping the html for a Tekstowo page
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
            "beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
        )
        mock = MockFetchUrl()
-        assert (
-            tekstowo.extract_lyrics(
-                mock(url),
-                "Beethoven",
-                "Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
-            )
-            is None
-        )
-
-    def test_song_no_match(self):
-        """Ensure we return None when a song does not match the search query"""
-        # https://github.com/beetbox/beets/issues/4406
-        # expected return value None
-        url = (
-            "https://www.tekstowo.pl/piosenka,bailey_bigger"
-            ",black_eyed_susan.html"
-        )
-        mock = MockFetchUrl()
-        assert (
-            tekstowo.extract_lyrics(
-                mock(url), "Kelly Bailey", "Black Mesa Inbound"
-            )
-            is None
-        )
-
-
-class TekstowoParseSearchResultsTest(TekstowoBaseTest):
-    """tests Tekstowo.parse_search_results()"""
-
-    def setUp(self):
-        """Set up configuration"""
-        TekstowoBaseTest.setUp(self)
-        self.plugin = lyrics.LyricsPlugin()
-
-    def test_multiple_results(self):
-        """Ensure we are able to scrape a page with multiple search results"""
-        url = (
-            "https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
-            ",tytul,lucid+dreams.html"
-        )
-        mock = MockFetchUrl()
-        assert (
-            tekstowo.parse_search_results(mock(url))
-            == "http://www.tekstowo.pl/piosenka,juice_wrld,"
-            "lucid_dreams__remix__ft__lil_uzi_vert.html"
-        )
-
-    def test_no_results(self):
-        """Ensure we are able to scrape a page with no search results"""
-        url = (
-            "https://www.tekstowo.pl/szukaj,wykonawca,"
-            "agfdgja,tytul,agfdgafg.html"
-        )
-        mock = MockFetchUrl()
-        assert tekstowo.parse_search_results(mock(url)) is None
+        assert not tekstowo.extract_lyrics(mock(url))


 class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
--- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt
+++ b/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt
--- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt
+++ b/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt