Update Tekstowo backend to fetch lyrics directly

- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format: the text is lowercased and non-alphanumeric characters are replaced with underscores.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to parse the lyrics directly, without the previous artist/title match checks.
2025-12-06 16:42:42 +01:00 · 2024-10-12 02:14:18 +01:00 · 2024-10-12 02:14:18 +01:00 · d3955bac65
commit d3955bac65
parent 9d2b34d457
5 changed files with 24 additions and 1253 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -14,6 +14,8 @@
 """Fetches, embeds, and displays lyrics."""
 from __future__ import annotations
 import difflib
 import errno
 import itertools
@ -23,6 +25,7 @@ import re
 import struct
 import unicodedata
 import warnings
 from functools import partial
 from typing import ClassVar
 from urllib.parse import quote, urlencode
@ -47,7 +50,6 @@ except ImportError:
 import beets
 from beets import plugins, ui
 from beets.autotag.hooks import string_dist
 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
@ -288,7 +290,7 @@ class DirectBackend(Backend):
    @classmethod
    def encode(cls, text: str) -> str:
        """Encode the string for inclusion in a URL."""
-        return quote(unidecode(text))
+        raise NotImplementedError
    @classmethod
    def build_url(cls, *args: str) -> str:
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
        for old, new in cls.REPLACEMENTS.items():
            text = re.sub(old, new, text)
-        return super().encode(text)
+        return quote(unidecode(text))
    def fetch(self, artist, title, album=None, length=None):
        url = self.build_url(artist, title)
@ -485,87 +487,31 @@ class Tekstowo(DirectBackend):
    """Fetch lyrics from Tekstowo.pl."""
    REQUIRES_BS = True
-    BASE_URL = "http://www.tekstowo.pl"
+    URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
-    URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
+
    non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
    @classmethod
    def encode(cls, text: str) -> str:
        return cls.non_alpha_to_underscore(unidecode(text.lower()))
    def fetch(self, artist, title, album=None, length=None):
-        url = self.build_url(title, artist)
+        if html := self.fetch_url(self.build_url(artist, title)):
-        search_results = self.fetch_url(url)
+            return self.extract_lyrics(html)
-        if not search_results:
+
        return None
-        song_page_url = self.parse_search_results(search_results)
+    def extract_lyrics(self, html: str) -> str | None:
        if not song_page_url:
            return None
        song_page_html = self.fetch_url(song_page_url)
        if not song_page_html:
            return None
        return self.extract_lyrics(song_page_html, artist, title)
    def parse_search_results(self, html):
        html = _scrape_strip_cruft(html)
        html = _scrape_merge_paragraphs(html)
        soup = try_parse_html(html)
-        if not soup:
+
        if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
            return lyrics_div.get_text()
        return None
        content_div = soup.find("div", class_="content")
        if not content_div:
            return None
        card_div = content_div.find("div", class_="card")
        if not card_div:
            return None
        song_rows = card_div.find_all("div", class_="box-przeboje")
        if not song_rows:
            return None
        song_row = song_rows[0]
        if not song_row:
            return None
        link = song_row.find("a")
        if not link:
            return None
        return self.BASE_URL + link.get("href")
    def extract_lyrics(self, html, artist, title):
        html = _scrape_strip_cruft(html)
        html = _scrape_merge_paragraphs(html)
        soup = try_parse_html(html)
        if not soup:
            return None
        info_div = soup.find("div", class_="col-auto")
        if not info_div:
            return None
        info_elements = info_div.find_all("a")
        if not info_elements:
            return None
        html_title = info_elements[-1].get_text()
        html_artist = info_elements[-2].get_text()
        title_dist = string_dist(html_title, title)
        artist_dist = string_dist(html_artist, artist)
        thresh = self.config["dist_thresh"].get(float)
        if title_dist > thresh or artist_dist > thresh:
            return None
        lyrics_div = soup.select("div.song-text > div.inner-text")
        if not lyrics_div:
            return None
        return lyrics_div[0].get_text()
 def remove_credits(text):
    """Remove first/last line of text if it contains the word 'lyrics'
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -44,6 +44,9 @@ Bug fixes:
 * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
 * Remove single quotes from all SQL queries
  :bug:`4709`
 * :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
  since recent updates to their website made it unsearchable.
  :bug:`5456`
 For packagers:
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
        """Ensure we are able to scrape a page with lyrics"""
        url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
        mock = MockFetchUrl()
-        assert (
+        assert tekstowo.extract_lyrics(mock(url))
            tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
            is not None
        )
    def test_no_lyrics(self):
        """Ensure we don't crash when the scraping the html for a Tekstowo page
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
            "beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
        )
        mock = MockFetchUrl()
-        assert (
+        assert not tekstowo.extract_lyrics(mock(url))
            tekstowo.extract_lyrics(
                mock(url),
                "Beethoven",
                "Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
            )
            is None
        )
    def test_song_no_match(self):
        """Ensure we return None when a song does not match the search query"""
        # https://github.com/beetbox/beets/issues/4406
        # expected return value None
        url = (
            "https://www.tekstowo.pl/piosenka,bailey_bigger"
            ",black_eyed_susan.html"
        )
        mock = MockFetchUrl()
        assert (
            tekstowo.extract_lyrics(
                mock(url), "Kelly Bailey", "Black Mesa Inbound"
            )
            is None
        )
 class TekstowoParseSearchResultsTest(TekstowoBaseTest):
    """tests Tekstowo.parse_search_results()"""
    def setUp(self):
        """Set up configuration"""
        TekstowoBaseTest.setUp(self)
        self.plugin = lyrics.LyricsPlugin()
    def test_multiple_results(self):
        """Ensure we are able to scrape a page with multiple search results"""
        url = (
            "https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
            ",tytul,lucid+dreams.html"
        )
        mock = MockFetchUrl()
        assert (
            tekstowo.parse_search_results(mock(url))
            == "http://www.tekstowo.pl/piosenka,juice_wrld,"
            "lucid_dreams__remix__ft__lil_uzi_vert.html"
        )
    def test_no_results(self):
        """Ensure we are able to scrape a page with no search results"""
        url = (
            "https://www.tekstowo.pl/szukaj,wykonawca,"
            "agfdgja,tytul,agfdgafg.html"
        )
        mock = MockFetchUrl()
        assert tekstowo.parse_search_results(mock(url)) is None
 class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
--- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt
+++ b/test/rsrc/lyrics/tekstowopl/szukajwykonawcaagfdgjatytulagfdgafg.txt
--- a/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt
+++ b/test/rsrc/lyrics/tekstowopl/szukajwykonawcajuicewrldtytulluciddreams.txt