Update Tekstowo backend to fetch lyrics directly

- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format:
  the text is lowercased and transliterated to ASCII, and non-word
  characters are replaced with underscores.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to parse the lyrics directly, dropping
  the artist/title distance checks.
This commit is contained in:
Šarūnas Nejus 2024-10-12 02:14:18 +01:00
parent 9d2b34d457
commit d3955bac65
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
5 changed files with 24 additions and 1253 deletions

View file

@ -14,6 +14,8 @@
"""Fetches, embeds, and displays lyrics.""" """Fetches, embeds, and displays lyrics."""
from __future__ import annotations
import difflib import difflib
import errno import errno
import itertools import itertools
@ -23,6 +25,7 @@ import re
import struct import struct
import unicodedata import unicodedata
import warnings import warnings
from functools import partial
from typing import ClassVar from typing import ClassVar
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
@ -47,7 +50,6 @@ except ImportError:
import beets import beets
from beets import plugins, ui from beets import plugins, ui
from beets.autotag.hooks import string_dist
DIV_RE = re.compile(r"<(/?)div>?", re.I) DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S) COMMENT_RE = re.compile(r"<!--.*-->", re.S)
@ -288,7 +290,7 @@ class DirectBackend(Backend):
@classmethod @classmethod
def encode(cls, text: str) -> str: def encode(cls, text: str) -> str:
"""Encode the string for inclusion in a URL.""" """Encode the string for inclusion in a URL."""
return quote(unidecode(text)) raise NotImplementedError
@classmethod @classmethod
def build_url(cls, *args: str) -> str: def build_url(cls, *args: str) -> str:
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
for old, new in cls.REPLACEMENTS.items(): for old, new in cls.REPLACEMENTS.items():
text = re.sub(old, new, text) text = re.sub(old, new, text)
return super().encode(text) return quote(unidecode(text))
def fetch(self, artist, title, album=None, length=None): def fetch(self, artist, title, album=None, length=None):
url = self.build_url(artist, title) url = self.build_url(artist, title)
@ -485,87 +487,31 @@ class Tekstowo(DirectBackend):
"""Fetch lyrics from Tekstowo.pl.""" """Fetch lyrics from Tekstowo.pl."""
REQUIRES_BS = True REQUIRES_BS = True
BASE_URL = "http://www.tekstowo.pl" URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
@classmethod
def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist, title, album=None, length=None): def fetch(self, artist, title, album=None, length=None):
url = self.build_url(title, artist) if html := self.fetch_url(self.build_url(artist, title)):
search_results = self.fetch_url(url) return self.extract_lyrics(html)
if not search_results:
return None return None
song_page_url = self.parse_search_results(search_results) def extract_lyrics(self, html: str) -> str | None:
if not song_page_url:
return None
song_page_html = self.fetch_url(song_page_url)
if not song_page_html:
return None
return self.extract_lyrics(song_page_html, artist, title)
def parse_search_results(self, html):
html = _scrape_strip_cruft(html) html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html) html = _scrape_merge_paragraphs(html)
soup = try_parse_html(html) soup = try_parse_html(html)
if not soup:
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
return None return None
content_div = soup.find("div", class_="content")
if not content_div:
return None
card_div = content_div.find("div", class_="card")
if not card_div:
return None
song_rows = card_div.find_all("div", class_="box-przeboje")
if not song_rows:
return None
song_row = song_rows[0]
if not song_row:
return None
link = song_row.find("a")
if not link:
return None
return self.BASE_URL + link.get("href")
def extract_lyrics(self, html, artist, title):
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
soup = try_parse_html(html)
if not soup:
return None
info_div = soup.find("div", class_="col-auto")
if not info_div:
return None
info_elements = info_div.find_all("a")
if not info_elements:
return None
html_title = info_elements[-1].get_text()
html_artist = info_elements[-2].get_text()
title_dist = string_dist(html_title, title)
artist_dist = string_dist(html_artist, artist)
thresh = self.config["dist_thresh"].get(float)
if title_dist > thresh or artist_dist > thresh:
return None
lyrics_div = soup.select("div.song-text > div.inner-text")
if not lyrics_div:
return None
return lyrics_div[0].get_text()
def remove_credits(text): def remove_credits(text):
"""Remove first/last line of text if it contains the word 'lyrics' """Remove first/last line of text if it contains the word 'lyrics'

View file

@ -44,6 +44,9 @@ Bug fixes:
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description. * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
* Remove single quotes from all SQL queries * Remove single quotes from all SQL queries
:bug:`4709` :bug:`4709`
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
from song pages, since recent updates to the Tekstowo website made it unsearchable.
:bug:`5456`
For packagers: For packagers:

View file

@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
"""Ensure we are able to scrape a page with lyrics""" """Ensure we are able to scrape a page with lyrics"""
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html" url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
mock = MockFetchUrl() mock = MockFetchUrl()
assert ( assert tekstowo.extract_lyrics(mock(url))
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
is not None
)
def test_no_lyrics(self): def test_no_lyrics(self):
"""Ensure we don't crash when scraping the html for a Tekstowo page """Ensure we don't crash when scraping the html for a Tekstowo page
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html" "beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
) )
mock = MockFetchUrl() mock = MockFetchUrl()
assert ( assert not tekstowo.extract_lyrics(mock(url))
tekstowo.extract_lyrics(
mock(url),
"Beethoven",
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
)
is None
)
def test_song_no_match(self):
"""Ensure we return None when a song does not match the search query"""
# https://github.com/beetbox/beets/issues/4406
# expected return value None
url = (
"https://www.tekstowo.pl/piosenka,bailey_bigger"
",black_eyed_susan.html"
)
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(
mock(url), "Kelly Bailey", "Black Mesa Inbound"
)
is None
)
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
"""tests Tekstowo.parse_search_results()"""
def setUp(self):
"""Set up configuration"""
TekstowoBaseTest.setUp(self)
self.plugin = lyrics.LyricsPlugin()
def test_multiple_results(self):
"""Ensure we are able to scrape a page with multiple search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
",tytul,lucid+dreams.html"
)
mock = MockFetchUrl()
assert (
tekstowo.parse_search_results(mock(url))
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
"lucid_dreams__remix__ft__lil_uzi_vert.html"
)
def test_no_results(self):
"""Ensure we are able to scrape a page with no search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,"
"agfdgja,tytul,agfdgafg.html"
)
mock = MockFetchUrl()
assert tekstowo.parse_search_results(mock(url)) is None
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions): class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long