Update Tekstowo backend to fetch lyrics directly

- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format:
  the text is lowercased and transliterated to ASCII, then every non-word
  character is replaced with an underscore.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to parse the lyrics directly, without
  comparing the page's artist and title against the requested ones.
This commit is contained in:
Šarūnas Nejus 2024-10-12 02:14:18 +01:00
parent 9d2b34d457
commit d3955bac65
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
5 changed files with 24 additions and 1253 deletions

View file

@ -14,6 +14,8 @@
"""Fetches, embeds, and displays lyrics."""
from __future__ import annotations
import difflib
import errno
import itertools
@ -23,6 +25,7 @@ import re
import struct
import unicodedata
import warnings
from functools import partial
from typing import ClassVar
from urllib.parse import quote, urlencode
@ -47,7 +50,6 @@ except ImportError:
import beets
from beets import plugins, ui
from beets.autotag.hooks import string_dist
DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
@ -288,7 +290,7 @@ class DirectBackend(Backend):
@classmethod
def encode(cls, text: str) -> str:
"""Encode the string for inclusion in a URL."""
return quote(unidecode(text))
raise NotImplementedError
@classmethod
def build_url(cls, *args: str) -> str:
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
for old, new in cls.REPLACEMENTS.items():
text = re.sub(old, new, text)
return super().encode(text)
return quote(unidecode(text))
def fetch(self, artist, title, album=None, length=None):
url = self.build_url(artist, title)
@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
"""Fetch lyrics from Tekstowo.pl."""
REQUIRES_BS = True
BASE_URL = "http://www.tekstowo.pl"
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
@classmethod
def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist, title, album=None, length=None):
url = self.build_url(title, artist)
search_results = self.fetch_url(url)
if not search_results:
return None
if html := self.fetch_url(self.build_url(artist, title)):
return self.extract_lyrics(html)
song_page_url = self.parse_search_results(search_results)
if not song_page_url:
return None
return None
song_page_html = self.fetch_url(song_page_url)
if not song_page_html:
return None
return self.extract_lyrics(song_page_html, artist, title)
def parse_search_results(self, html):
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
soup = try_parse_html(html)
if not soup:
return None
content_div = soup.find("div", class_="content")
if not content_div:
return None
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
card_div = content_div.find("div", class_="card")
if not card_div:
return None
song_rows = card_div.find_all("div", class_="box-przeboje")
if not song_rows:
return None
song_row = song_rows[0]
if not song_row:
return None
link = song_row.find("a")
if not link:
return None
return self.BASE_URL + link.get("href")
def extract_lyrics(self, html, artist, title):
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
soup = try_parse_html(html)
if not soup:
return None
info_div = soup.find("div", class_="col-auto")
if not info_div:
return None
info_elements = info_div.find_all("a")
if not info_elements:
return None
html_title = info_elements[-1].get_text()
html_artist = info_elements[-2].get_text()
title_dist = string_dist(html_title, title)
artist_dist = string_dist(html_artist, artist)
thresh = self.config["dist_thresh"].get(float)
if title_dist > thresh or artist_dist > thresh:
return None
lyrics_div = soup.select("div.song-text > div.inner-text")
if not lyrics_div:
return None
return lyrics_div[0].get_text()
return None
def remove_credits(text):

View file

@ -44,6 +44,9 @@ Bug fixes:
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
* Remove single quotes from all SQL queries
:bug:`4709`
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
since recent updates to their website made it unsearchable.
:bug:`5456`
For packagers:

View file

@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
"""Ensure we are able to scrape a page with lyrics"""
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
is not None
)
assert tekstowo.extract_lyrics(mock(url))
def test_no_lyrics(self):
"""Ensure we don't crash when scraping the HTML for a Tekstowo page
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
)
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(
mock(url),
"Beethoven",
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
)
is None
)
def test_song_no_match(self):
"""Ensure we return None when a song does not match the search query"""
# https://github.com/beetbox/beets/issues/4406
# expected return value None
url = (
"https://www.tekstowo.pl/piosenka,bailey_bigger"
",black_eyed_susan.html"
)
mock = MockFetchUrl()
assert (
tekstowo.extract_lyrics(
mock(url), "Kelly Bailey", "Black Mesa Inbound"
)
is None
)
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
"""tests Tekstowo.parse_search_results()"""
def setUp(self):
"""Set up configuration"""
TekstowoBaseTest.setUp(self)
self.plugin = lyrics.LyricsPlugin()
def test_multiple_results(self):
"""Ensure we are able to scrape a page with multiple search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
",tytul,lucid+dreams.html"
)
mock = MockFetchUrl()
assert (
tekstowo.parse_search_results(mock(url))
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
"lucid_dreams__remix__ft__lil_uzi_vert.html"
)
def test_no_results(self):
"""Ensure we are able to scrape a page with no search results"""
url = (
"https://www.tekstowo.pl/szukaj,wykonawca,"
"agfdgja,tytul,agfdgafg.html"
)
mock = MockFetchUrl()
assert tekstowo.parse_search_results(mock(url)) is None
assert not tekstowo.extract_lyrics(mock(url))
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long