mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Update Tekstowo backend to fetch lyrics directly
- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format, where non-alphanumeric characters are replaced with underscores.
- Removed the now redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to directly parse lyrics without any checks.
This commit is contained in:
parent
9d2b34d457
commit
d3955bac65
5 changed files with 24 additions and 1253 deletions
|
|
@ -14,6 +14,8 @@
|
|||
|
||||
"""Fetches, embeds, and displays lyrics."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import difflib
|
||||
import errno
|
||||
import itertools
|
||||
|
|
@ -23,6 +25,7 @@ import re
|
|||
import struct
|
||||
import unicodedata
|
||||
import warnings
|
||||
from functools import partial
|
||||
from typing import ClassVar
|
||||
from urllib.parse import quote, urlencode
|
||||
|
||||
|
|
@ -47,7 +50,6 @@ except ImportError:
|
|||
|
||||
import beets
|
||||
from beets import plugins, ui
|
||||
from beets.autotag.hooks import string_dist
|
||||
|
||||
DIV_RE = re.compile(r"<(/?)div>?", re.I)
|
||||
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
|
||||
|
|
@ -288,7 +290,7 @@ class DirectBackend(Backend):
|
|||
@classmethod
|
||||
def encode(cls, text: str) -> str:
|
||||
"""Encode the string for inclusion in a URL."""
|
||||
return quote(unidecode(text))
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
|
||||
def build_url(cls, *args: str) -> str:
|
||||
|
|
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
|
|||
for old, new in cls.REPLACEMENTS.items():
|
||||
text = re.sub(old, new, text)
|
||||
|
||||
return super().encode(text)
|
||||
return quote(unidecode(text))
|
||||
|
||||
def fetch(self, artist, title, album=None, length=None):
|
||||
url = self.build_url(artist, title)
|
||||
|
|
@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
|
|||
"""Fetch lyrics from Tekstowo.pl."""
|
||||
|
||||
REQUIRES_BS = True
|
||||
BASE_URL = "http://www.tekstowo.pl"
|
||||
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
|
||||
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
|
||||
|
||||
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
|
||||
|
||||
@classmethod
|
||||
def encode(cls, text: str) -> str:
|
||||
return cls.non_alpha_to_underscore(unidecode(text.lower()))
|
||||
|
||||
def fetch(self, artist, title, album=None, length=None):
|
||||
url = self.build_url(title, artist)
|
||||
search_results = self.fetch_url(url)
|
||||
if not search_results:
|
||||
return None
|
||||
if html := self.fetch_url(self.build_url(artist, title)):
|
||||
return self.extract_lyrics(html)
|
||||
|
||||
song_page_url = self.parse_search_results(search_results)
|
||||
if not song_page_url:
|
||||
return None
|
||||
return None
|
||||
|
||||
song_page_html = self.fetch_url(song_page_url)
|
||||
if not song_page_html:
|
||||
return None
|
||||
|
||||
return self.extract_lyrics(song_page_html, artist, title)
|
||||
|
||||
def parse_search_results(self, html):
|
||||
def extract_lyrics(self, html: str) -> str | None:
|
||||
html = _scrape_strip_cruft(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
soup = try_parse_html(html)
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
content_div = soup.find("div", class_="content")
|
||||
if not content_div:
|
||||
return None
|
||||
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
||||
return lyrics_div.get_text()
|
||||
|
||||
card_div = content_div.find("div", class_="card")
|
||||
if not card_div:
|
||||
return None
|
||||
|
||||
song_rows = card_div.find_all("div", class_="box-przeboje")
|
||||
if not song_rows:
|
||||
return None
|
||||
|
||||
song_row = song_rows[0]
|
||||
if not song_row:
|
||||
return None
|
||||
|
||||
link = song_row.find("a")
|
||||
if not link:
|
||||
return None
|
||||
|
||||
return self.BASE_URL + link.get("href")
|
||||
|
||||
def extract_lyrics(self, html, artist, title):
|
||||
html = _scrape_strip_cruft(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
soup = try_parse_html(html)
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
info_div = soup.find("div", class_="col-auto")
|
||||
if not info_div:
|
||||
return None
|
||||
|
||||
info_elements = info_div.find_all("a")
|
||||
if not info_elements:
|
||||
return None
|
||||
|
||||
html_title = info_elements[-1].get_text()
|
||||
html_artist = info_elements[-2].get_text()
|
||||
|
||||
title_dist = string_dist(html_title, title)
|
||||
artist_dist = string_dist(html_artist, artist)
|
||||
|
||||
thresh = self.config["dist_thresh"].get(float)
|
||||
if title_dist > thresh or artist_dist > thresh:
|
||||
return None
|
||||
|
||||
lyrics_div = soup.select("div.song-text > div.inner-text")
|
||||
if not lyrics_div:
|
||||
return None
|
||||
|
||||
return lyrics_div[0].get_text()
|
||||
return None
|
||||
|
||||
|
||||
def remove_credits(text):
|
||||
|
|
|
|||
|
|
@ -44,6 +44,9 @@ Bug fixes:
|
|||
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
|
||||
* Remove single quotes from all SQL queries
|
||||
:bug:`4709`
|
||||
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
|
||||
since recent updates to their website made it unsearchable.
|
||||
:bug:`5456`
|
||||
|
||||
For packagers:
|
||||
|
||||
|
|
|
|||
|
|
@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
|
|||
"""Ensure we are able to scrape a page with lyrics"""
|
||||
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
|
||||
mock = MockFetchUrl()
|
||||
assert (
|
||||
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
|
||||
is not None
|
||||
)
|
||||
assert tekstowo.extract_lyrics(mock(url))
|
||||
|
||||
def test_no_lyrics(self):
|
||||
"""Ensure we don't crash when the scraping the html for a Tekstowo page
|
||||
|
|
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
|
|||
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
|
||||
)
|
||||
mock = MockFetchUrl()
|
||||
assert (
|
||||
tekstowo.extract_lyrics(
|
||||
mock(url),
|
||||
"Beethoven",
|
||||
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
|
||||
)
|
||||
is None
|
||||
)
|
||||
|
||||
def test_song_no_match(self):
|
||||
"""Ensure we return None when a song does not match the search query"""
|
||||
# https://github.com/beetbox/beets/issues/4406
|
||||
# expected return value None
|
||||
url = (
|
||||
"https://www.tekstowo.pl/piosenka,bailey_bigger"
|
||||
",black_eyed_susan.html"
|
||||
)
|
||||
mock = MockFetchUrl()
|
||||
assert (
|
||||
tekstowo.extract_lyrics(
|
||||
mock(url), "Kelly Bailey", "Black Mesa Inbound"
|
||||
)
|
||||
is None
|
||||
)
|
||||
|
||||
|
||||
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
|
||||
"""tests Tekstowo.parse_search_results()"""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
TekstowoBaseTest.setUp(self)
|
||||
self.plugin = lyrics.LyricsPlugin()
|
||||
|
||||
def test_multiple_results(self):
|
||||
"""Ensure we are able to scrape a page with multiple search results"""
|
||||
url = (
|
||||
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
|
||||
",tytul,lucid+dreams.html"
|
||||
)
|
||||
mock = MockFetchUrl()
|
||||
assert (
|
||||
tekstowo.parse_search_results(mock(url))
|
||||
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
|
||||
"lucid_dreams__remix__ft__lil_uzi_vert.html"
|
||||
)
|
||||
|
||||
def test_no_results(self):
|
||||
"""Ensure we are able to scrape a page with no search results"""
|
||||
url = (
|
||||
"https://www.tekstowo.pl/szukaj,wykonawca,"
|
||||
"agfdgja,tytul,agfdgafg.html"
|
||||
)
|
||||
mock = MockFetchUrl()
|
||||
assert tekstowo.parse_search_results(mock(url)) is None
|
||||
assert not tekstowo.extract_lyrics(mock(url))
|
||||
|
||||
|
||||
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue