mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Update Tekstowo backend to fetch lyrics directly
- Refactored Tekstowo backend to fetch lyrics directly from song pages.
- Added `encode` method to convert artist and title to their URL format, where non-alphanumeric characters are replaced with underscores.
- Removed the now-redundant search functionality and associated tests.
- Simplified `extract_lyrics` method to directly parse lyrics without any checks.
This commit is contained in:
parent
9d2b34d457
commit
d3955bac65
5 changed files with 24 additions and 1253 deletions
|
|
@ -14,6 +14,8 @@
|
||||||
|
|
||||||
"""Fetches, embeds, and displays lyrics."""
|
"""Fetches, embeds, and displays lyrics."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import difflib
|
import difflib
|
||||||
import errno
|
import errno
|
||||||
import itertools
|
import itertools
|
||||||
|
|
@ -23,6 +25,7 @@ import re
|
||||||
import struct
|
import struct
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import warnings
|
import warnings
|
||||||
|
from functools import partial
|
||||||
from typing import ClassVar
|
from typing import ClassVar
|
||||||
from urllib.parse import quote, urlencode
|
from urllib.parse import quote, urlencode
|
||||||
|
|
||||||
|
|
@ -47,7 +50,6 @@ except ImportError:
|
||||||
|
|
||||||
import beets
|
import beets
|
||||||
from beets import plugins, ui
|
from beets import plugins, ui
|
||||||
from beets.autotag.hooks import string_dist
|
|
||||||
|
|
||||||
DIV_RE = re.compile(r"<(/?)div>?", re.I)
|
DIV_RE = re.compile(r"<(/?)div>?", re.I)
|
||||||
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
|
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
|
||||||
|
|
@ -288,7 +290,7 @@ class DirectBackend(Backend):
|
||||||
@classmethod
|
@classmethod
|
||||||
def encode(cls, text: str) -> str:
|
def encode(cls, text: str) -> str:
|
||||||
"""Encode the string for inclusion in a URL."""
|
"""Encode the string for inclusion in a URL."""
|
||||||
return quote(unidecode(text))
|
raise NotImplementedError
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def build_url(cls, *args: str) -> str:
|
def build_url(cls, *args: str) -> str:
|
||||||
|
|
@ -312,7 +314,7 @@ class MusiXmatch(DirectBackend):
|
||||||
for old, new in cls.REPLACEMENTS.items():
|
for old, new in cls.REPLACEMENTS.items():
|
||||||
text = re.sub(old, new, text)
|
text = re.sub(old, new, text)
|
||||||
|
|
||||||
return super().encode(text)
|
return quote(unidecode(text))
|
||||||
|
|
||||||
def fetch(self, artist, title, album=None, length=None):
|
def fetch(self, artist, title, album=None, length=None):
|
||||||
url = self.build_url(artist, title)
|
url = self.build_url(artist, title)
|
||||||
|
|
@ -485,86 +487,30 @@ class Tekstowo(DirectBackend):
|
||||||
"""Fetch lyrics from Tekstowo.pl."""
|
"""Fetch lyrics from Tekstowo.pl."""
|
||||||
|
|
||||||
REQUIRES_BS = True
|
REQUIRES_BS = True
|
||||||
BASE_URL = "http://www.tekstowo.pl"
|
URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"
|
||||||
URL_TEMPLATE = BASE_URL + "/wyszukaj.html?search-title={}&search-artist={}"
|
|
||||||
|
non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def encode(cls, text: str) -> str:
|
||||||
|
return cls.non_alpha_to_underscore(unidecode(text.lower()))
|
||||||
|
|
||||||
def fetch(self, artist, title, album=None, length=None):
|
def fetch(self, artist, title, album=None, length=None):
|
||||||
url = self.build_url(title, artist)
|
if html := self.fetch_url(self.build_url(artist, title)):
|
||||||
search_results = self.fetch_url(url)
|
return self.extract_lyrics(html)
|
||||||
if not search_results:
|
|
||||||
return None
|
|
||||||
|
|
||||||
song_page_url = self.parse_search_results(search_results)
|
return None
|
||||||
if not song_page_url:
|
|
||||||
return None
|
|
||||||
|
|
||||||
song_page_html = self.fetch_url(song_page_url)
|
def extract_lyrics(self, html: str) -> str | None:
|
||||||
if not song_page_html:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return self.extract_lyrics(song_page_html, artist, title)
|
|
||||||
|
|
||||||
def parse_search_results(self, html):
|
|
||||||
html = _scrape_strip_cruft(html)
|
html = _scrape_strip_cruft(html)
|
||||||
html = _scrape_merge_paragraphs(html)
|
html = _scrape_merge_paragraphs(html)
|
||||||
|
|
||||||
soup = try_parse_html(html)
|
soup = try_parse_html(html)
|
||||||
if not soup:
|
|
||||||
return None
|
|
||||||
|
|
||||||
content_div = soup.find("div", class_="content")
|
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
||||||
if not content_div:
|
return lyrics_div.get_text()
|
||||||
return None
|
|
||||||
|
|
||||||
card_div = content_div.find("div", class_="card")
|
return None
|
||||||
if not card_div:
|
|
||||||
return None
|
|
||||||
|
|
||||||
song_rows = card_div.find_all("div", class_="box-przeboje")
|
|
||||||
if not song_rows:
|
|
||||||
return None
|
|
||||||
|
|
||||||
song_row = song_rows[0]
|
|
||||||
if not song_row:
|
|
||||||
return None
|
|
||||||
|
|
||||||
link = song_row.find("a")
|
|
||||||
if not link:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return self.BASE_URL + link.get("href")
|
|
||||||
|
|
||||||
def extract_lyrics(self, html, artist, title):
|
|
||||||
html = _scrape_strip_cruft(html)
|
|
||||||
html = _scrape_merge_paragraphs(html)
|
|
||||||
|
|
||||||
soup = try_parse_html(html)
|
|
||||||
if not soup:
|
|
||||||
return None
|
|
||||||
|
|
||||||
info_div = soup.find("div", class_="col-auto")
|
|
||||||
if not info_div:
|
|
||||||
return None
|
|
||||||
|
|
||||||
info_elements = info_div.find_all("a")
|
|
||||||
if not info_elements:
|
|
||||||
return None
|
|
||||||
|
|
||||||
html_title = info_elements[-1].get_text()
|
|
||||||
html_artist = info_elements[-2].get_text()
|
|
||||||
|
|
||||||
title_dist = string_dist(html_title, title)
|
|
||||||
artist_dist = string_dist(html_artist, artist)
|
|
||||||
|
|
||||||
thresh = self.config["dist_thresh"].get(float)
|
|
||||||
if title_dist > thresh or artist_dist > thresh:
|
|
||||||
return None
|
|
||||||
|
|
||||||
lyrics_div = soup.select("div.song-text > div.inner-text")
|
|
||||||
if not lyrics_div:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return lyrics_div[0].get_text()
|
|
||||||
|
|
||||||
|
|
||||||
def remove_credits(text):
|
def remove_credits(text):
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,9 @@ Bug fixes:
|
||||||
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
|
* :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
|
||||||
* Remove single quotes from all SQL queries
|
* Remove single quotes from all SQL queries
|
||||||
:bug:`4709`
|
:bug:`4709`
|
||||||
|
* :doc:`plugins/lyrics`: Update ``tekstowo`` backend to fetch lyrics directly
|
||||||
|
since recent updates to their website made it unsearchable.
|
||||||
|
:bug:`5456`
|
||||||
|
|
||||||
For packagers:
|
For packagers:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -564,10 +564,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
|
||||||
"""Ensure we are able to scrape a page with lyrics"""
|
"""Ensure we are able to scrape a page with lyrics"""
|
||||||
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
|
url = "https://www.tekstowo.pl/piosenka,24kgoldn,city_of_angels_1.html"
|
||||||
mock = MockFetchUrl()
|
mock = MockFetchUrl()
|
||||||
assert (
|
assert tekstowo.extract_lyrics(mock(url))
|
||||||
tekstowo.extract_lyrics(mock(url), "24kGoldn", "City of Angels")
|
|
||||||
is not None
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_no_lyrics(self):
|
def test_no_lyrics(self):
|
||||||
"""Ensure we don't crash when scraping the html for a Tekstowo page
|
"""Ensure we don't crash when scraping the html for a Tekstowo page
|
||||||
|
|
@ -578,61 +575,7 @@ class TekstowoExtractLyricsTest(TekstowoBaseTest):
|
||||||
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
|
"beethoven_piano_sonata_17_tempest_the_3rd_movement.html"
|
||||||
)
|
)
|
||||||
mock = MockFetchUrl()
|
mock = MockFetchUrl()
|
||||||
assert (
|
assert not tekstowo.extract_lyrics(mock(url))
|
||||||
tekstowo.extract_lyrics(
|
|
||||||
mock(url),
|
|
||||||
"Beethoven",
|
|
||||||
"Beethoven Piano Sonata 17" "Tempest The 3rd Movement",
|
|
||||||
)
|
|
||||||
is None
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_song_no_match(self):
|
|
||||||
"""Ensure we return None when a song does not match the search query"""
|
|
||||||
# https://github.com/beetbox/beets/issues/4406
|
|
||||||
# expected return value None
|
|
||||||
url = (
|
|
||||||
"https://www.tekstowo.pl/piosenka,bailey_bigger"
|
|
||||||
",black_eyed_susan.html"
|
|
||||||
)
|
|
||||||
mock = MockFetchUrl()
|
|
||||||
assert (
|
|
||||||
tekstowo.extract_lyrics(
|
|
||||||
mock(url), "Kelly Bailey", "Black Mesa Inbound"
|
|
||||||
)
|
|
||||||
is None
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TekstowoParseSearchResultsTest(TekstowoBaseTest):
|
|
||||||
"""tests Tekstowo.parse_search_results()"""
|
|
||||||
|
|
||||||
def setUp(self):
|
|
||||||
"""Set up configuration"""
|
|
||||||
TekstowoBaseTest.setUp(self)
|
|
||||||
self.plugin = lyrics.LyricsPlugin()
|
|
||||||
|
|
||||||
def test_multiple_results(self):
|
|
||||||
"""Ensure we are able to scrape a page with multiple search results"""
|
|
||||||
url = (
|
|
||||||
"https://www.tekstowo.pl/szukaj,wykonawca,juice+wrld"
|
|
||||||
",tytul,lucid+dreams.html"
|
|
||||||
)
|
|
||||||
mock = MockFetchUrl()
|
|
||||||
assert (
|
|
||||||
tekstowo.parse_search_results(mock(url))
|
|
||||||
== "http://www.tekstowo.pl/piosenka,juice_wrld,"
|
|
||||||
"lucid_dreams__remix__ft__lil_uzi_vert.html"
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_no_results(self):
|
|
||||||
"""Ensure we are able to scrape a page with no search results"""
|
|
||||||
url = (
|
|
||||||
"https://www.tekstowo.pl/szukaj,wykonawca,"
|
|
||||||
"agfdgja,tytul,agfdgafg.html"
|
|
||||||
)
|
|
||||||
mock = MockFetchUrl()
|
|
||||||
assert tekstowo.parse_search_results(mock(url)) is None
|
|
||||||
|
|
||||||
|
|
||||||
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
|
class TekstowoIntegrationTest(TekstowoBaseTest, LyricsAssertions):
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue