From dd9f178fffd1cd130bde56fdbaae853aab4ca0c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sat, 19 Oct 2024 03:30:41 +0100 Subject: [PATCH] Do not try to strip cruft from the parsed lyrics text. Having removed it I found that only the Genius lyrics changed: it had an extra new line. Thus I defined a function 'collapse_newlines' which now gets called for the Genius lyrics. --- beetsplug/lyrics.py | 24 +++++++++++------------- test/plugins/test_lyrics.py | 5 ++--- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 3d0e09673..2ec362356 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -59,9 +59,6 @@ except ImportError: JSONDict = dict[str, Any] -DIV_RE = re.compile(r"<(/?)div>?", re.I) -COMMENT_RE = re.compile(r"", re.S) -TAG_RE = re.compile(r"<[^>]*>") BREAK_RE = re.compile(r"\n?\s*]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -552,8 +549,11 @@ class Genius(SearchBackend): check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: result = hit["result"] - if check(result["primary_artist"]["name"], result["title"]): - return self.scrape_lyrics(self.fetch_text(hit["result"]["url"])) + url = hit["result"]["url"] + if check(result["primary_artist"]["name"], result["title"]) and ( + lyrics := self.scrape_lyrics(self.fetch_text(url)) + ): + return collapse_newlines(lyrics) return None @@ -670,7 +670,10 @@ def remove_credits(text): return text -def _scrape_strip_cruft(html, plain_text_out=False): +collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n") + + +def _scrape_strip_cruft(html: str) -> str: """Clean up HTML""" html = unescape(html) @@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False): html = re.sub("