Do not try to strip cruft from the parsed lyrics text.

Having removed it I fuond that only the Genius lyrics changed: it had en extra new line. Thus I defined a function 'collapse_newlines' which now gets called for the Genius lyrics.
2025-12-06 08:39:17 +01:00 · 2024-10-19 03:30:41 +01:00 · 2024-10-19 03:30:41 +01:00 · dd9f178fff
commit dd9f178fff
parent 7c2fb31136
2 changed files with 13 additions and 16 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -59,9 +59,6 @@ except ImportError:
 JSONDict = dict[str, Any]
 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
 TAG_RE = re.compile(r"<[^>]*>")
 BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
 USER_AGENT = f"beets/{beets.__version__}"
 INSTRUMENTAL_LYRICS = "[Instrumental]"
@ -552,8 +549,11 @@ class Genius(SearchBackend):
        check = partial(self.check_match, artist, title)
        for hit in json["response"]["hits"]:
            result = hit["result"]
-            if check(result["primary_artist"]["name"], result["title"]):
+            url = hit["result"]["url"]
-                return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
+            if check(result["primary_artist"]["name"], result["title"]) and (
                lyrics := self.scrape_lyrics(self.fetch_text(url))
            ):
                return collapse_newlines(lyrics)
        return None
@ -670,7 +670,10 @@ def remove_credits(text):
    return text
-def _scrape_strip_cruft(html, plain_text_out=False):
+collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
 def _scrape_strip_cruft(html: str) -> str:
    """Clean up HTML"""
    html = unescape(html)
@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
    html = re.sub("<aside .+?</aside>", "", html)  # remove Google Ads tags
    html = re.sub(r"</?(em|strong)[^>]*>", "", html)  # remove italics / bold
    if plain_text_out:  # Strip remaining HTML tags
        html = COMMENT_RE.sub("", html)
        html = TAG_RE.sub("", html)
    html = "\n".join([x.strip() for x in html.strip().split("\n")])
-    html = re.sub(r"\n{3,}", r"\n\n", html)
+    return collapse_newlines(html)
    return html
 def _scrape_merge_paragraphs(html):
@ -1114,7 +1112,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
        for backend in self.backends:
            with backend.handle_request():
                if lyrics := backend.fetch(artist, title, *args):
-                    return _scrape_strip_cruft(lyrics, True)
+                    return lyrics
        return None
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -129,14 +129,13 @@ class TestLyricsUtils:
                  two  !
                  <br><br \\>
                  <blink>four</blink>""",
-                "one\ntwo !\n\nfour",
+                "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
            ),
            ("foo<script>bar</script>baz", "foobaz"),
            ("foo<!--<bar>-->qux", "fooqux"),
        ],
    )
    def test_scrape_strip_cruft(self, initial_text, expected):
-        assert lyrics._scrape_strip_cruft(initial_text, True) == expected
+        assert lyrics._scrape_strip_cruft(initial_text) == expected
    def test_scrape_merge_paragraphs(self):
        text = "one</p>   <p class='myclass'>two</p><p>three"