Do not try to strip cruft from the parsed lyrics text.

Having removed it, I found that only the Genius lyrics changed: they had an
extra newline. Thus I defined a function 'collapse_newlines', which now
gets called for the Genius lyrics.
This commit is contained in:
Šarūnas Nejus 2024-10-19 03:30:41 +01:00
parent 7c2fb31136
commit dd9f178fff
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
2 changed files with 13 additions and 16 deletions

View file

@ -59,9 +59,6 @@ except ImportError:
JSONDict = dict[str, Any] JSONDict = dict[str, Any]
DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I) BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}" USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]" INSTRUMENTAL_LYRICS = "[Instrumental]"
@ -552,8 +549,11 @@ class Genius(SearchBackend):
check = partial(self.check_match, artist, title) check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]: for hit in json["response"]["hits"]:
result = hit["result"] result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]): url = hit["result"]["url"]
return self.scrape_lyrics(self.fetch_text(hit["result"]["url"])) if check(result["primary_artist"]["name"], result["title"]) and (
lyrics := self.scrape_lyrics(self.fetch_text(url))
):
return collapse_newlines(lyrics)
return None return None
@ -670,7 +670,10 @@ def remove_credits(text):
return text return text
def _scrape_strip_cruft(html, plain_text_out=False): collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
def _scrape_strip_cruft(html: str) -> str:
"""Clean up HTML""" """Clean up HTML"""
html = unescape(html) html = unescape(html)
@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
if plain_text_out: # Strip remaining HTML tags
html = COMMENT_RE.sub("", html)
html = TAG_RE.sub("", html)
html = "\n".join([x.strip() for x in html.strip().split("\n")]) html = "\n".join([x.strip() for x in html.strip().split("\n")])
html = re.sub(r"\n{3,}", r"\n\n", html) return collapse_newlines(html)
return html
def _scrape_merge_paragraphs(html): def _scrape_merge_paragraphs(html):
@ -1114,7 +1112,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
for backend in self.backends: for backend in self.backends:
with backend.handle_request(): with backend.handle_request():
if lyrics := backend.fetch(artist, title, *args): if lyrics := backend.fetch(artist, title, *args):
return _scrape_strip_cruft(lyrics, True) return lyrics
return None return None

View file

@ -129,14 +129,13 @@ class TestLyricsUtils:
two ! two !
<br><br \\> <br><br \\>
<blink>four</blink>""", <blink>four</blink>""",
"one\ntwo !\n\nfour", "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
), ),
("foo<script>bar</script>baz", "foobaz"), ("foo<script>bar</script>baz", "foobaz"),
("foo<!--<bar>-->qux", "fooqux"),
], ],
) )
def test_scrape_strip_cruft(self, initial_text, expected): def test_scrape_strip_cruft(self, initial_text, expected):
assert lyrics._scrape_strip_cruft(initial_text, True) == expected assert lyrics._scrape_strip_cruft(initial_text) == expected
def test_scrape_merge_paragraphs(self): def test_scrape_merge_paragraphs(self):
text = "one</p> <p class='myclass'>two</p><p>three" text = "one</p> <p class='myclass'>two</p><p>three"