mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Do not try to strip cruft from the parsed lyrics text.
Having removed it, I found that only the Genius lyrics changed: they had an extra newline. Thus I defined a function 'collapse_newlines', which now gets called for the Genius lyrics.
This commit is contained in:
parent
7c2fb31136
commit
dd9f178fff
2 changed files with 13 additions and 16 deletions
|
|
@ -59,9 +59,6 @@ except ImportError:
|
||||||
|
|
||||||
JSONDict = dict[str, Any]
|
JSONDict = dict[str, Any]
|
||||||
|
|
||||||
DIV_RE = re.compile(r"<(/?)div>?", re.I)
|
|
||||||
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
|
|
||||||
TAG_RE = re.compile(r"<[^>]*>")
|
|
||||||
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
||||||
USER_AGENT = f"beets/{beets.__version__}"
|
USER_AGENT = f"beets/{beets.__version__}"
|
||||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||||
|
|
@ -552,8 +549,11 @@ class Genius(SearchBackend):
|
||||||
check = partial(self.check_match, artist, title)
|
check = partial(self.check_match, artist, title)
|
||||||
for hit in json["response"]["hits"]:
|
for hit in json["response"]["hits"]:
|
||||||
result = hit["result"]
|
result = hit["result"]
|
||||||
if check(result["primary_artist"]["name"], result["title"]):
|
url = hit["result"]["url"]
|
||||||
return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
|
if check(result["primary_artist"]["name"], result["title"]) and (
|
||||||
|
lyrics := self.scrape_lyrics(self.fetch_text(url))
|
||||||
|
):
|
||||||
|
return collapse_newlines(lyrics)
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -670,7 +670,10 @@ def remove_credits(text):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def _scrape_strip_cruft(html, plain_text_out=False):
|
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_strip_cruft(html: str) -> str:
|
||||||
"""Clean up HTML"""
|
"""Clean up HTML"""
|
||||||
html = unescape(html)
|
html = unescape(html)
|
||||||
|
|
||||||
|
|
@ -682,13 +685,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
|
||||||
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
|
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
|
||||||
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
|
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
|
||||||
|
|
||||||
if plain_text_out: # Strip remaining HTML tags
|
|
||||||
html = COMMENT_RE.sub("", html)
|
|
||||||
html = TAG_RE.sub("", html)
|
|
||||||
|
|
||||||
html = "\n".join([x.strip() for x in html.strip().split("\n")])
|
html = "\n".join([x.strip() for x in html.strip().split("\n")])
|
||||||
html = re.sub(r"\n{3,}", r"\n\n", html)
|
return collapse_newlines(html)
|
||||||
return html
|
|
||||||
|
|
||||||
|
|
||||||
def _scrape_merge_paragraphs(html):
|
def _scrape_merge_paragraphs(html):
|
||||||
|
|
@ -1114,7 +1112,7 @@ class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
|
||||||
for backend in self.backends:
|
for backend in self.backends:
|
||||||
with backend.handle_request():
|
with backend.handle_request():
|
||||||
if lyrics := backend.fetch(artist, title, *args):
|
if lyrics := backend.fetch(artist, title, *args):
|
||||||
return _scrape_strip_cruft(lyrics, True)
|
return lyrics
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -129,14 +129,13 @@ class TestLyricsUtils:
|
||||||
two !
|
two !
|
||||||
<br><br \\>
|
<br><br \\>
|
||||||
<blink>four</blink>""",
|
<blink>four</blink>""",
|
||||||
"one\ntwo !\n\nfour",
|
"<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
|
||||||
),
|
),
|
||||||
("foo<script>bar</script>baz", "foobaz"),
|
("foo<script>bar</script>baz", "foobaz"),
|
||||||
("foo<!--<bar>-->qux", "fooqux"),
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_scrape_strip_cruft(self, initial_text, expected):
|
def test_scrape_strip_cruft(self, initial_text, expected):
|
||||||
assert lyrics._scrape_strip_cruft(initial_text, True) == expected
|
assert lyrics._scrape_strip_cruft(initial_text) == expected
|
||||||
|
|
||||||
def test_scrape_merge_paragraphs(self):
|
def test_scrape_merge_paragraphs(self):
|
||||||
text = "one</p> <p class='myclass'>two</p><p>three"
|
text = "one</p> <p class='myclass'>two</p><p>three"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue