From 70554640e579635a72b7292e541a1eb48645f712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH] Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacoccinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
None, so that `requests` detects it from the response content.
---
beetsplug/lyrics.py | 105 ++++++++++++++++++++++-------------
test/plugins/lyrics_pages.py | 42 ++++++++++++++
test/plugins/test_lyrics.py | 41 +++++++-------
3 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 0982120f2..a19d8c616 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
-BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -231,10 +230,16 @@ class RequestHandler:
def fetch_text(
self, url: str, params: JSONDict | None = None, **kwargs
) -> str:
- """Return text / HTML data from the given URL."""
+ """Return text / HTML data from the given URL.
+
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
url = self.format_url(url, params)
self.debug("Fetching HTML from {}", url)
- return r_session.get(url, **kwargs).text
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
@@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
return lyrics
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+ expand_br = partial(re.compile(r"\s* ]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+ merge_blocks = partial(re.compile(r"(?)
]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"