From 70554640e579635a72b7292e541a1eb48645f712 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= <snejus@protonmail.com>
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH] Create Html class for cleaning up the html text

Additionally, improve HTML pre-processing:

* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacoccinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
  None, allowing `requests` to handle it.
---
 beetsplug/lyrics.py          | 105 ++++++++++++++++++++++-------------
 test/plugins/lyrics_pages.py |  42 ++++++++++++++
 test/plugins/test_lyrics.py  |  41 +++++++-------
 3 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 0982120f2..a19d8c616 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@ try:
 except ImportError:
     HAS_LANGDETECT = False
 
-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
 USER_AGENT = f"beets/{beets.__version__}"
 INSTRUMENTAL_LYRICS = "[Instrumental]"
 
@@ -231,10 +230,16 @@ class RequestHandler:
     def fetch_text(
         self, url: str, params: JSONDict | None = None, **kwargs
     ) -> str:
-        """Return text / HTML data from the given URL."""
+        """Return text / HTML data from the given URL.
+
+        Set the encoding to None to let requests handle it because some sites
+        set it incorrectly.
+        """
         url = self.format_url(url, params)
         self.debug("Fetching HTML from {}", url)
-        return r_session.get(url, **kwargs).text
+        r = r_session.get(url, **kwargs)
+        r.encoding = None
+        return r.text
 
     def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
         """Return JSON data from the given URL."""
@@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
         return lyrics
 
 
+class Html:
+    collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+    expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+    #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+    merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+    #: a single new line between paragraphs on separate lines
+    #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+    merge_lines = partial(re.compile(r"</p>\s+<p[^>]*>(?!___)").sub, "\n")
+    #: remove empty divs (lacoccinelle.net)
+    remove_empty_divs = partial(re.compile(r"<div[^>]*>\s*</div>").sub, "")
+    #: remove Google Ads tags (musica.com)
+    remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
+    #: remove adslot-Content_1 div from the lyrics text (paroles.net)
+    remove_adslot = partial(
+        re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub,
+        "\n",
+    )
+    #: remove text formatting (azlyrics.com, lacoccinelle.net)
+    remove_formatting = partial(
+        re.compile(r" *</?(i|em|pre|strong)[^>]*>").sub, ""
+    )
+
+    @classmethod
+    def normalize_space(cls, text: str) -> str:
+        text = unescape(text).replace("\r", "").replace("\xa0", " ")
+        return cls.collapse_space(cls.expand_br(text))
+
+    @classmethod
+    def remove_ads(cls, text: str) -> str:
+        return cls.remove_adslot(cls.remove_aside(text))
+
+    @classmethod
+    def merge_paragraphs(cls, text: str) -> str:
+        return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
+
+
+class SoupMixin:
+    @classmethod
+    def pre_process_html(cls, html: str) -> str:
+        """Pre-process the HTML content before scraping."""
+        return Html.normalize_space(html)
+
+    @classmethod
+    def get_soup(cls, html: str) -> BeautifulSoup:
+        return BeautifulSoup(cls.pre_process_html(html), "html.parser")
+
+
 class SearchResult(NamedTuple):
     artist: str
     title: str
     url: str
 
 
-class SearchBackend(Backend):
+class SearchBackend(SoupMixin, Backend):
     REQUIRES_BS = True
 
     @cached_property
@@ -534,12 +586,12 @@ class Genius(SearchBackend):
     def scrape(cls, html: str) -> str | None:
         if m := cls.LYRICS_IN_JSON_RE.search(html):
             html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
-            return get_soup(html_text).get_text().strip()
+            return cls.get_soup(html_text).get_text().strip()
 
         return None
 
 
-class Tekstowo(DirectBackend):
+class Tekstowo(SoupMixin, DirectBackend):
     """Fetch lyrics from Tekstowo.pl."""
 
     REQUIRES_BS = True
@@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):
 
     @classmethod
     def scrape(cls, html: str) -> str | None:
-        soup = get_soup(html)
+        soup = cls.get_soup(html)
 
         if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
             return lyrics_div.get_text()
@@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
         return None
 
 
-collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
-
-
-def _scrape_strip_cruft(html: str) -> str:
-    """Clean up HTML"""
-    html = unescape(html)
-
-    html = html.replace("\r", "\n")  # Normalize EOL.
-    html = re.sub(r" +", " ", html)  # Whitespaces collapse.
-    html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
-    html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
-    html = re.sub("\u2005", " ", html)  # replace unicode with regular space
-    html = re.sub("<aside .+?</aside>", "", html)  # remove Google Ads tags
-    html = re.sub(r"</?(em|strong)[^>]*>", "", html)  # remove italics / bold
-
-    html = "\n".join([x.strip() for x in html.strip().split("\n")])
-    return collapse_newlines(html)
-
-
-def _scrape_merge_paragraphs(html):
-    html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html)
-    return re.sub(r"<div .*>\s*</div>", "\n", html)
-
-
-def get_soup(html: str) -> BeautifulSoup:
-    html = _scrape_strip_cruft(html)
-    html = _scrape_merge_paragraphs(html)
-
-    return BeautifulSoup(html, "html.parser")
-
-
 class Google(SearchBackend):
     """Fetch lyrics from Google search results."""
 
@@ -635,6 +656,12 @@ class Google(SearchBackend):
     #: Split cleaned up URL title into artist and title parts.
     URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
 
+    @classmethod
+    def pre_process_html(cls, html: str) -> str:
+        """Pre-process the HTML content before scraping."""
+        html = Html.remove_ads(super().pre_process_html(html))
+        return Html.remove_formatting(Html.merge_paragraphs(html))
+
     def fetch_text(self, *args, **kwargs) -> str:
         """Handle an error so that we can continue with the next URL."""
         with self.handle_request():
@@ -700,7 +727,7 @@ class Google(SearchBackend):
     @classmethod
     def scrape(cls, html: str) -> str | None:
         # Get the longest text element (if any).
-        if strings := sorted(get_soup(html).stripped_strings, key=len):
+        if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
             return strings[-1]
 
         return None
diff --git a/test/plugins/lyrics_pages.py b/test/plugins/lyrics_pages.py
index 46a72076d..bca66ef1b 100644
--- a/test/plugins/lyrics_pages.py
+++ b/test/plugins/lyrics_pages.py
@@ -223,6 +223,20 @@ lyrics_pages = [
         Mademoiselle Madonna, couchée sur votre lit
         Listen to the music playing in your head.
         Vous écoutez la musique qui joue dans votre tête
+
+        Tuesday afternoon is never ending.
+        Le mardi après-midi n'en finit pas
+        Wednesday morning papers didn't come.
+        Le mercredi matin les journaux ne sont pas arrivés
+        Thursday night you stockings needed mending.
+        Jeudi soir, vos bas avaient besoin d'être réparés
+        See how they run.
+        Regardez comme ils filent
+
+        Lady Madonna, children at your feet.
+        Mademoiselle Madonna, les enfants à vos pieds
+        Wonder how you manage to make ends meet.
+        Je me demande comment vous vous débrouillez pour joindre les deux bouts
         """,
         url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson",  # noqa: E501
     ),
@@ -235,29 +249,35 @@ lyrics_pages = [
         Children at your feet
         Wonder how you manage
         To make ends meet
+
         Who finds the money
         When you pay the rent?
         Did you think that money
         Was Heaven sent?
+
         Friday night arrives without a suitcase
         Sunday morning creeping like a nun
         Monday's child has learned
         To tie his bootlace
         See how they run
+
         Lady Madonna
         Baby at your breast
         Wonders how you manage
         To feed the rest
         See how they run
+
         Lady Madonna
         Lying on the bed
         Listen to the music
         Playing in your head
+
         Tuesday afternoon is neverending
         Wednesday morning papers didn't come
         Thursday night your stockings
         Needed mending
         See how they run
+
         Lady Madonna
         Children at your feet
         Wonder how you manage
@@ -415,15 +435,29 @@ lyrics_pages = [
     LyricsPage.make(
         "https://www.musica.com/letras.asp?letra=59862",
         """
+        Lady Madonna, children at your feet
+        Wonder how you manage to make ends meet
+        Who finds the money when you pay the rent?
+        Did you think that money was heaven sent?
+
+        Friday night arrives without a suitcase
+        Sunday morning creeping like a nun
+        Monday's child has learned to tie his bootlace
+        See how they run
+
         Lady Madonna, baby at your breast
         Wonders how you manage to feed the rest
+
         See how they run
+
         Lady Madonna lying on the bed
         Listen to the music playing in your head
+
         Tuesday afternoon is never ending
         Wednesday morning papers didn't come
         Thursday night your stockings needed mending
         See how they run
+
         Lady Madonna, children at your feet
         Wonder how you manage to make ends meet
         """,
@@ -448,6 +482,14 @@ lyrics_pages = [
         See how they run.
         Lady Madonna, lying on the bed,
         Listen to the music playing in your head.
+
+        Tuesday afternoon is never ending.
+        Wednesday morning papers didn't come.
+        Thursday night your stockings needed mending.
+        See how they run.
+
+        Lady Madonna, children at your feet.
+        Wonder how you manage to make ends meet.
         """,
         url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
     ),
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 6986e4f06..9fa3931fd 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,28 +101,6 @@ class TestLyricsUtils:
 
         assert list(actual_titles) == [title, *expected_extra_titles]
 
-    @pytest.mark.parametrize(
-        "initial_text, expected",
-        [
-            (
-                """<!--lyrics below-->
-                  &nbsp;one
-                  <br class='myclass'>
-                  two  !
-                  <br><br \\>
-                  <blink>four</blink>""",
-                "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
-            ),
-            ("foo<script>bar</script>baz", "foobaz"),
-        ],
-    )
-    def test_scrape_strip_cruft(self, initial_text, expected):
-        assert lyrics._scrape_strip_cruft(initial_text) == expected
-
-    def test_scrape_merge_paragraphs(self):
-        text = "one</p>   <p class='myclass'>two</p><p>three"
-        assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
-
     @pytest.mark.parametrize(
         "text, expected",
         [
@@ -142,6 +120,25 @@ class TestLyricsUtils:
         assert lyrics.slug(text) == expected
 
 
+class TestHtml:
+    def test_scrape_strip_cruft(self):
+        initial = """<!--lyrics below-->
+                  &nbsp;one
+                  <br class='myclass'>
+                  two  !
+                  <br><br \\>
+                  <blink>four</blink>"""
+        expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>"
+
+        assert lyrics.Html.normalize_space(initial) == expected
+
+    def test_scrape_merge_paragraphs(self):
+        text = "one</p>   <p class='myclass'>two</p><p>three"
+        expected = "one\ntwo\n\nthree"
+
+        assert lyrics.Html.merge_paragraphs(text) == expected
+
+
 class TestSearchBackend:
     @pytest.fixture
     def backend(self, dist_thresh):