Create Html class for cleaning up the html text

Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacoccinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting the response encoding to None, which lets `requests` detect the encoding itself instead of trusting the one declared by the site.
2025-12-06 08:39:17 +01:00 · 2024-10-13 13:34:12 +01:00 · 2024-10-13 13:34:12 +01:00 · 70554640e5
commit 70554640e5
parent c5c4138d66
3 changed files with 127 additions and 61 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -57,7 +57,6 @@ try:
 except ImportError:
    HAS_LANGDETECT = False

-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
 USER_AGENT = f"beets/{beets.__version__}"
 INSTRUMENTAL_LYRICS = "[Instrumental]"

@ -231,10 +230,16 @@ class RequestHandler:
    def fetch_text(
        self, url: str, params: JSONDict | None = None, **kwargs
    ) -> str:
-        """Return text / HTML data from the given URL."""
+        """Return text / HTML data from the given URL.
+
+        Set the encoding to None to let requests handle it because some sites
+        set it incorrectly.
+        """
        url = self.format_url(url, params)
        self.debug("Fetching HTML from {}", url)
-        return r_session.get(url, **kwargs).text
+        r = r_session.get(url, **kwargs)
+        r.encoding = None
+        return r.text

    def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
        """Return JSON data from the given URL."""
@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
        return lyrics


+class Html:
+    collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+    expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+    #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+    merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+    #: a single new line between paragraphs on separate lines
+    #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+    merge_lines = partial(re.compile(r"</p>\s+<p[^>]*>(?!___)").sub, "\n")
+    #: remove empty divs (lacoccinelle.net)
+    remove_empty_divs = partial(re.compile(r"<div[^>]*>\s*</div>").sub, "")
+    #: remove Google Ads tags (musica.com)
+    remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
+    #: remove adslot-Content_1 div from the lyrics text (paroles.net)
+    remove_adslot = partial(
+        re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub,
+        "\n",
+    )
+    #: remove text formatting (azlyrics.com, lacoccinelle.net)
+    remove_formatting = partial(
+        re.compile(r" *</?(i|em|pre|strong)[^>]*>").sub, ""
+    )
+
+    @classmethod
+    def normalize_space(cls, text: str) -> str:
+        text = unescape(text).replace("\r", "").replace("\xa0", " ")
+        return cls.collapse_space(cls.expand_br(text))
+
+    @classmethod
+    def remove_ads(cls, text: str) -> str:
+        return cls.remove_adslot(cls.remove_aside(text))
+
+    @classmethod
+    def merge_paragraphs(cls, text: str) -> str:
+        return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
+
+
+class SoupMixin:
+    @classmethod
+    def pre_process_html(cls, html: str) -> str:
+        """Pre-process the HTML content before scraping."""
+        return Html.normalize_space(html)
+
+    @classmethod
+    def get_soup(cls, html: str) -> BeautifulSoup:
+        return BeautifulSoup(cls.pre_process_html(html), "html.parser")
+
+
 class SearchResult(NamedTuple):
    artist: str
    title: str
    url: str


-class SearchBackend(Backend):
+class SearchBackend(SoupMixin, Backend):
    REQUIRES_BS = True

    @cached_property
@ -534,12 +586,12 @@ class Genius(SearchBackend):
    def scrape(cls, html: str) -> str | None:
        if m := cls.LYRICS_IN_JSON_RE.search(html):
            html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
-            return get_soup(html_text).get_text().strip()
+            return cls.get_soup(html_text).get_text().strip()

        return None


-class Tekstowo(DirectBackend):
+class Tekstowo(SoupMixin, DirectBackend):
    """Fetch lyrics from Tekstowo.pl."""

    REQUIRES_BS = True
@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):

    @classmethod
    def scrape(cls, html: str) -> str | None:
-        soup = get_soup(html)
+        soup = cls.get_soup(html)

        if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
            return lyrics_div.get_text()
@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
        return None


-collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
-
-
-def _scrape_strip_cruft(html: str) -> str:
-    """Clean up HTML"""
-    html = unescape(html)
-
-    html = html.replace("\r", "\n")  # Normalize EOL.
-    html = re.sub(r" +", " ", html)  # Whitespaces collapse.
-    html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
-    html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
-    html = re.sub("\u2005", " ", html)  # replace unicode with regular space
-    html = re.sub("<aside .+?</aside>", "", html)  # remove Google Ads tags
-    html = re.sub(r"</?(em|strong)[^>]*>", "", html)  # remove italics / bold
-
-    html = "\n".join([x.strip() for x in html.strip().split("\n")])
-    return collapse_newlines(html)
-
-
-def _scrape_merge_paragraphs(html):
-    html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html)
-    return re.sub(r"<div .*>\s*</div>", "\n", html)
-
-
-def get_soup(html: str) -> BeautifulSoup:
-    html = _scrape_strip_cruft(html)
-    html = _scrape_merge_paragraphs(html)
-
-    return BeautifulSoup(html, "html.parser")
-
-
 class Google(SearchBackend):
    """Fetch lyrics from Google search results."""

@ -635,6 +656,12 @@ class Google(SearchBackend):
    #: Split cleaned up URL title into artist and title parts.
    URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")

+    @classmethod
+    def pre_process_html(cls, html: str) -> str:
+        """Pre-process the HTML content before scraping."""
+        html = Html.remove_ads(super().pre_process_html(html))
+        return Html.remove_formatting(Html.merge_paragraphs(html))
+
    def fetch_text(self, *args, **kwargs) -> str:
        """Handle an error so that we can continue with the next URL."""
        with self.handle_request():
@ -700,7 +727,7 @@ class Google(SearchBackend):
    @classmethod
    def scrape(cls, html: str) -> str | None:
        # Get the longest text element (if any).
-        if strings := sorted(get_soup(html).stripped_strings, key=len):
+        if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
            return strings[-1]

        return None
--- a/test/plugins/lyrics_pages.py
+++ b/test/plugins/lyrics_pages.py
@ -223,6 +223,20 @@ lyrics_pages = [
        Mademoiselle Madonna, couchée sur votre lit
        Listen to the music playing in your head.
        Vous écoutez la musique qui joue dans votre tête
+
+        Tuesday afternoon is never ending.
+        Le mardi après-midi n'en finit pas
+        Wednesday morning papers didn't come.
+        Le mercredi matin les journaux ne sont pas arrivés
+        Thursday night you stockings needed mending.
+        Jeudi soir, vos bas avaient besoin d'être réparés
+        See how they run.
+        Regardez comme ils filent
+
+        Lady Madonna, children at your feet.
+        Mademoiselle Madonna, les enfants à vos pieds
+        Wonder how you manage to make ends meet.
+        Je me demande comment vous vous débrouillez pour joindre les deux bouts
        """,
        url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson",  # noqa: E501
    ),
@ -235,29 +249,35 @@ lyrics_pages = [
        Children at your feet
        Wonder how you manage
        To make ends meet
+
        Who finds the money
        When you pay the rent?
        Did you think that money
        Was Heaven sent?
+
        Friday night arrives without a suitcase
        Sunday morning creeping like a nun
        Monday's child has learned
        To tie his bootlace
        See how they run
+
        Lady Madonna
        Baby at your breast
        Wonders how you manage
        To feed the rest
        See how they run
+
        Lady Madonna
        Lying on the bed
        Listen to the music
        Playing in your head
+
        Tuesday afternoon is neverending
        Wednesday morning papers didn't come
        Thursday night your stockings
        Needed mending
        See how they run
+
        Lady Madonna
        Children at your feet
        Wonder how you manage
@ -415,15 +435,29 @@ lyrics_pages = [
    LyricsPage.make(
        "https://www.musica.com/letras.asp?letra=59862",
        """
+        Lady Madonna, children at your feet
+        Wonder how you manage to make ends meet
+        Who finds the money when you pay the rent?
+        Did you think that money was heaven sent?
+
+        Friday night arrives without a suitcase
+        Sunday morning creeping like a nun
+        Monday's child has learned to tie his bootlace
+        See how they run
+
        Lady Madonna, baby at your breast
        Wonders how you manage to feed the rest
+
        See how they run
+
        Lady Madonna lying on the bed
        Listen to the music playing in your head
+
        Tuesday afternoon is never ending
        Wednesday morning papers didn't come
        Thursday night your stockings needed mending
        See how they run
+
        Lady Madonna, children at your feet
        Wonder how you manage to make ends meet
        """,
@ -448,6 +482,14 @@ lyrics_pages = [
        See how they run.
        Lady Madonna, lying on the bed,
        Listen to the music playing in your head.
+
+        Tuesday afternoon is never ending.
+        Wednesday morning papers didn't come.
+        Thursday night your stockings needed mending.
+        See how they run.
+
+        Lady Madonna, children at your feet.
+        Wonder how you manage to make ends meet.
        """,
        url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
    ),
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@ -101,28 +101,6 @@ class TestLyricsUtils:

        assert list(actual_titles) == [title, *expected_extra_titles]

-    @pytest.mark.parametrize(
-        "initial_text, expected",
-        [
-            (
-                """<!--lyrics below-->
-                  &nbsp;one
-                  <br class='myclass'>
-                  two  !
-                  <br><br \\>
-                  <blink>four</blink>""",
-                "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
-            ),
-            ("foo<script>bar</script>baz", "foobaz"),
-        ],
-    )
-    def test_scrape_strip_cruft(self, initial_text, expected):
-        assert lyrics._scrape_strip_cruft(initial_text) == expected
-
-    def test_scrape_merge_paragraphs(self):
-        text = "one</p>   <p class='myclass'>two</p><p>three"
-        assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
-
    @pytest.mark.parametrize(
        "text, expected",
        [
@ -142,6 +120,25 @@ class TestLyricsUtils:
        assert lyrics.slug(text) == expected


+class TestHtml:
+    def test_scrape_strip_cruft(self):
+        initial = """<!--lyrics below-->
+                  &nbsp;one
+                  <br class='myclass'>
+                  two  !
+                  <br><br \\>
+                  <blink>four</blink>"""
+        expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>"
+
+        assert lyrics.Html.normalize_space(initial) == expected
+
+    def test_scrape_merge_paragraphs(self):
+        text = "one</p>   <p class='myclass'>two</p><p>three"
+        expected = "one\ntwo\n\nthree"
+
+        assert lyrics.Html.merge_paragraphs(text) == expected
+
+
 class TestSearchBackend:
    @pytest.fixture
    def backend(self, dist_thresh):