", re.S).sub,
+ "\n",
+ )
+ #: remove text formatting (azlyrics.com, lacoccinelle.net)
+ remove_formatting = partial(
+ re.compile(r" *?(i|em|pre|strong)[^>]*>").sub, ""
+ )
+
+ @classmethod
+ def normalize_space(cls, text: str) -> str:
+ text = unescape(text).replace("\r", "").replace("\xa0", " ")
+ return cls.collapse_space(cls.expand_br(text))
+
+ @classmethod
+ def remove_ads(cls, text: str) -> str:
+ return cls.remove_adslot(cls.remove_aside(text))
+
+ @classmethod
+ def merge_paragraphs(cls, text: str) -> str:
+ return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
+
+
+class SoupMixin:
+ @classmethod
+ def pre_process_html(cls, html: str) -> str:
+ """Pre-process the HTML content before scraping."""
+ return Html.normalize_space(html)
+
+ @classmethod
+ def get_soup(cls, html: str) -> BeautifulSoup:
+ return BeautifulSoup(cls.pre_process_html(html), "html.parser")
+
+
class SearchResult(NamedTuple):
artist: str
title: str
url: str
-class SearchBackend(Backend):
+class SearchBackend(SoupMixin, Backend):
REQUIRES_BS = True
@cached_property
@@ -534,12 +586,12 @@ class Genius(SearchBackend):
def scrape(cls, html: str) -> str | None:
if m := cls.LYRICS_IN_JSON_RE.search(html):
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
- return get_soup(html_text).get_text().strip()
+ return cls.get_soup(html_text).get_text().strip()
return None
-class Tekstowo(DirectBackend):
+class Tekstowo(SoupMixin, DirectBackend):
"""Fetch lyrics from Tekstowo.pl."""
REQUIRES_BS = True
@@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):
@classmethod
def scrape(cls, html: str) -> str | None:
- soup = get_soup(html)
+ soup = cls.get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
return None
-collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
-
-
-def _scrape_strip_cruft(html: str) -> str:
- """Clean up HTML"""
- html = unescape(html)
-
- html = html.replace("\r", "\n") # Normalize EOL.
- html = re.sub(r" +", " ", html) # Whitespaces collapse.
- html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
- html = re.sub(r"(?s)<(script).*?\1>", "", html) # Strip script tags.
- html = re.sub("\u2005", " ", html) # replace unicode with regular space
- html = re.sub("
", "", html) # remove Google Ads tags
- html = re.sub(r"?(em|strong)[^>]*>", "", html) # remove italics / bold
-
- html = "\n".join([x.strip() for x in html.strip().split("\n")])
- return collapse_newlines(html)
-
-
-def _scrape_merge_paragraphs(html):
- html = re.sub(r"\s*]*)>", "\n", html)
- return re.sub(r"
\s*
", "\n", html)
-
-
-def get_soup(html: str) -> BeautifulSoup:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- return BeautifulSoup(html, "html.parser")
-
-
class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
@@ -635,6 +656,12 @@ class Google(SearchBackend):
#: Split cleaned up URL title into artist and title parts.
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
+ @classmethod
+ def pre_process_html(cls, html: str) -> str:
+ """Pre-process the HTML content before scraping."""
+ html = Html.remove_ads(super().pre_process_html(html))
+ return Html.remove_formatting(Html.merge_paragraphs(html))
+
def fetch_text(self, *args, **kwargs) -> str:
"""Handle an error so that we can continue with the next URL."""
with self.handle_request():
@@ -700,7 +727,7 @@ class Google(SearchBackend):
@classmethod
def scrape(cls, html: str) -> str | None:
# Get the longest text element (if any).
- if strings := sorted(get_soup(html).stripped_strings, key=len):
+ if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
return strings[-1]
return None
diff --git a/test/plugins/lyrics_pages.py b/test/plugins/lyrics_pages.py
index 46a72076d..bca66ef1b 100644
--- a/test/plugins/lyrics_pages.py
+++ b/test/plugins/lyrics_pages.py
@@ -223,6 +223,20 @@ lyrics_pages = [
Mademoiselle Madonna, couchée sur votre lit
Listen to the music playing in your head.
Vous écoutez la musique qui joue dans votre tête
+
+ Tuesday afternoon is never ending.
+ Le mardi après-midi n'en finit pas
+ Wednesday morning papers didn't come.
+ Le mercredi matin les journaux ne sont pas arrivés
+ Thursday night you stockings needed mending.
+ Jeudi soir, vos bas avaient besoin d'être réparés
+ See how they run.
+ Regardez comme ils filent
+
+ Lady Madonna, children at your feet.
+ Mademoiselle Madonna, les enfants à vos pieds
+ Wonder how you manage to make ends meet.
+ Je me demande comment vous vous débrouillez pour joindre les deux bouts
""",
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
),
@@ -235,29 +249,35 @@ lyrics_pages = [
Children at your feet
Wonder how you manage
To make ends meet
+
Who finds the money
When you pay the rent?
Did you think that money
Was Heaven sent?
+
Friday night arrives without a suitcase
Sunday morning creeping like a nun
Monday's child has learned
To tie his bootlace
See how they run
+
Lady Madonna
Baby at your breast
Wonders how you manage
To feed the rest
See how they run
+
Lady Madonna
Lying on the bed
Listen to the music
Playing in your head
+
Tuesday afternoon is neverending
Wednesday morning papers didn't come
Thursday night your stockings
Needed mending
See how they run
+
Lady Madonna
Children at your feet
Wonder how you manage
@@ -415,15 +435,29 @@ lyrics_pages = [
LyricsPage.make(
"https://www.musica.com/letras.asp?letra=59862",
"""
+ Lady Madonna, children at your feet
+ Wonder how you manage to make ends meet
+ Who finds the money when you pay the rent?
+ Did you think that money was heaven sent?
+
+ Friday night arrives without a suitcase
+ Sunday morning creeping like a nun
+ Monday's child has learned to tie his bootlace
+ See how they run
+
Lady Madonna, baby at your breast
Wonders how you manage to feed the rest
+
See how they run
+
Lady Madonna lying on the bed
Listen to the music playing in your head
+
Tuesday afternoon is never ending
Wednesday morning papers didn't come
Thursday night your stockings needed mending
See how they run
+
Lady Madonna, children at your feet
Wonder how you manage to make ends meet
""",
@@ -448,6 +482,14 @@ lyrics_pages = [
See how they run.
Lady Madonna, lying on the bed,
Listen to the music playing in your head.
+
+ Tuesday afternoon is never ending.
+ Wednesday morning papers didn't come.
+ Thursday night your stockings needed mending.
+ See how they run.
+
+ Lady Madonna, children at your feet.
+ Wonder how you manage to make ends meet.
""",
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
),
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 6986e4f06..9fa3931fd 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,28 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_text, expected",
- [
- (
- """
- one
-
- two !
-
- four """,
- "\none\ntwo !\n\nfour ",
- ),
- ("foobaz", "foobaz"),
- ],
- )
- def test_scrape_strip_cruft(self, initial_text, expected):
- assert lyrics._scrape_strip_cruft(initial_text) == expected
-
- def test_scrape_merge_paragraphs(self):
- text = "one two
three"
- assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
-
@pytest.mark.parametrize(
"text, expected",
[
@@ -142,6 +120,25 @@ class TestLyricsUtils:
assert lyrics.slug(text) == expected
+class TestHtml:
+ def test_scrape_strip_cruft(self):
+ initial = """
+ one
+
+ two !
+
+ four """
+ expected = "\none\ntwo !\n\nfour "
+
+ assert lyrics.Html.normalize_space(initial) == expected
+
+ def test_scrape_merge_paragraphs(self):
+ text = "one
two
three"
+ expected = "one\ntwo\n\nthree"
+
+ assert lyrics.Html.merge_paragraphs(text) == expected
+
+
class TestSearchBackend:
@pytest.fixture
def backend(self, dist_thresh):