Create Html class for cleaning up the html text

Additionally, improve HTML pre-processing:

* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacoccinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
  None, allowing `requests` to handle it.
This commit is contained in:
Šarūnas Nejus 2024-10-13 13:34:12 +01:00
parent c5c4138d66
commit 70554640e5
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
3 changed files with 127 additions and 61 deletions

View file

@ -57,7 +57,6 @@ try:
except ImportError:
HAS_LANGDETECT = False
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@ -231,10 +230,16 @@ class RequestHandler:
def fetch_text(
self, url: str, params: JSONDict | None = None, **kwargs
) -> str:
"""Return text / HTML data from the given URL."""
"""Return text / HTML data from the given URL.
Set the encoding to None to let requests handle it because some sites
set it incorrectly.
"""
url = self.format_url(url, params)
self.debug("Fetching HTML from {}", url)
return r_session.get(url, **kwargs).text
r = r_session.get(url, **kwargs)
r.encoding = None
return r.text
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
return lyrics
class Html:
    """Regex helpers for cleaning up raw HTML text before it is parsed.

    Every substitution attribute is the ``sub`` method of a compiled pattern
    bound to its replacement with ``partial``: call it with the text to clean
    and it returns the substituted text.

    Tag names in the patterns are terminated with ``\\b`` so that a pattern
    written for one tag (``<p>``, ``<i>``, ``<br>``) does not also match
    longer tag names sharing the same prefix (``<pre>``, ``<img>``, ...).
    """

    #: strip spaces at the start of every line and squeeze any other run of
    #: spaces down to a single space
    collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
    #: replace a <br> tag, together with the whitespace around it, with a
    #: single newline
    expand_br = partial(re.compile(r"\s*<br\b[^>]*>\s*", re.I).sub, "\n")
    #: two newlines between paragraphs on the same line (musica, letras.mus.br)
    merge_blocks = partial(re.compile(r"(?<!>)</p><p\b[^>]*>").sub, "\n\n")
    #: a single new line between paragraphs on separate lines
    #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
    merge_lines = partial(
        re.compile(r"</p>\s+<p\b[^>]*>(?!___)").sub, "\n"
    )
    #: remove empty divs (lacoccinelle.net)
    remove_empty_divs = partial(re.compile(r"<div[^>]*>\s*</div>").sub, "")
    #: remove Google Ads tags (musica.com)
    remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
    #: remove adslot-Content_1 div from the lyrics text (paroles.net)
    remove_adslot = partial(
        re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub,
        "\n",
    )
    #: remove text formatting tags, including the spaces right before them
    #: (azlyrics.com, lacoccinelle.net)
    remove_formatting = partial(
        re.compile(r" *</?(i|em|pre|strong)\b[^>]*>").sub, ""
    )

    @classmethod
    def normalize_space(cls, text: str) -> str:
        """Return ``text`` with entities decoded and whitespace normalised.

        HTML entities are unescaped, carriage returns are dropped and
        non-breaking spaces become plain spaces; afterwards ``<br>`` tags are
        expanded to newlines and runs of spaces are collapsed.
        """
        text = unescape(text).replace("\r", "").replace("\xa0", " ")
        return cls.collapse_space(cls.expand_br(text))

    @classmethod
    def remove_ads(cls, text: str) -> str:
        """Return ``text`` without ``<aside>`` elements and ad-slot divs."""
        return cls.remove_adslot(cls.remove_aside(text))

    @classmethod
    def merge_paragraphs(cls, text: str) -> str:
        """Return ``text`` with paragraph boundaries replaced by newlines.

        Empty divs are removed first, then ``</p>``/``<p>`` pairs separated
        by whitespace become one newline and directly adjacent pairs become
        two newlines.
        """
        return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
class SoupMixin:
    """Mixin providing an HTML pre-processing hook and a soup factory."""

    @classmethod
    def pre_process_html(cls, html: str) -> str:
        """Return ``html`` after ``Html.normalize_space`` has been applied.

        ``get_soup`` calls this hook before handing the text to the parser.
        """
        normalized = Html.normalize_space(html)
        return normalized

    @classmethod
    def get_soup(cls, html: str) -> BeautifulSoup:
        """Pre-process ``html`` and parse the result with ``html.parser``."""
        cleaned = cls.pre_process_html(html)
        return BeautifulSoup(cleaned, "html.parser")
class SearchResult(NamedTuple):
    """A single search hit: artist, title and the URL it was found at."""

    # All three fields are plain strings, positional in this order.
    artist: str
    title: str
    url: str
class SearchBackend(Backend):
class SearchBackend(SoupMixin, Backend):
REQUIRES_BS = True
@cached_property
@ -534,12 +586,12 @@ class Genius(SearchBackend):
def scrape(cls, html: str) -> str | None:
if m := cls.LYRICS_IN_JSON_RE.search(html):
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
return cls.get_soup(html_text).get_text().strip()
return None
class Tekstowo(DirectBackend):
class Tekstowo(SoupMixin, DirectBackend):
"""Fetch lyrics from Tekstowo.pl."""
REQUIRES_BS = True
@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):
@classmethod
def scrape(cls, html: str) -> str | None:
soup = get_soup(html)
soup = cls.get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
return None
#: squeeze three or more consecutive newlines down to exactly two
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")


def _scrape_strip_cruft(html: str) -> str:
    """Clean up HTML.

    Decodes entities, normalises line endings and spaces, converts ``<br>``
    tags to newlines, removes ``<script>`` and ``<aside>`` elements as well
    as ``<em>``/``<strong>`` tags, strips every line and finally limits
    consecutive newlines to two.
    """
    html = unescape(html)
    # Normalize EOL. A CRLF pair must become ONE newline: mapping "\r" alone
    # to "\n" would turn every Windows line break into a blank line.
    html = html.replace("\r\n", "\n").replace("\r", "\n")
    html = re.sub(r" +", " ", html)  # Whitespaces collapse.
    html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
    html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
    html = html.replace("\u2005", " ")  # replace unicode with regular space
    html = re.sub("<aside .+?</aside>", "", html)  # remove Google Ads tags
    html = re.sub(r"</?(em|strong)[^>]*>", "", html)  # remove italics / bold
    html = "\n".join(line.strip() for line in html.strip().split("\n"))
    return collapse_newlines(html)
def _scrape_merge_paragraphs(html):
html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html)
return re.sub(r"<div .*>\s*</div>", "\n", html)
def get_soup(html: str) -> BeautifulSoup:
    """Clean ``html`` and parse it with the ``html.parser`` backend.

    The text goes through ``_scrape_strip_cruft`` and then
    ``_scrape_merge_paragraphs`` before it is handed to BeautifulSoup.
    """
    cleaned = _scrape_merge_paragraphs(_scrape_strip_cruft(html))
    return BeautifulSoup(cleaned, "html.parser")
class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
@ -635,6 +656,12 @@ class Google(SearchBackend):
#: Split cleaned up URL title into artist and title parts.
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
@classmethod
def pre_process_html(cls, html: str) -> str:
    """Pre-process the HTML content before scraping.

    On top of the parent's pre-processing this removes ads, merges
    paragraphs into newline-separated text and strips formatting tags,
    in that order.
    """
    normalized = super().pre_process_html(html)
    without_ads = Html.remove_ads(normalized)
    merged = Html.merge_paragraphs(without_ads)
    return Html.remove_formatting(merged)
def fetch_text(self, *args, **kwargs) -> str:
"""Handle an error so that we can continue with the next URL."""
with self.handle_request():
@ -700,7 +727,7 @@ class Google(SearchBackend):
@classmethod
def scrape(cls, html: str) -> str | None:
# Get the longest text element (if any).
if strings := sorted(get_soup(html).stripped_strings, key=len):
if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
return strings[-1]
return None

View file

@ -223,6 +223,20 @@ lyrics_pages = [
Mademoiselle Madonna, couchée sur votre lit
Listen to the music playing in your head.
Vous écoutez la musique qui joue dans votre tête
Tuesday afternoon is never ending.
Le mardi après-midi n'en finit pas
Wednesday morning papers didn't come.
Le mercredi matin les journaux ne sont pas arrivés
Thursday night you stockings needed mending.
Jeudi soir, vos bas avaient besoin d'être réparés
See how they run.
Regardez comme ils filent
Lady Madonna, children at your feet.
Mademoiselle Madonna, les enfants à vos pieds
Wonder how you manage to make ends meet.
Je me demande comment vous vous débrouillez pour joindre les deux bouts
""",
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
),
@ -235,29 +249,35 @@ lyrics_pages = [
Children at your feet
Wonder how you manage
To make ends meet
Who finds the money
When you pay the rent?
Did you think that money
Was Heaven sent?
Friday night arrives without a suitcase
Sunday morning creeping like a nun
Monday's child has learned
To tie his bootlace
See how they run
Lady Madonna
Baby at your breast
Wonders how you manage
To feed the rest
See how they run
Lady Madonna
Lying on the bed
Listen to the music
Playing in your head
Tuesday afternoon is neverending
Wednesday morning papers didn't come
Thursday night your stockings
Needed mending
See how they run
Lady Madonna
Children at your feet
Wonder how you manage
@ -415,15 +435,29 @@ lyrics_pages = [
LyricsPage.make(
"https://www.musica.com/letras.asp?letra=59862",
"""
Lady Madonna, children at your feet
Wonder how you manage to make ends meet
Who finds the money when you pay the rent?
Did you think that money was heaven sent?
Friday night arrives without a suitcase
Sunday morning creeping like a nun
Monday's child has learned to tie his bootlace
See how they run
Lady Madonna, baby at your breast
Wonders how you manage to feed the rest
See how they run
Lady Madonna lying on the bed
Listen to the music playing in your head
Tuesday afternoon is never ending
Wednesday morning papers didn't come
Thursday night your stockings needed mending
See how they run
Lady Madonna, children at your feet
Wonder how you manage to make ends meet
""",
@ -448,6 +482,14 @@ lyrics_pages = [
See how they run.
Lady Madonna, lying on the bed,
Listen to the music playing in your head.
Tuesday afternoon is never ending.
Wednesday morning papers didn't come.
Thursday night your stockings needed mending.
See how they run.
Lady Madonna, children at your feet.
Wonder how you manage to make ends meet.
""",
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
),

View file

@ -101,28 +101,6 @@ class TestLyricsUtils:
assert list(actual_titles) == [title, *expected_extra_titles]
# Each case pairs raw HTML with the text lyrics._scrape_strip_cruft is
# expected to return for it.
@pytest.mark.parametrize(
"initial_text, expected",
[
# Case 1: an entity-encoded space and several <br> variants.
(
"""<!--lyrics below-->
&nbsp;one
<br class='myclass'>
two !
<br><br \\>
<blink>four</blink>""",
"<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
),
# Case 2: a <script> element is removed together with its content.
("foo<script>bar</script>baz", "foobaz"),
],
)
def test_scrape_strip_cruft(self, initial_text, expected):
# The cleaned text must equal the expected string exactly.
assert lyrics._scrape_strip_cruft(initial_text) == expected
# Both paragraph boundaries (with and without whitespace between the tags)
# are expected to become a single newline.
def test_scrape_merge_paragraphs(self):
text = "one</p> <p class='myclass'>two</p><p>three"
assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
@pytest.mark.parametrize(
"text, expected",
[
@ -142,6 +120,25 @@ class TestLyricsUtils:
assert lyrics.slug(text) == expected
# Tests for the regex-based clean-up helpers exposed as lyrics.Html.
class TestHtml:
# normalize_space: the entity-encoded space before "one" is dropped and the
# <br> variants become newlines, while the comment and <blink> markup are
# left in place.
def test_scrape_strip_cruft(self):
initial = """<!--lyrics below-->
&nbsp;one
<br class='myclass'>
two !
<br><br \\>
<blink>four</blink>"""
expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>"
assert lyrics.Html.normalize_space(initial) == expected
# merge_paragraphs: a whitespace-separated </p> <p> pair yields one newline,
# a directly adjacent </p><p> pair yields two.
def test_scrape_merge_paragraphs(self):
text = "one</p> <p class='myclass'>two</p><p>three"
expected = "one\ntwo\n\nthree"
assert lyrics.Html.merge_paragraphs(text) == expected
class TestSearchBackend:
@pytest.fixture
def backend(self, dist_thresh):