mirror of
https://github.com/beetbox/beets.git
synced 2025-12-07 17:16:07 +01:00
Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing: * Ensure a new line between blocks of lyrics text from letras.mus.br. * Parse a missing last block of lyrics text from lacoccinelle.net. * Parse a missing last block of lyrics text from paroles.net. * Fix encoding issues with AZLyrics by setting response encoding to None, allowing `requests` to handle it.
This commit is contained in:
parent
c5c4138d66
commit
70554640e5
3 changed files with 127 additions and 61 deletions
|
|
@ -57,7 +57,6 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
HAS_LANGDETECT = False
|
HAS_LANGDETECT = False
|
||||||
|
|
||||||
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
|
||||||
USER_AGENT = f"beets/{beets.__version__}"
|
USER_AGENT = f"beets/{beets.__version__}"
|
||||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||||
|
|
||||||
|
|
@ -231,10 +230,16 @@ class RequestHandler:
|
||||||
def fetch_text(
|
def fetch_text(
|
||||||
self, url: str, params: JSONDict | None = None, **kwargs
|
self, url: str, params: JSONDict | None = None, **kwargs
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Return text / HTML data from the given URL."""
|
"""Return text / HTML data from the given URL.
|
||||||
|
|
||||||
|
Set the encoding to None to let requests handle it because some sites
|
||||||
|
set it incorrectly.
|
||||||
|
"""
|
||||||
url = self.format_url(url, params)
|
url = self.format_url(url, params)
|
||||||
self.debug("Fetching HTML from {}", url)
|
self.debug("Fetching HTML from {}", url)
|
||||||
return r_session.get(url, **kwargs).text
|
r = r_session.get(url, **kwargs)
|
||||||
|
r.encoding = None
|
||||||
|
return r.text
|
||||||
|
|
||||||
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
|
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
|
||||||
"""Return JSON data from the given URL."""
|
"""Return JSON data from the given URL."""
|
||||||
|
|
@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
|
||||||
return lyrics
|
return lyrics
|
||||||
|
|
||||||
|
|
||||||
|
class Html:
|
||||||
|
collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
|
||||||
|
expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
|
||||||
|
#: two newlines between paragraphs on the same line (musica, letras.mus.br)
|
||||||
|
merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
|
||||||
|
#: a single new line between paragraphs on separate lines
|
||||||
|
#: (paroles.net, sweetslyrics.com, lacoccinelle.net)
|
||||||
|
merge_lines = partial(re.compile(r"</p>\s+<p[^>]*>(?!___)").sub, "\n")
|
||||||
|
#: remove empty divs (lacoccinelle.net)
|
||||||
|
remove_empty_divs = partial(re.compile(r"<div[^>]*>\s*</div>").sub, "")
|
||||||
|
#: remove Google Ads tags (musica.com)
|
||||||
|
remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
|
||||||
|
#: remove adslot-Content_1 div from the lyrics text (paroles.net)
|
||||||
|
remove_adslot = partial(
|
||||||
|
re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub,
|
||||||
|
"\n",
|
||||||
|
)
|
||||||
|
#: remove text formatting (azlyrics.com, lacoccinelle.net)
|
||||||
|
remove_formatting = partial(
|
||||||
|
re.compile(r" *</?(i|em|pre|strong)[^>]*>").sub, ""
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def normalize_space(cls, text: str) -> str:
|
||||||
|
text = unescape(text).replace("\r", "").replace("\xa0", " ")
|
||||||
|
return cls.collapse_space(cls.expand_br(text))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def remove_ads(cls, text: str) -> str:
|
||||||
|
return cls.remove_adslot(cls.remove_aside(text))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def merge_paragraphs(cls, text: str) -> str:
|
||||||
|
return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
|
||||||
|
|
||||||
|
|
||||||
|
class SoupMixin:
|
||||||
|
@classmethod
|
||||||
|
def pre_process_html(cls, html: str) -> str:
|
||||||
|
"""Pre-process the HTML content before scraping."""
|
||||||
|
return Html.normalize_space(html)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_soup(cls, html: str) -> BeautifulSoup:
|
||||||
|
return BeautifulSoup(cls.pre_process_html(html), "html.parser")
|
||||||
|
|
||||||
|
|
||||||
class SearchResult(NamedTuple):
|
class SearchResult(NamedTuple):
|
||||||
artist: str
|
artist: str
|
||||||
title: str
|
title: str
|
||||||
url: str
|
url: str
|
||||||
|
|
||||||
|
|
||||||
class SearchBackend(Backend):
|
class SearchBackend(SoupMixin, Backend):
|
||||||
REQUIRES_BS = True
|
REQUIRES_BS = True
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
|
|
@ -534,12 +586,12 @@ class Genius(SearchBackend):
|
||||||
def scrape(cls, html: str) -> str | None:
|
def scrape(cls, html: str) -> str | None:
|
||||||
if m := cls.LYRICS_IN_JSON_RE.search(html):
|
if m := cls.LYRICS_IN_JSON_RE.search(html):
|
||||||
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
|
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
|
||||||
return get_soup(html_text).get_text().strip()
|
return cls.get_soup(html_text).get_text().strip()
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
class Tekstowo(DirectBackend):
|
class Tekstowo(SoupMixin, DirectBackend):
|
||||||
"""Fetch lyrics from Tekstowo.pl."""
|
"""Fetch lyrics from Tekstowo.pl."""
|
||||||
|
|
||||||
REQUIRES_BS = True
|
REQUIRES_BS = True
|
||||||
|
|
@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def scrape(cls, html: str) -> str | None:
|
def scrape(cls, html: str) -> str | None:
|
||||||
soup = get_soup(html)
|
soup = cls.get_soup(html)
|
||||||
|
|
||||||
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
||||||
return lyrics_div.get_text()
|
return lyrics_div.get_text()
|
||||||
|
|
@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
|
|
||||||
|
|
||||||
|
|
||||||
def _scrape_strip_cruft(html: str) -> str:
|
|
||||||
"""Clean up HTML"""
|
|
||||||
html = unescape(html)
|
|
||||||
|
|
||||||
html = html.replace("\r", "\n") # Normalize EOL.
|
|
||||||
html = re.sub(r" +", " ", html) # Whitespaces collapse.
|
|
||||||
html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
|
|
||||||
html = re.sub(r"(?s)<(script).*?</\1>", "", html) # Strip script tags.
|
|
||||||
html = re.sub("\u2005", " ", html) # replace unicode with regular space
|
|
||||||
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
|
|
||||||
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
|
|
||||||
|
|
||||||
html = "\n".join([x.strip() for x in html.strip().split("\n")])
|
|
||||||
return collapse_newlines(html)
|
|
||||||
|
|
||||||
|
|
||||||
def _scrape_merge_paragraphs(html):
|
|
||||||
html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html)
|
|
||||||
return re.sub(r"<div .*>\s*</div>", "\n", html)
|
|
||||||
|
|
||||||
|
|
||||||
def get_soup(html: str) -> BeautifulSoup:
|
|
||||||
html = _scrape_strip_cruft(html)
|
|
||||||
html = _scrape_merge_paragraphs(html)
|
|
||||||
|
|
||||||
return BeautifulSoup(html, "html.parser")
|
|
||||||
|
|
||||||
|
|
||||||
class Google(SearchBackend):
|
class Google(SearchBackend):
|
||||||
"""Fetch lyrics from Google search results."""
|
"""Fetch lyrics from Google search results."""
|
||||||
|
|
||||||
|
|
@ -635,6 +656,12 @@ class Google(SearchBackend):
|
||||||
#: Split cleaned up URL title into artist and title parts.
|
#: Split cleaned up URL title into artist and title parts.
|
||||||
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
|
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def pre_process_html(cls, html: str) -> str:
|
||||||
|
"""Pre-process the HTML content before scraping."""
|
||||||
|
html = Html.remove_ads(super().pre_process_html(html))
|
||||||
|
return Html.remove_formatting(Html.merge_paragraphs(html))
|
||||||
|
|
||||||
def fetch_text(self, *args, **kwargs) -> str:
|
def fetch_text(self, *args, **kwargs) -> str:
|
||||||
"""Handle an error so that we can continue with the next URL."""
|
"""Handle an error so that we can continue with the next URL."""
|
||||||
with self.handle_request():
|
with self.handle_request():
|
||||||
|
|
@ -700,7 +727,7 @@ class Google(SearchBackend):
|
||||||
@classmethod
|
@classmethod
|
||||||
def scrape(cls, html: str) -> str | None:
|
def scrape(cls, html: str) -> str | None:
|
||||||
# Get the longest text element (if any).
|
# Get the longest text element (if any).
|
||||||
if strings := sorted(get_soup(html).stripped_strings, key=len):
|
if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
|
||||||
return strings[-1]
|
return strings[-1]
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
|
||||||
|
|
@ -223,6 +223,20 @@ lyrics_pages = [
|
||||||
Mademoiselle Madonna, couchée sur votre lit
|
Mademoiselle Madonna, couchée sur votre lit
|
||||||
Listen to the music playing in your head.
|
Listen to the music playing in your head.
|
||||||
Vous écoutez la musique qui joue dans votre tête
|
Vous écoutez la musique qui joue dans votre tête
|
||||||
|
|
||||||
|
Tuesday afternoon is never ending.
|
||||||
|
Le mardi après-midi n'en finit pas
|
||||||
|
Wednesday morning papers didn't come.
|
||||||
|
Le mercredi matin les journaux ne sont pas arrivés
|
||||||
|
Thursday night you stockings needed mending.
|
||||||
|
Jeudi soir, vos bas avaient besoin d'être réparés
|
||||||
|
See how they run.
|
||||||
|
Regardez comme ils filent
|
||||||
|
|
||||||
|
Lady Madonna, children at your feet.
|
||||||
|
Mademoiselle Madonna, les enfants à vos pieds
|
||||||
|
Wonder how you manage to make ends meet.
|
||||||
|
Je me demande comment vous vous débrouillez pour joindre les deux bouts
|
||||||
""",
|
""",
|
||||||
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
|
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
|
||||||
),
|
),
|
||||||
|
|
@ -235,29 +249,35 @@ lyrics_pages = [
|
||||||
Children at your feet
|
Children at your feet
|
||||||
Wonder how you manage
|
Wonder how you manage
|
||||||
To make ends meet
|
To make ends meet
|
||||||
|
|
||||||
Who finds the money
|
Who finds the money
|
||||||
When you pay the rent?
|
When you pay the rent?
|
||||||
Did you think that money
|
Did you think that money
|
||||||
Was Heaven sent?
|
Was Heaven sent?
|
||||||
|
|
||||||
Friday night arrives without a suitcase
|
Friday night arrives without a suitcase
|
||||||
Sunday morning creeping like a nun
|
Sunday morning creeping like a nun
|
||||||
Monday's child has learned
|
Monday's child has learned
|
||||||
To tie his bootlace
|
To tie his bootlace
|
||||||
See how they run
|
See how they run
|
||||||
|
|
||||||
Lady Madonna
|
Lady Madonna
|
||||||
Baby at your breast
|
Baby at your breast
|
||||||
Wonders how you manage
|
Wonders how you manage
|
||||||
To feed the rest
|
To feed the rest
|
||||||
See how they run
|
See how they run
|
||||||
|
|
||||||
Lady Madonna
|
Lady Madonna
|
||||||
Lying on the bed
|
Lying on the bed
|
||||||
Listen to the music
|
Listen to the music
|
||||||
Playing in your head
|
Playing in your head
|
||||||
|
|
||||||
Tuesday afternoon is neverending
|
Tuesday afternoon is neverending
|
||||||
Wednesday morning papers didn't come
|
Wednesday morning papers didn't come
|
||||||
Thursday night your stockings
|
Thursday night your stockings
|
||||||
Needed mending
|
Needed mending
|
||||||
See how they run
|
See how they run
|
||||||
|
|
||||||
Lady Madonna
|
Lady Madonna
|
||||||
Children at your feet
|
Children at your feet
|
||||||
Wonder how you manage
|
Wonder how you manage
|
||||||
|
|
@ -415,15 +435,29 @@ lyrics_pages = [
|
||||||
LyricsPage.make(
|
LyricsPage.make(
|
||||||
"https://www.musica.com/letras.asp?letra=59862",
|
"https://www.musica.com/letras.asp?letra=59862",
|
||||||
"""
|
"""
|
||||||
|
Lady Madonna, children at your feet
|
||||||
|
Wonder how you manage to make ends meet
|
||||||
|
Who finds the money when you pay the rent?
|
||||||
|
Did you think that money was heaven sent?
|
||||||
|
|
||||||
|
Friday night arrives without a suitcase
|
||||||
|
Sunday morning creeping like a nun
|
||||||
|
Monday's child has learned to tie his bootlace
|
||||||
|
See how they run
|
||||||
|
|
||||||
Lady Madonna, baby at your breast
|
Lady Madonna, baby at your breast
|
||||||
Wonders how you manage to feed the rest
|
Wonders how you manage to feed the rest
|
||||||
|
|
||||||
See how they run
|
See how they run
|
||||||
|
|
||||||
Lady Madonna lying on the bed
|
Lady Madonna lying on the bed
|
||||||
Listen to the music playing in your head
|
Listen to the music playing in your head
|
||||||
|
|
||||||
Tuesday afternoon is never ending
|
Tuesday afternoon is never ending
|
||||||
Wednesday morning papers didn't come
|
Wednesday morning papers didn't come
|
||||||
Thursday night your stockings needed mending
|
Thursday night your stockings needed mending
|
||||||
See how they run
|
See how they run
|
||||||
|
|
||||||
Lady Madonna, children at your feet
|
Lady Madonna, children at your feet
|
||||||
Wonder how you manage to make ends meet
|
Wonder how you manage to make ends meet
|
||||||
""",
|
""",
|
||||||
|
|
@ -448,6 +482,14 @@ lyrics_pages = [
|
||||||
See how they run.
|
See how they run.
|
||||||
Lady Madonna, lying on the bed,
|
Lady Madonna, lying on the bed,
|
||||||
Listen to the music playing in your head.
|
Listen to the music playing in your head.
|
||||||
|
|
||||||
|
Tuesday afternoon is never ending.
|
||||||
|
Wednesday morning papers didn't come.
|
||||||
|
Thursday night your stockings needed mending.
|
||||||
|
See how they run.
|
||||||
|
|
||||||
|
Lady Madonna, children at your feet.
|
||||||
|
Wonder how you manage to make ends meet.
|
||||||
""",
|
""",
|
||||||
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
|
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
|
||||||
),
|
),
|
||||||
|
|
|
||||||
|
|
@ -101,28 +101,6 @@ class TestLyricsUtils:
|
||||||
|
|
||||||
assert list(actual_titles) == [title, *expected_extra_titles]
|
assert list(actual_titles) == [title, *expected_extra_titles]
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"initial_text, expected",
|
|
||||||
[
|
|
||||||
(
|
|
||||||
"""<!--lyrics below-->
|
|
||||||
one
|
|
||||||
<br class='myclass'>
|
|
||||||
two !
|
|
||||||
<br><br \\>
|
|
||||||
<blink>four</blink>""",
|
|
||||||
"<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
|
|
||||||
),
|
|
||||||
("foo<script>bar</script>baz", "foobaz"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
def test_scrape_strip_cruft(self, initial_text, expected):
|
|
||||||
assert lyrics._scrape_strip_cruft(initial_text) == expected
|
|
||||||
|
|
||||||
def test_scrape_merge_paragraphs(self):
|
|
||||||
text = "one</p> <p class='myclass'>two</p><p>three"
|
|
||||||
assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"text, expected",
|
"text, expected",
|
||||||
[
|
[
|
||||||
|
|
@ -142,6 +120,25 @@ class TestLyricsUtils:
|
||||||
assert lyrics.slug(text) == expected
|
assert lyrics.slug(text) == expected
|
||||||
|
|
||||||
|
|
||||||
|
class TestHtml:
|
||||||
|
def test_scrape_strip_cruft(self):
|
||||||
|
initial = """<!--lyrics below-->
|
||||||
|
one
|
||||||
|
<br class='myclass'>
|
||||||
|
two !
|
||||||
|
<br><br \\>
|
||||||
|
<blink>four</blink>"""
|
||||||
|
expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>"
|
||||||
|
|
||||||
|
assert lyrics.Html.normalize_space(initial) == expected
|
||||||
|
|
||||||
|
def test_scrape_merge_paragraphs(self):
|
||||||
|
text = "one</p> <p class='myclass'>two</p><p>three"
|
||||||
|
expected = "one\ntwo\n\nthree"
|
||||||
|
|
||||||
|
assert lyrics.Html.merge_paragraphs(text) == expected
|
||||||
|
|
||||||
|
|
||||||
class TestSearchBackend:
|
class TestSearchBackend:
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def backend(self, dist_thresh):
|
def backend(self, dist_thresh):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue