mirror of
https://github.com/beetbox/beets.git
synced 2025-12-06 08:39:17 +01:00
Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacoccinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to None, allowing `requests` to handle it.
This commit is contained in:
parent
c5c4138d66
commit
70554640e5
3 changed files with 127 additions and 61 deletions
|
|
@ -57,7 +57,6 @@ try:
|
|||
except ImportError:
|
||||
HAS_LANGDETECT = False
|
||||
|
||||
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
|
||||
USER_AGENT = f"beets/{beets.__version__}"
|
||||
INSTRUMENTAL_LYRICS = "[Instrumental]"
|
||||
|
||||
|
|
@ -231,10 +230,16 @@ class RequestHandler:
|
|||
def fetch_text(
|
||||
self, url: str, params: JSONDict | None = None, **kwargs
|
||||
) -> str:
|
||||
"""Return text / HTML data from the given URL."""
|
||||
"""Return text / HTML data from the given URL.
|
||||
|
||||
Set the encoding to None to let requests handle it because some sites
|
||||
set it incorrectly.
|
||||
"""
|
||||
url = self.format_url(url, params)
|
||||
self.debug("Fetching HTML from {}", url)
|
||||
return r_session.get(url, **kwargs).text
|
||||
r = r_session.get(url, **kwargs)
|
||||
r.encoding = None
|
||||
return r.text
|
||||
|
||||
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
|
||||
"""Return JSON data from the given URL."""
|
||||
|
|
@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend):
|
|||
return lyrics
|
||||
|
||||
|
||||
class Html:
|
||||
collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
|
||||
expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
|
||||
#: two newlines between paragraphs on the same line (musica.com, letras.mus.br)
|
||||
merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
|
||||
#: a single new line between paragraphs on separate lines
|
||||
#: (paroles.net, sweetslyrics.com, lacoccinelle.net)
|
||||
merge_lines = partial(re.compile(r"</p>\s+<p[^>]*>(?!___)").sub, "\n")
|
||||
#: remove empty divs (lacoccinelle.net)
|
||||
remove_empty_divs = partial(re.compile(r"<div[^>]*>\s*</div>").sub, "")
|
||||
#: remove Google Ads tags (musica.com)
|
||||
remove_aside = partial(re.compile("<aside .+?</aside>").sub, "")
|
||||
#: remove adslot-Content_1 div from the lyrics text (paroles.net)
|
||||
remove_adslot = partial(
|
||||
re.compile(r"\n</div>[^\n]+-- Content_\d+ --.*?\n<div>", re.S).sub,
|
||||
"\n",
|
||||
)
|
||||
#: remove text formatting (azlyrics.com, lacoccinelle.net)
|
||||
remove_formatting = partial(
|
||||
re.compile(r" *</?(i|em|pre|strong)[^>]*>").sub, ""
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def normalize_space(cls, text: str) -> str:
|
||||
text = unescape(text).replace("\r", "").replace("\xa0", " ")
|
||||
return cls.collapse_space(cls.expand_br(text))
|
||||
|
||||
@classmethod
|
||||
def remove_ads(cls, text: str) -> str:
|
||||
return cls.remove_adslot(cls.remove_aside(text))
|
||||
|
||||
@classmethod
|
||||
def merge_paragraphs(cls, text: str) -> str:
|
||||
return cls.merge_blocks(cls.merge_lines(cls.remove_empty_divs(text)))
|
||||
|
||||
|
||||
class SoupMixin:
|
||||
@classmethod
|
||||
def pre_process_html(cls, html: str) -> str:
|
||||
"""Pre-process the HTML content before scraping."""
|
||||
return Html.normalize_space(html)
|
||||
|
||||
@classmethod
|
||||
def get_soup(cls, html: str) -> BeautifulSoup:
|
||||
return BeautifulSoup(cls.pre_process_html(html), "html.parser")
|
||||
|
||||
|
||||
class SearchResult(NamedTuple):
|
||||
artist: str
|
||||
title: str
|
||||
url: str
|
||||
|
||||
|
||||
class SearchBackend(Backend):
|
||||
class SearchBackend(SoupMixin, Backend):
|
||||
REQUIRES_BS = True
|
||||
|
||||
@cached_property
|
||||
|
|
@ -534,12 +586,12 @@ class Genius(SearchBackend):
|
|||
def scrape(cls, html: str) -> str | None:
|
||||
if m := cls.LYRICS_IN_JSON_RE.search(html):
|
||||
html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
|
||||
return get_soup(html_text).get_text().strip()
|
||||
return cls.get_soup(html_text).get_text().strip()
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class Tekstowo(DirectBackend):
|
||||
class Tekstowo(SoupMixin, DirectBackend):
|
||||
"""Fetch lyrics from Tekstowo.pl."""
|
||||
|
||||
REQUIRES_BS = True
|
||||
|
|
@ -561,7 +613,7 @@ class Tekstowo(DirectBackend):
|
|||
|
||||
@classmethod
|
||||
def scrape(cls, html: str) -> str | None:
|
||||
soup = get_soup(html)
|
||||
soup = cls.get_soup(html)
|
||||
|
||||
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
|
||||
return lyrics_div.get_text()
|
||||
|
|
@ -569,37 +621,6 @@ class Tekstowo(DirectBackend):
|
|||
return None
|
||||
|
||||
|
||||
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
|
||||
|
||||
|
||||
def _scrape_strip_cruft(html: str) -> str:
|
||||
"""Clean up HTML"""
|
||||
html = unescape(html)
|
||||
|
||||
html = html.replace("\r", "\n") # Normalize EOL.
|
||||
html = re.sub(r" +", " ", html) # Whitespaces collapse.
|
||||
html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
|
||||
html = re.sub(r"(?s)<(script).*?</\1>", "", html) # Strip script tags.
|
||||
html = re.sub("\u2005", " ", html) # replace unicode with regular space
|
||||
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
|
||||
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
|
||||
|
||||
html = "\n".join([x.strip() for x in html.strip().split("\n")])
|
||||
return collapse_newlines(html)
|
||||
|
||||
|
||||
def _scrape_merge_paragraphs(html):
|
||||
html = re.sub(r"</p>\s*<p(\s*[^>]*)>", "\n", html)
|
||||
return re.sub(r"<div .*>\s*</div>", "\n", html)
|
||||
|
||||
|
||||
def get_soup(html: str) -> BeautifulSoup:
|
||||
html = _scrape_strip_cruft(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
return BeautifulSoup(html, "html.parser")
|
||||
|
||||
|
||||
class Google(SearchBackend):
|
||||
"""Fetch lyrics from Google search results."""
|
||||
|
||||
|
|
@ -635,6 +656,12 @@ class Google(SearchBackend):
|
|||
#: Split cleaned up URL title into artist and title parts.
|
||||
URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
|
||||
|
||||
@classmethod
|
||||
def pre_process_html(cls, html: str) -> str:
|
||||
"""Pre-process the HTML content before scraping."""
|
||||
html = Html.remove_ads(super().pre_process_html(html))
|
||||
return Html.remove_formatting(Html.merge_paragraphs(html))
|
||||
|
||||
def fetch_text(self, *args, **kwargs) -> str:
|
||||
"""Handle an error so that we can continue with the next URL."""
|
||||
with self.handle_request():
|
||||
|
|
@ -700,7 +727,7 @@ class Google(SearchBackend):
|
|||
@classmethod
|
||||
def scrape(cls, html: str) -> str | None:
|
||||
# Get the longest text element (if any).
|
||||
if strings := sorted(get_soup(html).stripped_strings, key=len):
|
||||
if strings := sorted(cls.get_soup(html).stripped_strings, key=len):
|
||||
return strings[-1]
|
||||
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -223,6 +223,20 @@ lyrics_pages = [
|
|||
Mademoiselle Madonna, couchée sur votre lit
|
||||
Listen to the music playing in your head.
|
||||
Vous écoutez la musique qui joue dans votre tête
|
||||
|
||||
Tuesday afternoon is never ending.
|
||||
Le mardi après-midi n'en finit pas
|
||||
Wednesday morning papers didn't come.
|
||||
Le mercredi matin les journaux ne sont pas arrivés
|
||||
Thursday night you stockings needed mending.
|
||||
Jeudi soir, vos bas avaient besoin d'être réparés
|
||||
See how they run.
|
||||
Regardez comme ils filent
|
||||
|
||||
Lady Madonna, children at your feet.
|
||||
Mademoiselle Madonna, les enfants à vos pieds
|
||||
Wonder how you manage to make ends meet.
|
||||
Je me demande comment vous vous débrouillez pour joindre les deux bouts
|
||||
""",
|
||||
url_title="Paroles et traduction The Beatles : Lady Madonna - paroles de chanson", # noqa: E501
|
||||
),
|
||||
|
|
@ -235,29 +249,35 @@ lyrics_pages = [
|
|||
Children at your feet
|
||||
Wonder how you manage
|
||||
To make ends meet
|
||||
|
||||
Who finds the money
|
||||
When you pay the rent?
|
||||
Did you think that money
|
||||
Was Heaven sent?
|
||||
|
||||
Friday night arrives without a suitcase
|
||||
Sunday morning creeping like a nun
|
||||
Monday's child has learned
|
||||
To tie his bootlace
|
||||
See how they run
|
||||
|
||||
Lady Madonna
|
||||
Baby at your breast
|
||||
Wonders how you manage
|
||||
To feed the rest
|
||||
See how they run
|
||||
|
||||
Lady Madonna
|
||||
Lying on the bed
|
||||
Listen to the music
|
||||
Playing in your head
|
||||
|
||||
Tuesday afternoon is neverending
|
||||
Wednesday morning papers didn't come
|
||||
Thursday night your stockings
|
||||
Needed mending
|
||||
See how they run
|
||||
|
||||
Lady Madonna
|
||||
Children at your feet
|
||||
Wonder how you manage
|
||||
|
|
@ -415,15 +435,29 @@ lyrics_pages = [
|
|||
LyricsPage.make(
|
||||
"https://www.musica.com/letras.asp?letra=59862",
|
||||
"""
|
||||
Lady Madonna, children at your feet
|
||||
Wonder how you manage to make ends meet
|
||||
Who finds the money when you pay the rent?
|
||||
Did you think that money was heaven sent?
|
||||
|
||||
Friday night arrives without a suitcase
|
||||
Sunday morning creeping like a nun
|
||||
Monday's child has learned to tie his bootlace
|
||||
See how they run
|
||||
|
||||
Lady Madonna, baby at your breast
|
||||
Wonders how you manage to feed the rest
|
||||
|
||||
See how they run
|
||||
|
||||
Lady Madonna lying on the bed
|
||||
Listen to the music playing in your head
|
||||
|
||||
Tuesday afternoon is never ending
|
||||
Wednesday morning papers didn't come
|
||||
Thursday night your stockings needed mending
|
||||
See how they run
|
||||
|
||||
Lady Madonna, children at your feet
|
||||
Wonder how you manage to make ends meet
|
||||
""",
|
||||
|
|
@ -448,6 +482,14 @@ lyrics_pages = [
|
|||
See how they run.
|
||||
Lady Madonna, lying on the bed,
|
||||
Listen to the music playing in your head.
|
||||
|
||||
Tuesday afternoon is never ending.
|
||||
Wednesday morning papers didn't come.
|
||||
Thursday night your stockings needed mending.
|
||||
See how they run.
|
||||
|
||||
Lady Madonna, children at your feet.
|
||||
Wonder how you manage to make ends meet.
|
||||
""",
|
||||
url_title="Paroles Lady Madonna par The Beatles - Lyrics - Paroles.net",
|
||||
),
|
||||
|
|
|
|||
|
|
@ -101,28 +101,6 @@ class TestLyricsUtils:
|
|||
|
||||
assert list(actual_titles) == [title, *expected_extra_titles]
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"initial_text, expected",
|
||||
[
|
||||
(
|
||||
"""<!--lyrics below-->
|
||||
one
|
||||
<br class='myclass'>
|
||||
two !
|
||||
<br><br \\>
|
||||
<blink>four</blink>""",
|
||||
"<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>",
|
||||
),
|
||||
("foo<script>bar</script>baz", "foobaz"),
|
||||
],
|
||||
)
|
||||
def test_scrape_strip_cruft(self, initial_text, expected):
|
||||
assert lyrics._scrape_strip_cruft(initial_text) == expected
|
||||
|
||||
def test_scrape_merge_paragraphs(self):
|
||||
text = "one</p> <p class='myclass'>two</p><p>three"
|
||||
assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"text, expected",
|
||||
[
|
||||
|
|
@ -142,6 +120,25 @@ class TestLyricsUtils:
|
|||
assert lyrics.slug(text) == expected
|
||||
|
||||
|
||||
class TestHtml:
|
||||
def test_scrape_strip_cruft(self):
|
||||
initial = """<!--lyrics below-->
|
||||
one
|
||||
<br class='myclass'>
|
||||
two !
|
||||
<br><br \\>
|
||||
<blink>four</blink>"""
|
||||
expected = "<!--lyrics below-->\none\ntwo !\n\n<blink>four</blink>"
|
||||
|
||||
assert lyrics.Html.normalize_space(initial) == expected
|
||||
|
||||
def test_scrape_merge_paragraphs(self):
|
||||
text = "one</p> <p class='myclass'>two</p><p>three"
|
||||
expected = "one\ntwo\n\nthree"
|
||||
|
||||
assert lyrics.Html.merge_paragraphs(text) == expected
|
||||
|
||||
|
||||
class TestSearchBackend:
|
||||
@pytest.fixture
|
||||
def backend(self, dist_thresh):
|
||||
|
|
|
|||
Loading…
Reference in a new issue