diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 0982120f2..a19d8c616 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -57,7 +57,6 @@ try: except ImportError: HAS_LANGDETECT = False -BREAK_RE = re.compile(r"\n?\s*]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -231,10 +230,16 @@ class RequestHandler: def fetch_text( self, url: str, params: JSONDict | None = None, **kwargs ) -> str: - """Return text / HTML data from the given URL.""" + """Return text / HTML data from the given URL. + + Set the encoding to None to let requests handle it because some sites + set it incorrectly. + """ url = self.format_url(url, params) self.debug("Fetching HTML from {}", url) - return r_session.get(url, **kwargs).text + r = r_session.get(url, **kwargs) + r.encoding = None + return r.text def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs): """Return JSON data from the given URL.""" @@ -440,13 +445,60 @@ class MusiXmatch(DirectBackend): return lyrics +class Html: + collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1") + expand_br = partial(re.compile(r"\s*]*>\s*", re.I).sub, "\n") + #: two newlines between paragraphs on the same line (musica, letras.mus.br) + merge_blocks = partial(re.compile(r"(?)

]*>").sub, "\n\n") + #: a single new line between paragraphs on separate lines + #: (paroles.net, sweetslyrics.com, lacoccinelle.net) + merge_lines = partial(re.compile(r"

\s+]*>(?!___)").sub, "\n") + #: remove empty divs (lacoccinelle.net) + remove_empty_divs = partial(re.compile(r"]*>\s*").sub, "") + #: remove Google Ads tags (musica.com) + remove_aside = partial(re.compile("