From a79a86d5d6cac200152b7e16f40a8b621233eaa8 Mon Sep 17 00:00:00 2001 From: Francesco Grillo Date: Tue, 23 Dec 2025 22:31:21 +0200 Subject: [PATCH 1/9] Fix lyrics Unicode corruption and escaped quotes in Genius plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The lyrics plugin has two bugs that corrupt fetched lyrics: 1. **Unicode corruption**: Characters like `ò`, `è`, `à` are corrupted to `√≤`, `√®`, etc. 2. **Escaped quotes**: Quotes appear as `\"` instead of `"` in lyrics ## Root Causes ### Issue 1: MacRoman encoding misdetection - **Location**: `LyricsRequestHandler.get_text()` line 203 - **Cause**: Setting `r.encoding = None` forces requests to use `apparent_encoding` - **Problem**: For Genius.com (and others), requests incorrectly detects MacRoman instead of UTF-8 - **Result**: UTF-8 bytes `c3 b2` (ò) decoded as MacRoman produces "√≤" (U+221A U+2264) ### Issue 2: Incomplete JSON unescape - **Location**: `Genius.scrape()` line 560 - **Cause**: The `remove_backslash` regex doesn't handle all escape patterns in JSON - **Problem**: Genius embeds lyrics in JSON with patterns like `\\"` and `\\\\"` - **Result**: After BeautifulSoup processing, escaped quotes remain in final text ## Solution ### Fix 1: Trust server encoding, fall back to UTF-8 ```python # OLD: r.encoding = None # NEW: if not r.encoding: r.encoding = 'utf-8' ``` - Respects server's declared encoding (UTF-8 for Genius) - Falls back to UTF-8 if no encoding specified (safer than apparent_encoding) - Preserves original intent of handling misconfigured servers ### Fix 2: Iteratively clean escaped quotes ```python while '\\"' in lyrics: lyrics = lyrics.replace('\\"', '"') ``` - Handles variable escape levels (`\"`, `\\\"`, `\\\\\"`) - Minimal change - keeps original `remove_backslash` regex - Applied after BeautifulSoup to avoid interfering with HTML parsing ## Testing Tested with: - Caparezza - "Argenti Vive" (Italian, many accented characters) - 
WestsideGunn - "Heel Cena" (escaped quotes in lyrics) Before: ``` mi si par√≤ davanti \\"I got big moves\\" ``` After: ``` mi si parò davanti "I got big moves" ``` ## Impact - Fixes lyrics for all languages with non-ASCII characters - Fixes Genius lyrics with quotes - No breaking changes - maintains backward compatibility - Minimal code changes (14 lines total) --- beetsplug/lyrics.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index d6e14c175..4bb88cf49 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -200,7 +200,11 @@ class LyricsRequestHandler(RequestHandler): url = self.format_url(url, params) self.debug("Fetching HTML from {}", url) r = self.get(url, **kwargs) - r.encoding = None + """Trust server's encoding, + but default to UTF-8 if not specified + """ + if not r.encoding: + r.encoding = 'utf-8' return r.text def get_json(self, url: str, params: JSONDict | None = None, **kwargs): @@ -557,11 +561,14 @@ class Genius(SearchBackend): def scrape(cls, html: str) -> str | None: if m := cls.LYRICS_IN_JSON_RE.search(html): html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") - return cls.get_soup(html_text).get_text().strip() + lyrics = cls.get_soup(html_text).get_text().strip() + # Clean up any remaining escaped quotes (may need multiple passes) + while '\\"' in lyrics: + lyrics = lyrics.replace('\\"', '"') + return lyrics return None - class Tekstowo(SearchBackend): """Fetch lyrics from Tekstowo.pl.""" From 1d494135b578eebbab56139359c6b15da9e3ea12 Mon Sep 17 00:00:00 2001 From: Francesco Grillo Date: Tue, 23 Dec 2025 22:43:28 +0200 Subject: [PATCH 2/9] Update lyrics.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Key Improvements (addressing reviewer feedback): - ✅ Uses regex instead of while loop (more efficient, one pass) - ✅ No infinite loop risk - ✅ Handles any number of backslashes before quotes - ✅ Clear inline 
comments --- beetsplug/lyrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 4bb88cf49..7761a642c 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -563,8 +563,8 @@ class Genius(SearchBackend): html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") lyrics = cls.get_soup(html_text).get_text().strip() # Clean up any remaining escaped quotes (may need multiple passes) - while '\\"' in lyrics: - lyrics = lyrics.replace('\\"', '"') + # Use regex to handle all escape levels in one pass + lyrics = re.sub(r'\\+"', '"', lyrics) return lyrics return None From b8311915a63c728c42230e30be8aa2457a462882 Mon Sep 17 00:00:00 2001 From: Francesco Grillo Date: Tue, 23 Dec 2025 22:47:33 +0200 Subject: [PATCH 3/9] Minor linting change --- beetsplug/lyrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 7761a642c..a62868d75 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -569,6 +569,7 @@ class Genius(SearchBackend): return None + class Tekstowo(SearchBackend): """Fetch lyrics from Tekstowo.pl.""" From 3f56d952da7772c8b8d7795c29df665fe07209d2 Mon Sep 17 00:00:00 2001 From: Francesco Grillo Date: Tue, 23 Dec 2025 22:48:27 +0200 Subject: [PATCH 4/9] Finalized linting changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the get_text method (line ~207): if not r.encoding: r.encoding = "utf-8" # ← Double quotes! In the scrape method, make sure there are 2 blank lines before the next class: return None class Tekstowo(SearchBackend): # ← Two blank lines above That should pass the formatting check! The repo's formatting check enforces double quotes (PEP 8 itself does not prescribe a quote style), and PEP 8 calls for 2 blank lines between top-level classes. 
--- beetsplug/lyrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index a62868d75..61bd7e1da 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -204,7 +204,7 @@ class LyricsRequestHandler(RequestHandler): but default to UTF-8 if not specified """ if not r.encoding: - r.encoding = 'utf-8' + r.encoding = "utf-8" return r.text def get_json(self, url: str, params: JSONDict | None = None, **kwargs): From 2c300fa190f4f3072e2f6003a6b426363a13eb55 Mon Sep 17 00:00:00 2001 From: midriasi Date: Fri, 26 Dec 2025 22:01:58 +0200 Subject: [PATCH 5/9] address pr feedback --- beetsplug/lyrics.py | 21 +++++++++++---------- test/plugins/lyrics_pages.py | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 61bd7e1da..19dd6df99 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -190,21 +190,19 @@ class LyricsRequestHandler(RequestHandler): return f"{url}?{urlencode(params)}" def get_text( - self, url: str, params: JSONDict | None = None, **kwargs + self, url: str, params: JSONDict | None = None, force_utf8: bool = False, **kwargs ) -> str: """Return text / HTML data from the given URL. - Set the encoding to None to let requests handle it because some sites - set it incorrectly. + By default, trust the server's encoding and requests' apparent_encoding + detection. When force_utf8=True, default to UTF-8 if server doesn't + specify encoding (avoids MacRoman misdetection on some sites like Genius). 
""" url = self.format_url(url, params) self.debug("Fetching HTML from {}", url) r = self.get(url, **kwargs) - """Trust server's encoding, - but default to UTF-8 if not specified - """ - if not r.encoding: - r.encoding = "utf-8" + if force_utf8: + r.encoding = r.encoding or "utf-8" return r.text def get_json(self, url: str, params: JSONDict | None = None, **kwargs): @@ -548,6 +546,10 @@ class Genius(SearchBackend): def headers(self) -> dict[str, str]: return {"Authorization": f"Bearer {self.config['genius_api_key']}"} + def get_text(self, *args, **kwargs) -> str: + """Force UTF-8 encoding for Genius to avoid MacRoman misdetection.""" + return super().get_text(*args, force_utf8=True, **kwargs) + def search(self, artist: str, title: str) -> Iterable[SearchResult]: search_data: GeniusAPI.Search = self.get_json( self.SEARCH_URL, @@ -562,8 +564,7 @@ class Genius(SearchBackend): if m := cls.LYRICS_IN_JSON_RE.search(html): html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") lyrics = cls.get_soup(html_text).get_text().strip() - # Clean up any remaining escaped quotes (may need multiple passes) - # Use regex to handle all escape levels in one pass + # Genius embeds lyrics in JSON; escape sequences remain after parsing lyrics = re.sub(r'\\+"', '"', lyrics) return lyrics diff --git a/test/plugins/lyrics_pages.py b/test/plugins/lyrics_pages.py index 15cb812a1..f5e2cd90d 100644 --- a/test/plugins/lyrics_pages.py +++ b/test/plugins/lyrics_pages.py @@ -576,4 +576,28 @@ lyrics_pages = [ """, marks=[pytest.mark.xfail(reason="Tekstowo seems to be broken again")], ), + LyricsPage.make( + "https://genius.com/Caparezza-argenti-vive-lyrics", + """ + Ciao Dante, ti ricordi di me? Sono Filippo Argenti + Il vicino di casa che nella Commedia ponesti tra questi violenti + Sono quello che annega nel fango, pestato dai demoni intorno + Cos'è, vuoi provocarmi, sommo? Puoi solo provocarmi sonno! 
+ """, + artist="Caparezza", + track_title="Argenti vive", + marks=[xfail_on_ci("Genius returns 403 FORBIDDEN in CI")], + ), + LyricsPage.make( + "https://genius.com/Arctic-monkeys-r-u-mine-lyrics", + """ + I go crazy 'cause here isn't where I wanna be + And satisfaction feels like a distant memory + And I can't help myself, all I + Wanna hear her say is "Are you mine?" + """, + artist="Arctic Monkeys", + track_title="R U Mine?", + marks=[xfail_on_ci("Genius returns 403 FORBIDDEN in CI")], + ), ] From 9941ffde44c81c7af2533f9e94e4c8521975f918 Mon Sep 17 00:00:00 2001 From: midriasi Date: Fri, 26 Dec 2025 22:09:14 +0200 Subject: [PATCH 6/9] fix formatting and type issues --- beetsplug/lyrics.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 19dd6df99..4fab98b27 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -190,7 +190,11 @@ class LyricsRequestHandler(RequestHandler): return f"{url}?{urlencode(params)}" def get_text( - self, url: str, params: JSONDict | None = None, force_utf8: bool = False, **kwargs + self, + url: str, + params: JSONDict | None = None, + force_utf8: bool = False, + **kwargs, ) -> str: """Return text / HTML data from the given URL. 
@@ -546,9 +550,9 @@ class Genius(SearchBackend): def headers(self) -> dict[str, str]: return {"Authorization": f"Bearer {self.config['genius_api_key']}"} - def get_text(self, *args, **kwargs) -> str: + def get_text(self, url: str, params: JSONDict | None = None, **kwargs) -> str: """Force UTF-8 encoding for Genius to avoid MacRoman misdetection.""" - return super().get_text(*args, force_utf8=True, **kwargs) + return super().get_text(url, params, force_utf8=True, **kwargs) def search(self, artist: str, title: str) -> Iterable[SearchResult]: search_data: GeniusAPI.Search = self.get_json( From f234686774ad4adbe2e96c63d61996df7ada32ed Mon Sep 17 00:00:00 2001 From: midriasi Date: Fri, 26 Dec 2025 22:11:16 +0200 Subject: [PATCH 7/9] fix type signature --- beetsplug/lyrics.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 4fab98b27..c00258415 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -550,9 +550,15 @@ class Genius(SearchBackend): def headers(self) -> dict[str, str]: return {"Authorization": f"Bearer {self.config['genius_api_key']}"} - def get_text(self, url: str, params: JSONDict | None = None, **kwargs) -> str: + def get_text( + self, + url: str, + params: JSONDict | None = None, + force_utf8: bool = True, + **kwargs, + ) -> str: """Force UTF-8 encoding for Genius to avoid MacRoman misdetection.""" - return super().get_text(url, params, force_utf8=True, **kwargs) + return super().get_text(url, params, force_utf8=force_utf8, **kwargs) def search(self, artist: str, title: str) -> Iterable[SearchResult]: search_data: GeniusAPI.Search = self.get_json( From 8088797a555ae60f7ca34d46602009ab548452ae Mon Sep 17 00:00:00 2001 From: midriasi Date: Wed, 31 Dec 2025 10:33:22 +0200 Subject: [PATCH 8/9] add azlyrics test case --- test/plugins/lyrics_pages.py | 144 +++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/test/plugins/lyrics_pages.py 
b/test/plugins/lyrics_pages.py index f5e2cd90d..83f14e07b 100644 --- a/test/plugins/lyrics_pages.py +++ b/test/plugins/lyrics_pages.py @@ -600,4 +600,148 @@ lyrics_pages = [ track_title="R U Mine?", marks=[xfail_on_ci("Genius returns 403 FORBIDDEN in CI")], ), + LyricsPage.make( + "https://www.azlyrics.com/lyrics/derivakat/bountyhunter.html", + """ + [Romanized:] + + Living legend made of myths and made of stories + But I'm present, find the mark and I'll take the glory + Hit the dead-end, cornered like an animal, it just + Takes one second, I'm nothing if not practical + But I don't really care if you're right or you're wrong + Bùguǎn nǐ de wùhuì or you're just a disgrace + Get in the zone when I put on my song + Bìrán de xiànshí when I'm up in your face + Zhēn tài kě qíng kùn zài lóng lǐ de nǐ + But you'll find in the end that I'll leave you erased + (If they call?) I'm the one + (Got a job?) Get it done + All alone, you and me, I'll be leaving no trace + + No place where I won't go to + No limits I won't break through + No godless deed I won't do + If they got problems, then they know to + + Call the bounty hunter + Yeah, they got my number + I'm the problem solver + Leave you six feet under + You won't see another + Winter, spring or summer + Wǒ huì ràng nǐ xiāoshī + No way to recover (Hey!) + + Run it up + Night or day (Hey!) 
+ Hunt you down + Run away + + No mercy from this mercenary + Take away your sanctuary + Shìlì jiù xiàng 20/20 + Way I see you'll end up buried + Don't deviate + Zhǎodào nǐ so I terminate + Wúlùn, wǒ huì make you pay + I'll double tap you like bang, bang + + (Dead lilac) A ghost you only heard in rumor + (Dead lilac) But only if you heard me sooner + (Dead lilac) Losing your chances and losing your time + (Dead lilac) shìjiè wèilái méi nǐ de cúnzài (Yeah) + + No place where I won't go to (I won't go to) + No limits I won't break through (I won't break through) + No godless deed I won't do (That I won't do) + If they got problems, then they know to + + Call the bounty hunter (Hey) + Yeah, they got my number + I'm the problem solver (Hey) + Leave you six feet under + You won't see another (Hey) + Winter, spring or summer + Wǒ huì ràng nǐ xiāoshī (Hey) + No way to recover (Hey!) + + Run it up (Run it up) + Night or day (Hey!) (Day) + Hunt you down (Hunt you down) + Run away + + [English/Chinese:] + + Living legend made of myths and made of stories + But I'm present, find the mark and I'll take the glory + Hit the dead-end, cornered like an animal, it just + Takes one second, I'm nothing if not practical + But I don't really care if you're right or you're wrong + 不管你的误会 or you're just a disgrace + Get in the zone when I put on my song + 必然的现实 when I'm up in your face + 真太可情 困在笼里的你 + But you'll find in the end that I'll leave you erased + (If they call?) I'm the one + (Got a job?) Get it done + All alone, you and me, I'll be leaving no trace + + No place where I won't go to + No limits I won't break through + No godless deed I won't do + If they got problems, then they know to + + Call the bounty hunter + Yeah, they got my number + I'm the problem solver + Leave you six feet under + You won't see another + Winter, spring or summer + 我会让你消失 + No way to recover (Hey!) + + Run it up + Night or day (Hey!) 
+ Hunt you down + Run away + + No mercy from this mercenary + Take away your sanctuary + 视力就像 20/20 + Way I see you'll end up buried + Don't deviate + 找到你 so I terminate + 无论, 我会 make you pay + I'll double tap you like bang, bang + + (Dead lilac) A ghost you only heard in rumor + (Dead lilac) But only if you heard me sooner + (Dead lilac) Losing your chances and losing your time + (Dead lilac) 世界未来没你的存在 (Yeah) + + No place where I won't go to (I won't go to) + No limits I won't break through (I won't break through) + No godless deed I won't do (That I won't do) + If they got problems, then they know to + + Call the bounty hunter (Hey) + Yeah, they got my number + I'm the problem solver (Hey) + Leave you six feet under + You won't see another (Hey) + Winter, spring or summer + 我会让你消失 (Hey) + No way to recover (Hey!) + + Run it up (Run it up) + Night or day (Hey!) (Day) + Hunt you down (Hunt you down) + Run away + """, + artist="Derivakat", + track_title="Bounty Hunter", + url_title="Derivakat - Bounty Hunter Lyrics | AZLyrics.com", + marks=[xfail_on_ci("AZLyrics is blocked by Cloudflare")], + ), ] From 4cfb1e30bd6713b8d6984a0b010581e8f5ad4d3f Mon Sep 17 00:00:00 2001 From: midriasi Date: Wed, 31 Dec 2025 10:59:12 +0200 Subject: [PATCH 9/9] set encoding to None for non-genius backends --- beetsplug/lyrics.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index c00258415..cd73f021d 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -198,15 +198,16 @@ class LyricsRequestHandler(RequestHandler): ) -> str: """Return text / HTML data from the given URL. - By default, trust the server's encoding and requests' apparent_encoding - detection. When force_utf8=True, default to UTF-8 if server doesn't - specify encoding (avoids MacRoman misdetection on some sites like Genius). + Set encoding to None to let requests auto-detect (works for most sites). 
+ For Genius, skip detection: use the declared encoding, else UTF-8. """ url = self.format_url(url, params) self.debug("Fetching HTML from {}", url) r = self.get(url, **kwargs) if force_utf8: r.encoding = r.encoding or "utf-8" + else: + r.encoding = None return r.text def get_json(self, url: str, params: JSONDict | None = None, **kwargs):