From a79a86d5d6cac200152b7e16f40a8b621233eaa8 Mon Sep 17 00:00:00 2001 From: Francesco Grillo Date: Tue, 23 Dec 2025 22:31:21 +0200 Subject: [PATCH] Fix lyrics Unicode corruption and escaped quotes in Genius plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem The lyrics plugin has two bugs that corrupt fetched lyrics: 1. **Unicode corruption**: Characters like `ò`, `è`, `à` are corrupted to `√≤`, `√®`, etc. 2. **Escaped quotes**: Quotes appear as `\"` instead of `"` in lyrics ## Root Causes ### Issue 1: MacRoman encoding misdetection - **Location**: `RequestHandler.fetch_text()` line 220 - **Cause**: Setting `r.encoding = None` forces requests to use `apparent_encoding` - **Problem**: For Genius.com (and others), requests incorrectly detects MacRoman instead of UTF-8 - **Result**: UTF-8 bytes `c3 b2` (ò) decoded as MacRoman produces "√≤" (U+221A U+2264) ### Issue 2: Incomplete JSON unescape - **Location**: `Genius.scrape()` line 576 - **Cause**: The `remove_backslash` regex doesn't handle all escape patterns in JSON - **Problem**: Genius embeds lyrics in JSON with patterns like `\\"` and `\\\\"` - **Result**: After BeautifulSoup processing, escaped quotes remain in final text ## Solution ### Fix 1: Trust server encoding, fallback to UTF-8 ```python # OLD: r.encoding = None # NEW: if not r.encoding: r.encoding = 'utf-8' ``` - Respects server's declared encoding (UTF-8 for Genius) - Falls back to UTF-8 if no encoding specified (safer than apparent_encoding) - Preserves original intent of handling misconfigured servers ### Fix 2: Iteratively clean escaped quotes ```python while '\\"' in lyrics: lyrics = lyrics.replace('\\"', '"') ``` - Handles variable escape levels (`\"`, `\\\"`, `\\\\\"`) - Minimal change - keeps original `remove_backslash` regex - Applied after BeautifulSoup to avoid interfering with HTML parsing ## Testing Tested with: - Caparezza - "Argenti Vive" (Italian, many accented characters) - 
WestsideGunn - "Heel Cena" (escaped quotes in lyrics) Before: ``` mi si par√≤ davanti \\"I got big moves\\" ``` After: ``` mi si parò davanti "I got big moves" ``` ## Impact - Fixes lyrics for all languages with non-ASCII characters - Fixes Genius lyrics with quotes - No breaking changes - maintains backward compatibility - Minimal code changes (13 lines total) --- beetsplug/lyrics.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index d6e14c175..4bb88cf49 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -200,7 +200,11 @@ class LyricsRequestHandler(RequestHandler): url = self.format_url(url, params) self.debug("Fetching HTML from {}", url) r = self.get(url, **kwargs) - r.encoding = None + """Trust server's encoding, + but default to UTF-8 if not specified + """ + if not r.encoding: + r.encoding = 'utf-8' return r.text def get_json(self, url: str, params: JSONDict | None = None, **kwargs): @@ -557,11 +561,14 @@ class Genius(SearchBackend): def scrape(cls, html: str) -> str | None: if m := cls.LYRICS_IN_JSON_RE.search(html): html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n") - return cls.get_soup(html_text).get_text().strip() + lyrics = cls.get_soup(html_text).get_text().strip() + # Clean up any remaining escaped quotes (may need multiple passes) + while '\\"' in lyrics: + lyrics = lyrics.replace('\\"', '"') + return lyrics return None - class Tekstowo(SearchBackend): """Fetch lyrics from Tekstowo.pl."""