# This file is part of beets.
# Copyright 2016, Adrian Sampson.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.

"""Fetches, embeds, and displays lyrics."""

from __future__ import annotations

import difflib
import errno
import itertools
import json
import os.path
import re
import struct
import unicodedata
import warnings
from functools import partial
from typing import TYPE_CHECKING, ClassVar
from urllib.parse import quote, urlencode

import requests
from unidecode import unidecode

import beets
from beets import plugins, ui

if TYPE_CHECKING:
    from beets.importer import ImportTask
    from beets.library import Item

try:
    import bs4
    from bs4 import SoupStrainer

    HAS_BEAUTIFUL_SOUP = True
except ImportError:
    HAS_BEAUTIFUL_SOUP = False

try:
    import langdetect

    HAS_LANGDETECT = True
except ImportError:
    HAS_LANGDETECT = False

DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"

# The content for the base index.rst generated in ReST mode.
REST_INDEX_TEMPLATE = """Lyrics
======

* :ref:`Song index <genindex>`
* :ref:`search`

Artist index:

.. toctree::
   :maxdepth: 1
   :glob:

   artists/*
"""

# The content for the base conf.py generated.
REST_CONF_TEMPLATE = """# -*- coding: utf-8 -*-
master_doc = 'index'
project = 'Lyrics'
copyright = 'none'
author = 'Various Authors'
latex_documents = [
    (master_doc, 'Lyrics.tex', project, author, 'manual'),
]
epub_title = project
epub_author = author
epub_publisher = author
epub_copyright = copyright
epub_exclude_files = ['search.html']
epub_tocdepth = 1
epub_tocdup = False
"""


# Utilities.


def unichar(i):
    try:
        return chr(i)
    except ValueError:
        return struct.pack("i", i).decode("utf-32")


def unescape(text):
    """Resolve &#xxx; HTML entities (and some others)."""
    if isinstance(text, bytes):
        text = text.decode("utf-8", "ignore")
    out = text.replace("&nbsp;", " ")

    def replchar(m):
        num = m.group(1)
        return unichar(int(num))

    out = re.sub("&#(\\d+);", replchar, out)
    return out


def extract_text_between(html, start_marker, end_marker):
    try:
        _, html = html.split(start_marker, 1)
        html, _ = html.split(end_marker, 1)
    except ValueError:
        return ""
    return html
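# A quick sanity check for the two helpers above (inputs are hypothetical;
# kept as doctest-style comments so importing the module stays free of
# side effects):
#
#     >>> unescape("Tears &#38; Rain")
#     'Tears & Rain'
#     >>> extract_text_between('<p class="x">hello</p>', '">', "</p>")
#     'hello'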
""" alternatives = [string] for pattern in patterns: match = re.search(pattern, string, re.IGNORECASE) if match: alternatives.append(match.group(1)) return alternatives title, artist, artist_sort = ( item.title.strip(), item.artist.strip(), item.artist_sort.strip(), ) if not title or not artist: return () patterns = [ # Remove any featuring artists from the artists name rf"(.*?) {plugins.feat_tokens()}" ] artists = generate_alternatives(artist, patterns) # Use the artist_sort as fallback only if it differs from artist to avoid # repeated remote requests with the same search terms if artist_sort and artist.lower() != artist_sort.lower(): artists.append(artist_sort) patterns = [ # Remove a parenthesized suffix from a title string. Common # examples include (live), (remix), and (acoustic). r"(.+?)\s+[(].*[)]$", # Remove any featuring artists from the title r"(.*?) {}".format(plugins.feat_tokens(for_artist=False)), # Remove part of title after colon ':' for songs with subtitles r"(.+?)\s*:.*", ] titles = generate_alternatives(title, patterns) # Check for a dual song (e.g. Pink Floyd - Speak to Me / Breathe) # and each of them. multi_titles = [] for title in titles: multi_titles.append([title]) if "/" in title: multi_titles.append([x.strip() for x in title.split("/")]) return itertools.product(artists, multi_titles) def slug(text): """Make a URL-safe, human-readable version of the given text This will do the following: 1. decode unicode characters into ASCII 2. shift everything to lowercase 3. strip whitespace 4. replace other non-word characters with dashes 5. strip extra dashes This somewhat duplicates the :func:`Google.slugify` function but slugify is not as generic as this one, which can be reused elsewhere. """ return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-") if HAS_BEAUTIFUL_SOUP: def try_parse_html(html, **kwargs): return bs4.BeautifulSoup(html, "html.parser", **kwargs) else: def try_parse_html(html, **kwargs): return None class Backend: REQUIRES_BS = False def __init__(self, config, log): self._log = log self.config = config def fetch_url(self, url, **kwargs): """Retrieve the content at a given URL, or return None if the source is unreachable. """ try: # Disable the InsecureRequestWarning that comes from using # `verify=false`. 
class Backend:
    REQUIRES_BS = False

    def __init__(self, config, log):
        self._log = log
        self.config = config

    def fetch_url(self, url, **kwargs):
        """Retrieve the content at a given URL, or return None if the
        source is unreachable.
        """
        try:
            # Disable the InsecureRequestWarning that comes from using
            # `verify=false`.
            # https://github.com/kennethreitz/requests/issues/2214
            # We're not overly worried about the NSA MITMing our lyrics
            # scraper.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                r = requests.get(
                    url,
                    verify=False,
                    headers={
                        "User-Agent": USER_AGENT,
                    },
                    timeout=10,
                    **kwargs,
                )
        except requests.RequestException as exc:
            self._log.debug("lyrics request failed: {0}", exc)
            return
        if r.status_code == requests.codes.ok:
            return r.text
        else:
            self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
            return None

    def fetch(
        self, artist: str, title: str, album: str, length: int
    ) -> str | None:
        raise NotImplementedError


class LRCLib(Backend):
    base_url = "https://lrclib.net/api/get"

    def fetch(
        self, artist: str, title: str, album: str, length: int
    ) -> str | None:
        params: dict[str, str | int] = {
            "artist_name": artist,
            "track_name": title,
        }
        if album:
            params["album_name"] = album
        if length:
            params["duration"] = length
        try:
            response = requests.get(
                self.base_url,
                params=params,
                timeout=10,
            )
            data = response.json()
        except (
            requests.RequestException,
            json.decoder.JSONDecodeError,
        ) as exc:
            self._log.debug("LRCLib API request failed: {0}", exc)
            return None

        if self.config["synced"]:
            return data.get("syncedLyrics") or data.get("plainLyrics")

        return data.get("plainLyrics")


class DirectBackend(Backend):
    """A backend for fetching lyrics directly."""

    URL_TEMPLATE: ClassVar[str]  #: May include formatting placeholders

    @classmethod
    def encode(cls, text: str) -> str:
        """Encode the string for inclusion in a URL."""
        raise NotImplementedError

    @classmethod
    def build_url(cls, *args: str) -> str:
        return cls.URL_TEMPLATE.format(*map(cls.encode, args))


class MusiXmatch(DirectBackend):
    URL_TEMPLATE = "https://www.musixmatch.com/lyrics/{}/{}"

    REPLACEMENTS = {
        r"\s+": "-",
        "<": "Less_Than",
        ">": "Greater_Than",
        "#": "Number_",
        r"[\[\{]": "(",
        r"[\]\}]": ")",
    }

    @classmethod
    def encode(cls, text: str) -> str:
        for old, new in cls.REPLACEMENTS.items():
            text = re.sub(old, new, text)

        return quote(unidecode(text))

    def fetch(self, artist: str, title: str, *_) -> str | None:
        url = self.build_url(artist, title)

        html = self.fetch_url(url)
        if not html:
            return None
        if "We detected that your IP is blocked" in html:
            self._log.warning(
                "we are blocked at MusixMatch: url %s failed" % url
            )
            return None
        html_parts = html.split('<p class="mxm-lyrics__content')
        # Sometimes lyrics come in 2 or more parts.
        lyrics_parts = []
        for html_part in html_parts[1:]:
            lyrics_parts.append(extract_text_between(html_part, ">", "</p>"))
        lyrics = "\n".join(lyrics_parts)
        lyrics = lyrics.strip(',"').replace("\\n", "\n")
        # another odd case: sometimes only that string remains, for
        # missing songs. this seems to happen after being blocked
        # above, when filling in the CAPTCHA.
        if "Instant lyrics for all your music." in lyrics:
            return None
        # sometimes there are non-existent lyrics with some content
        if "Lyrics | Musixmatch" in lyrics:
            return None
        return lyrics
class Genius(Backend):
    """Fetch lyrics from Genius via genius-api.

    Simply adapted from
    bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
    """

    REQUIRES_BS = True

    base_url = "https://api.genius.com"

    def __init__(self, config, log):
        super().__init__(config, log)
        self.api_key = config["genius_api_key"].as_str()
        self.headers = {
            "Authorization": "Bearer %s" % self.api_key,
            "User-Agent": USER_AGENT,
        }

    def fetch(self, artist: str, title: str, *_) -> str | None:
        """Fetch lyrics from genius.com

        Because genius doesn't allow accessing lyrics via the api, we
        first query the api for a url matching our artist & title, then
        attempt to scrape that url for the lyrics.
        """
        json = self._search(artist, title)
        if not json:
            self._log.debug("Genius API request returned invalid JSON")
            return None

        # find a matching artist in the json
        for hit in json["response"]["hits"]:
            hit_artist = hit["result"]["primary_artist"]["name"]

            if slug(hit_artist) == slug(artist):
                html = self.fetch_url(hit["result"]["url"])
                if not html:
                    return None
                return self._scrape_lyrics_from_html(html)

        self._log.debug(
            "Genius failed to find a matching artist for '{0}'", artist
        )
        return None

    def _search(self, artist, title):
        """Search the Genius API for a given artist and title.

        https://docs.genius.com/#search-h2

        :returns: json response
        """
        search_url = self.base_url + "/search"
        data = {"q": title + " " + artist.lower()}
        try:
            response = requests.get(
                search_url,
                params=data,
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as exc:
            self._log.debug("Genius API request failed: {0}", exc)
            return None

        try:
            return response.json()
        except ValueError:
            return None

    def replace_br(self, lyrics_div):
        for br in lyrics_div.find_all("br"):
            br.replace_with("\n")

    def _scrape_lyrics_from_html(self, html):
        """Scrape lyrics from a given genius.com html"""
        soup = try_parse_html(html)
        if not soup:
            return

        # Remove script tags that they put in the middle of the lyrics.
        [h.extract() for h in soup("script")]

        # Most of the time, the page contains a div with
        # class="lyrics" where all of the lyrics can be found already
        # correctly formatted.
        # Sometimes, though, it packages the lyrics into separate divs,
        # most likely for easier ad placement.
        lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
        if not lyrics_divs:
            self._log.debug("Received unusual song page html")
            return self._try_extracting_lyrics_from_non_data_lyrics_container(
                soup
            )
        lyrics = ""
        for lyrics_div in lyrics_divs:
            self.replace_br(lyrics_div)
            lyrics += lyrics_div.get_text() + "\n\n"
        while lyrics[-1] == "\n":
            lyrics = lyrics[:-1]
        return lyrics

    def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
        """Extract lyrics from a div without attribute data-lyrics-container

        This is the second most common layout on genius.com
        """
        verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
        if not verse_div:
            if soup.find(
                "div",
                class_=re.compile("LyricsPlaceholder__Message"),
                string="This song is an instrumental",
            ):
                self._log.debug("Detected instrumental")
                return "[Instrumental]"
            else:
                self._log.debug("Couldn't scrape page using known layouts")
                return None

        lyrics_div = verse_div.parent
        self.replace_br(lyrics_div)

        ads = lyrics_div.find_all(
            "div", class_=re.compile("InreadAd__Container")
        )
        for ad in ads:
            ad.replace_with("\n")

        footers = lyrics_div.find_all(
            "div", class_=re.compile("Lyrics__Footer")
        )
        for footer in footers:
            footer.replace_with("")
        return lyrics_div.get_text()


class Tekstowo(DirectBackend):
    """Fetch lyrics from Tekstowo.pl."""

    REQUIRES_BS = True
    URL_TEMPLATE = "https://www.tekstowo.pl/piosenka,{},{}.html"

    non_alpha_to_underscore = partial(re.compile(r"\W").sub, "_")

    @classmethod
    def encode(cls, text: str) -> str:
        return cls.non_alpha_to_underscore(unidecode(text.lower()))

    def fetch(self, artist: str, title: str, *_) -> str | None:
        if html := self.fetch_url(self.build_url(artist, title)):
            return self.extract_lyrics(html)

        return None

    def extract_lyrics(self, html: str) -> str | None:
        html = _scrape_strip_cruft(html)
        html = _scrape_merge_paragraphs(html)

        soup = try_parse_html(html)

        if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
            return lyrics_div.get_text()

        return None


def remove_credits(text):
    """Remove first/last line of text if it contains the word 'lyrics'

    eg 'Lyrics by songsdatabase.com'
    """
    textlines = text.split("\n")
    credits = None
    for i in (0, -1):
        if textlines and "lyrics" in textlines[i].lower():
            credits = textlines.pop(i)
    if credits:
        text = "\n".join(textlines)
    return text
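# For instance (hypothetical input, doctest-style comment),
# `remove_credits` drops a trailing watermark line but leaves ordinary
# lyric lines untouched:
#
#     >>> remove_credits("So long\nAnd thanks\nLyrics by example.com")
#     'So long\nAnd thanks'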
def _scrape_strip_cruft(html, plain_text_out=False):
    """Clean up HTML"""
    html = unescape(html)

    html = html.replace("\r", "\n")  # Normalize EOL.
    html = re.sub(r" +", " ", html)  # Whitespaces collapse.
    html = BREAK_RE.sub("\n", html)  # <br> eats up surrounding '\n'.
    html = re.sub(r"(?s)<(script).*?</\1>", "", html)  # Strip script tags.
    html = re.sub("\u2005", " ", html)  # replace unicode with regular space
    html = re.sub("