From 76aa97827dbab08d636d3b027da641bbaa1cbb5e Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 10:10:22 +0200 Subject: [PATCH 1/7] lyrics: rename html -> soup for consistency --- beetsplug/lyrics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index f0290f74a..83b43ff22 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -373,22 +373,22 @@ class Genius(Backend): def _scrape_lyrics_from_html(self, html): """Scrape lyrics from a given genius.com html""" - html = BeautifulSoup(html, "html.parser") + soup = BeautifulSoup(html, "html.parser") # Remove script tags that they put in the middle of the lyrics. - [h.extract() for h in html('script')] + [h.extract() for h in soup('script')] # Most of the time, the page contains a div with class="lyrics" where # all of the lyrics can be found already correctly formatted # Sometimes, though, it packages the lyrics into separate divs, most # likely for easier ad placement - lyrics_div = html.find("div", class_="lyrics") + lyrics_div = soup.find("div", class_="lyrics") if not lyrics_div: self._log.debug(u'Received unusual song page html') - verse_div = html.find("div", + verse_div = soup.find("div", class_=re.compile("Lyrics__Container")) if not verse_div: - if html.find("div", + if soup.find("div", class_=re.compile("LyricsPlaceholder__Message"), string="This song is an instrumental"): self._log.debug('Detected instrumental') @@ -433,11 +433,11 @@ class Tekstowo(Backend): html = _scrape_merge_paragraphs(html) try: - html = BeautifulSoup(html, "html.parser") + soup = BeautifulSoup(html, "html.parser") except HTMLParseError: return None - song_rows = html.find("div", class_="content"). \ + song_rows = soup.find("div", class_="content"). \ find("div", class_="card"). \ find_all("div", class_="box-przeboje") @@ -457,11 +457,11 @@ class Tekstowo(Backend): html = _scrape_merge_paragraphs(html) try: - html = BeautifulSoup(html, "html.parser") + soup = BeautifulSoup(html, "html.parser") except HTMLParseError: return None - return html.find("div", class_="song-text").get_text() + return soup.find("div", class_="song-text").get_text() def remove_credits(text): From f8a4c661faa0e6b5201a0fd88c74cb7f574f52b2 Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 10:18:46 +0200 Subject: [PATCH 2/7] lyrics: return None explicitly --- beetsplug/lyrics.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 83b43ff22..9607a5cb7 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -257,6 +257,7 @@ class Backend(object): return r.text else: self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code) + return None def fetch(self, artist, title): raise NotImplementedError() @@ -286,11 +287,11 @@ class MusiXmatch(Backend): html = self.fetch_url(url) if not html: - return + return None if "We detected that your IP is blocked" in html: self._log.warning(u'we are blocked at MusixMatch: url %s failed' % url) - return + return None html_parts = html.split('

Date: Tue, 15 Jun 2021 10:24:03 +0200 Subject: [PATCH 3/7] lyrics: always check for fetch_url() returning None --- beetsplug/lyrics.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 9607a5cb7..a6dc0bd6a 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -344,8 +344,10 @@ class Genius(Backend): hit_artist = hit["result"]["primary_artist"]["name"] if slug(hit_artist) == slug(artist): - return self._scrape_lyrics_from_html( - self.fetch_url(hit["result"]["url"])) + html = self.fetch_url(hit["result"]["url"]) + if not html: + return None + return self._scrape_lyrics_from_html(html) self._log.debug(u'Genius failed to find a matching artist for \'{0}\'', artist) @@ -419,12 +421,16 @@ class Tekstowo(Backend): def fetch(self, artist, title): url = self.build_url(title, artist) search_results = self.fetch_url(url) + if not search_results: + return None song_page_url = self.parse_search_results(search_results) if not song_page_url: return None song_page_html = self.fetch_url(song_page_url) + if not song_page_html: + return None return self.extract_lyrics(song_page_html) def parse_search_results(self, html): @@ -512,9 +518,6 @@ def scrape_lyrics_from_html(html): if not HAS_BEAUTIFUL_SOUP: return None - if not html: - return None - def is_text_notcode(text): length = len(text) return (length > 20 and @@ -647,6 +650,8 @@ class Google(Backend): title, artist): continue html = self.fetch_url(url_link) + if not html: + continue lyrics = scrape_lyrics_from_html(html) if not lyrics: continue From 867d383544ba97391f81fbcc37481905dc3a7253 Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 10:30:05 +0200 Subject: [PATCH 4/7] lyrics: wrap BeautifulSoup() constructor to centralize error handling also ensure that the return value is always checked for None --- beetsplug/lyrics.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index a6dc0bd6a..127a94a07 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -33,7 +33,8 @@ import six from six.moves import urllib try: - from bs4 import SoupStrainer, BeautifulSoup + import bs4 + from bs4 import SoupStrainer HAS_BEAUTIFUL_SOUP = True except ImportError: HAS_BEAUTIFUL_SOUP = False @@ -219,6 +220,17 @@ def slug(text): return re.sub(r'\W+', '-', unidecode(text).lower().strip()).strip('-') +if HAS_BEAUTIFUL_SOUP: + def try_parse_html(html, **kwargs): + try: + return bs4.BeautifulSoup(html, 'html.parser', **kwargs) + except HTMLParseError: + return None +else: + def try_parse_html(html, **kwargs): + return None + + class Backend(object): def __init__(self, config, log): self._log = log @@ -377,7 +389,9 @@ class Genius(Backend): def _scrape_lyrics_from_html(self, html): """Scrape lyrics from a given genius.com html""" - soup = BeautifulSoup(html, "html.parser") + soup = try_parse_html(html) + if not soup: + return # Remove script tags that they put in the middle of the lyrics. [h.extract() for h in soup('script')] @@ -440,9 +454,8 @@ class Tekstowo(Backend): html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - try: - soup = BeautifulSoup(html, "html.parser") - except HTMLParseError: + soup = try_parse_html(html) + if not soup: return None song_rows = soup.find("div", class_="content"). \ @@ -464,9 +477,8 @@ class Tekstowo(Backend): html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - try: - soup = BeautifulSoup(html, "html.parser") - except HTMLParseError: + soup = try_parse_html(html) + if not soup: return None return soup.find("div", class_="song-text").get_text() @@ -527,10 +539,8 @@ def scrape_lyrics_from_html(html): html = _scrape_merge_paragraphs(html) # extract all long text blocks that are not code - try: - soup = BeautifulSoup(html, "html.parser", - parse_only=SoupStrainer(text=is_text_notcode)) - except HTMLParseError: + soup = try_parse_html(html, parse_only=SoupStrainer(text=is_text_notcode)) + if not soup: return None # Get the longest text element (if any). From 038cebfa9bed3c8463e84a83ced0a4ba408794c8 Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 10:34:59 +0200 Subject: [PATCH 5/7] lyrics: remove duplicate check for beautifulsoup The plugin already disables these sources entirely when beautifulsoup is not available --- beetsplug/lyrics.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 127a94a07..3a746bdd2 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -448,9 +448,6 @@ class Tekstowo(Backend): return self.extract_lyrics(song_page_html) def parse_search_results(self, html): - if not HAS_BEAUTIFUL_SOUP: - return None - html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) @@ -527,9 +524,6 @@ def scrape_lyrics_from_html(html): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ - if not HAS_BEAUTIFUL_SOUP: - return None - def is_text_notcode(text): length = len(text) return (length > 20 and From 316b79f72fd5c4f39e4d14b417c7bb5fd8864e4a Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 10:35:58 +0200 Subject: [PATCH 6/7] lyrics: slightly refactor beautifulsoup checks instead of having a global list of sources that require the package, indicate the dependency using an attribute of the source class --- beetsplug/lyrics.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 3a746bdd2..e899c0e86 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -232,6 +232,8 @@ else: class Backend(object): + REQUIRES_BS = False + def __init__(self, config, log): self._log = log @@ -329,6 +331,8 @@ class Genius(Backend): bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ """ + REQUIRES_BS = True + base_url = "https://api.genius.com" def __init__(self, config, log): @@ -428,6 +432,7 @@ class Genius(Backend): class Tekstowo(Backend): # Fetch lyrics from Tekstowo.pl. + REQUIRES_BS = True BASE_URL = 'http://www.tekstowo.pl' URL_PATTERN = BASE_URL + '/wyszukaj.html?search-title=%s&search-artist=%s' @@ -548,6 +553,8 @@ def scrape_lyrics_from_html(html): class Google(Backend): """Fetch lyrics from Google search results.""" + REQUIRES_BS = True + def __init__(self, config, log): super(Google, self).__init__(config, log) self.api_key = config['google_API_key'].as_str() @@ -670,7 +677,6 @@ class Google(Backend): class LyricsPlugin(plugins.BeetsPlugin): SOURCES = ['google', 'musixmatch', 'genius', 'tekstowo'] - BS_SOURCES = ['google', 'genius', 'tekstowo'] SOURCE_BACKENDS = { 'google': Google, 'musixmatch': MusiXmatch, @@ -740,15 +746,17 @@ class LyricsPlugin(plugins.BeetsPlugin): for source in sources] def sanitize_bs_sources(self, sources): - for source in self.BS_SOURCES: - if source in sources: + enabled_sources = [] + for source in sources: + if source.REQUIRES_BS: self._log.debug(u'To use the %s lyrics source, you must ' u'install the beautifulsoup4 module. See ' u'the documentation for further details.' % source) - sources.remove(source) + else: + enabled_sources.append(source) - return sources + return enabled_sources def get_bing_access_token(self): params = { From dfd834cf8f5fcd72266e87c1fdcf5d638dfb8fe4 Mon Sep 17 00:00:00 2001 From: wisp3rwind <17089248+wisp3rwind@users.noreply.github.com> Date: Tue, 15 Jun 2021 11:01:56 +0200 Subject: [PATCH 7/7] lyrics: update changelog --- docs/changelog.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 69e2f01a7..5ca9a8f9c 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -380,7 +380,9 @@ Fixes: * Templates that use ``%ifdef`` now produce the expected behavior when used in conjunction with non-string fields from the :doc:`/plugins/types`. :bug:`3852` - +* :doc:`/plugins/lyrics`: Fix crashes when a website could not be retrieved, + affecting at least the Genius source + :bug:`3970` For plugin developers: