Merge pull request #3978 from wisp3rwind/handle_lyrics_errors

Always handle errors in the lyrics plugin
This commit is contained in:
Benedikt 2021-06-18 17:17:32 +02:00 committed by GitHub
commit 027474b86a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 39 deletions

View file

@ -33,7 +33,8 @@ import six
from six.moves import urllib
try:
from bs4 import SoupStrainer, BeautifulSoup
import bs4
from bs4 import SoupStrainer
HAS_BEAUTIFUL_SOUP = True
except ImportError:
HAS_BEAUTIFUL_SOUP = False
@ -219,7 +220,20 @@ def slug(text):
return re.sub(r'\W+', '-', unidecode(text).lower().strip()).strip('-')
if HAS_BEAUTIFUL_SOUP:
def try_parse_html(html, **kwargs):
try:
return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
except HTMLParseError:
return None
else:
def try_parse_html(html, **kwargs):
return None
class Backend(object):
REQUIRES_BS = False
def __init__(self, config, log):
self._log = log
@ -257,6 +271,7 @@ class Backend(object):
return r.text
else:
self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
return None
def fetch(self, artist, title):
raise NotImplementedError()
@ -286,11 +301,11 @@ class MusiXmatch(Backend):
html = self.fetch_url(url)
if not html:
return
return None
if "We detected that your IP is blocked" in html:
self._log.warning(u'we are blocked at MusixMatch: url %s failed'
% url)
return
return None
html_parts = html.split('<p class="mxm-lyrics__content')
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
@ -302,10 +317,10 @@ class MusiXmatch(Backend):
# missing songs. this seems to happen after being blocked
# above, when filling in the CAPTCHA.
if "Instant lyrics for all your music." in lyrics:
return
return None
# sometimes there are non-existent lyrics with some content
if 'Lyrics | Musixmatch' in lyrics:
return
return None
return lyrics
@ -316,6 +331,8 @@ class Genius(Backend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
REQUIRES_BS = True
base_url = "https://api.genius.com"
def __init__(self, config, log):
@ -343,11 +360,14 @@ class Genius(Backend):
hit_artist = hit["result"]["primary_artist"]["name"]
if slug(hit_artist) == slug(artist):
return self._scrape_lyrics_from_html(
self.fetch_url(hit["result"]["url"]))
html = self.fetch_url(hit["result"]["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
artist)
return None
def _search(self, artist, title):
"""Searches the genius api for a given artist and title
@ -373,22 +393,24 @@ class Genius(Backend):
def _scrape_lyrics_from_html(self, html):
"""Scrape lyrics from a given genius.com html"""
html = BeautifulSoup(html, "html.parser")
soup = try_parse_html(html)
if not soup:
return
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in html('script')]
[h.extract() for h in soup('script')]
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
# Sometimes, though, it packages the lyrics into separate divs, most
# likely for easier ad placement
lyrics_div = html.find("div", class_="lyrics")
lyrics_div = soup.find("div", class_="lyrics")
if not lyrics_div:
self._log.debug(u'Received unusual song page html')
verse_div = html.find("div",
verse_div = soup.find("div",
class_=re.compile("Lyrics__Container"))
if not verse_div:
if html.find("div",
if soup.find("div",
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental"):
self._log.debug('Detected instrumental')
@ -410,6 +432,7 @@ class Genius(Backend):
class Tekstowo(Backend):
# Fetch lyrics from Tekstowo.pl.
REQUIRES_BS = True
BASE_URL = 'http://www.tekstowo.pl'
URL_PATTERN = BASE_URL + '/wyszukaj.html?search-title=%s&search-artist=%s'
@ -417,27 +440,27 @@ class Tekstowo(Backend):
def fetch(self, artist, title):
url = self.build_url(title, artist)
search_results = self.fetch_url(url)
if not search_results:
return None
song_page_url = self.parse_search_results(search_results)
if not song_page_url:
return None
song_page_html = self.fetch_url(song_page_url)
if not song_page_html:
return None
return self.extract_lyrics(song_page_html)
def parse_search_results(self, html):
if not HAS_BEAUTIFUL_SOUP:
return None
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
try:
html = BeautifulSoup(html, "html.parser")
except HTMLParseError:
soup = try_parse_html(html)
if not soup:
return None
song_rows = html.find("div", class_="content"). \
song_rows = soup.find("div", class_="content"). \
find("div", class_="card"). \
find_all("div", class_="box-przeboje")
@ -456,12 +479,11 @@ class Tekstowo(Backend):
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
try:
html = BeautifulSoup(html, "html.parser")
except HTMLParseError:
soup = try_parse_html(html)
if not soup:
return None
return html.find("div", class_="song-text").get_text()
return soup.find("div", class_="song-text").get_text()
def remove_credits(text):
@ -507,12 +529,6 @@ def scrape_lyrics_from_html(html):
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
if not HAS_BEAUTIFUL_SOUP:
return None
if not html:
return None
def is_text_notcode(text):
length = len(text)
return (length > 20 and
@ -522,10 +538,8 @@ def scrape_lyrics_from_html(html):
html = _scrape_merge_paragraphs(html)
# extract all long text blocks that are not code
try:
soup = BeautifulSoup(html, "html.parser",
parse_only=SoupStrainer(text=is_text_notcode))
except HTMLParseError:
soup = try_parse_html(html, parse_only=SoupStrainer(text=is_text_notcode))
if not soup:
return None
# Get the longest text element (if any).
@ -539,6 +553,8 @@ def scrape_lyrics_from_html(html):
class Google(Backend):
"""Fetch lyrics from Google search results."""
REQUIRES_BS = True
def __init__(self, config, log):
super(Google, self).__init__(config, log)
self.api_key = config['google_API_key'].as_str()
@ -645,6 +661,8 @@ class Google(Backend):
title, artist):
continue
html = self.fetch_url(url_link)
if not html:
continue
lyrics = scrape_lyrics_from_html(html)
if not lyrics:
continue
@ -654,10 +672,11 @@ class Google(Backend):
item['displayLink'])
return lyrics
return None
class LyricsPlugin(plugins.BeetsPlugin):
SOURCES = ['google', 'musixmatch', 'genius', 'tekstowo']
BS_SOURCES = ['google', 'genius', 'tekstowo']
SOURCE_BACKENDS = {
'google': Google,
'musixmatch': MusiXmatch,
@ -727,15 +746,17 @@ class LyricsPlugin(plugins.BeetsPlugin):
for source in sources]
def sanitize_bs_sources(self, sources):
for source in self.BS_SOURCES:
if source in sources:
enabled_sources = []
for source in sources:
if source.REQUIRES_BS:
self._log.debug(u'To use the %s lyrics source, you must '
u'install the beautifulsoup4 module. See '
u'the documentation for further details.'
% source)
sources.remove(source)
else:
enabled_sources.append(source)
return sources
return enabled_sources
def get_bing_access_token(self):
params = {

View file

@ -380,7 +380,9 @@ Fixes:
* Templates that use ``%ifdef`` now produce the expected behavior when used in
conjunction with non-string fields from the :doc:`/plugins/types`.
:bug:`3852`
* :doc:`/plugins/lyrics`: Fix crashes when a website could not be retrieved,
affecting at least the Genius source.
:bug:`3970`
For plugin developers: