mirror of
https://github.com/beetbox/beets.git
synced 2026-01-04 15:03:22 +01:00
Merge pull request #3978 from wisp3rwind/handle_lyrics_errors
Always handle errors in the lyrics plugin
This commit is contained in:
commit
027474b86a
2 changed files with 62 additions and 39 deletions
|
|
@ -33,7 +33,8 @@ import six
|
|||
from six.moves import urllib
|
||||
|
||||
try:
|
||||
from bs4 import SoupStrainer, BeautifulSoup
|
||||
import bs4
|
||||
from bs4 import SoupStrainer
|
||||
HAS_BEAUTIFUL_SOUP = True
|
||||
except ImportError:
|
||||
HAS_BEAUTIFUL_SOUP = False
|
||||
|
|
@ -219,7 +220,20 @@ def slug(text):
|
|||
return re.sub(r'\W+', '-', unidecode(text).lower().strip()).strip('-')
|
||||
|
||||
|
||||
def try_parse_html(html, **kwargs):
    """Parse ``html`` with BeautifulSoup's ``html.parser`` backend.

    Any extra keyword arguments are forwarded to the ``BeautifulSoup``
    constructor. Returns the parsed soup, or None when BeautifulSoup is
    not available or the parser raises ``HTMLParseError``.
    """
    # Without bs4 there is nothing to parse with; behave like a failed parse.
    if not HAS_BEAUTIFUL_SOUP:
        return None

    try:
        return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
    except HTMLParseError:
        return None
|
||||
|
||||
|
||||
class Backend(object):
|
||||
REQUIRES_BS = False
|
||||
|
||||
def __init__(self, config, log):
|
||||
self._log = log
|
||||
|
||||
|
|
@ -257,6 +271,7 @@ class Backend(object):
|
|||
return r.text
|
||||
else:
|
||||
self._log.debug(u'failed to fetch: {0} ({1})', url, r.status_code)
|
||||
return None
|
||||
|
||||
def fetch(self, artist, title):
|
||||
raise NotImplementedError()
|
||||
|
|
@ -286,11 +301,11 @@ class MusiXmatch(Backend):
|
|||
|
||||
html = self.fetch_url(url)
|
||||
if not html:
|
||||
return
|
||||
return None
|
||||
if "We detected that your IP is blocked" in html:
|
||||
self._log.warning(u'we are blocked at MusixMatch: url %s failed'
|
||||
% url)
|
||||
return
|
||||
return None
|
||||
html_parts = html.split('<p class="mxm-lyrics__content')
|
||||
# Sometimes lyrics come in 2 or more parts
|
||||
lyrics_parts = []
|
||||
|
|
@ -302,10 +317,10 @@ class MusiXmatch(Backend):
|
|||
# missing songs. this seems to happen after being blocked
|
||||
# above, when filling in the CAPTCHA.
|
||||
if "Instant lyrics for all your music." in lyrics:
|
||||
return
|
||||
return None
|
||||
# sometimes there are non-existent lyrics with some content
|
||||
if 'Lyrics | Musixmatch' in lyrics:
|
||||
return
|
||||
return None
|
||||
return lyrics
|
||||
|
||||
|
||||
|
|
@ -316,6 +331,8 @@ class Genius(Backend):
|
|||
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
|
||||
"""
|
||||
|
||||
REQUIRES_BS = True
|
||||
|
||||
base_url = "https://api.genius.com"
|
||||
|
||||
def __init__(self, config, log):
|
||||
|
|
@ -343,11 +360,14 @@ class Genius(Backend):
|
|||
hit_artist = hit["result"]["primary_artist"]["name"]
|
||||
|
||||
if slug(hit_artist) == slug(artist):
|
||||
return self._scrape_lyrics_from_html(
|
||||
self.fetch_url(hit["result"]["url"]))
|
||||
html = self.fetch_url(hit["result"]["url"])
|
||||
if not html:
|
||||
return None
|
||||
return self._scrape_lyrics_from_html(html)
|
||||
|
||||
self._log.debug(u'Genius failed to find a matching artist for \'{0}\'',
|
||||
artist)
|
||||
return None
|
||||
|
||||
def _search(self, artist, title):
|
||||
"""Searches the genius api for a given artist and title
|
||||
|
|
@ -373,22 +393,24 @@ class Genius(Backend):
|
|||
def _scrape_lyrics_from_html(self, html):
|
||||
"""Scrape lyrics from a given genius.com html"""
|
||||
|
||||
html = BeautifulSoup(html, "html.parser")
|
||||
soup = try_parse_html(html)
|
||||
if not soup:
|
||||
return
|
||||
|
||||
# Remove script tags that they put in the middle of the lyrics.
|
||||
[h.extract() for h in html('script')]
|
||||
[h.extract() for h in soup('script')]
|
||||
|
||||
# Most of the time, the page contains a div with class="lyrics" where
|
||||
# all of the lyrics can be found already correctly formatted
|
||||
# Sometimes, though, it packages the lyrics into separate divs, most
|
||||
# likely for easier ad placement
|
||||
lyrics_div = html.find("div", class_="lyrics")
|
||||
lyrics_div = soup.find("div", class_="lyrics")
|
||||
if not lyrics_div:
|
||||
self._log.debug(u'Received unusual song page html')
|
||||
verse_div = html.find("div",
|
||||
verse_div = soup.find("div",
|
||||
class_=re.compile("Lyrics__Container"))
|
||||
if not verse_div:
|
||||
if html.find("div",
|
||||
if soup.find("div",
|
||||
class_=re.compile("LyricsPlaceholder__Message"),
|
||||
string="This song is an instrumental"):
|
||||
self._log.debug('Detected instrumental')
|
||||
|
|
@ -410,6 +432,7 @@ class Genius(Backend):
|
|||
|
||||
class Tekstowo(Backend):
|
||||
# Fetch lyrics from Tekstowo.pl.
|
||||
REQUIRES_BS = True
|
||||
|
||||
BASE_URL = 'http://www.tekstowo.pl'
|
||||
URL_PATTERN = BASE_URL + '/wyszukaj.html?search-title=%s&search-artist=%s'
|
||||
|
|
@ -417,27 +440,27 @@ class Tekstowo(Backend):
|
|||
def fetch(self, artist, title):
    """Look up lyrics for ``artist`` / ``title``.

    Fetches the search page, resolves the song page URL from it, then
    fetches that page and extracts the lyrics. Returns None as soon as
    any of these steps yields nothing.
    """
    results_html = self.fetch_url(self.build_url(title, artist))
    if results_html:
        song_url = self.parse_search_results(results_html)
        if song_url:
            page_html = self.fetch_url(song_url)
            if page_html:
                return self.extract_lyrics(page_html)

    # One of the fetch/parse steps came back empty.
    return None
|
||||
|
||||
def parse_search_results(self, html):
|
||||
if not HAS_BEAUTIFUL_SOUP:
|
||||
return None
|
||||
|
||||
html = _scrape_strip_cruft(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
try:
|
||||
html = BeautifulSoup(html, "html.parser")
|
||||
except HTMLParseError:
|
||||
soup = try_parse_html(html)
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
song_rows = html.find("div", class_="content"). \
|
||||
song_rows = soup.find("div", class_="content"). \
|
||||
find("div", class_="card"). \
|
||||
find_all("div", class_="box-przeboje")
|
||||
|
||||
|
|
@ -456,12 +479,11 @@ class Tekstowo(Backend):
|
|||
html = _scrape_strip_cruft(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
try:
|
||||
html = BeautifulSoup(html, "html.parser")
|
||||
except HTMLParseError:
|
||||
soup = try_parse_html(html)
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
return html.find("div", class_="song-text").get_text()
|
||||
return soup.find("div", class_="song-text").get_text()
|
||||
|
||||
|
||||
def remove_credits(text):
|
||||
|
|
@ -507,12 +529,6 @@ def scrape_lyrics_from_html(html):
|
|||
"""Scrape lyrics from a URL. If no lyrics can be found, return None
|
||||
instead.
|
||||
"""
|
||||
if not HAS_BEAUTIFUL_SOUP:
|
||||
return None
|
||||
|
||||
if not html:
|
||||
return None
|
||||
|
||||
def is_text_notcode(text):
|
||||
length = len(text)
|
||||
return (length > 20 and
|
||||
|
|
@ -522,10 +538,8 @@ def scrape_lyrics_from_html(html):
|
|||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
# extract all long text blocks that are not code
|
||||
try:
|
||||
soup = BeautifulSoup(html, "html.parser",
|
||||
parse_only=SoupStrainer(text=is_text_notcode))
|
||||
except HTMLParseError:
|
||||
soup = try_parse_html(html, parse_only=SoupStrainer(text=is_text_notcode))
|
||||
if not soup:
|
||||
return None
|
||||
|
||||
# Get the longest text element (if any).
|
||||
|
|
@ -539,6 +553,8 @@ def scrape_lyrics_from_html(html):
|
|||
class Google(Backend):
|
||||
"""Fetch lyrics from Google search results."""
|
||||
|
||||
REQUIRES_BS = True
|
||||
|
||||
def __init__(self, config, log):
|
||||
super(Google, self).__init__(config, log)
|
||||
self.api_key = config['google_API_key'].as_str()
|
||||
|
|
@ -645,6 +661,8 @@ class Google(Backend):
|
|||
title, artist):
|
||||
continue
|
||||
html = self.fetch_url(url_link)
|
||||
if not html:
|
||||
continue
|
||||
lyrics = scrape_lyrics_from_html(html)
|
||||
if not lyrics:
|
||||
continue
|
||||
|
|
@ -654,10 +672,11 @@ class Google(Backend):
|
|||
item['displayLink'])
|
||||
return lyrics
|
||||
|
||||
return None
|
||||
|
||||
|
||||
class LyricsPlugin(plugins.BeetsPlugin):
|
||||
SOURCES = ['google', 'musixmatch', 'genius', 'tekstowo']
|
||||
BS_SOURCES = ['google', 'genius', 'tekstowo']
|
||||
SOURCE_BACKENDS = {
|
||||
'google': Google,
|
||||
'musixmatch': MusiXmatch,
|
||||
|
|
@ -727,15 +746,17 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
for source in sources]
|
||||
|
||||
def sanitize_bs_sources(self, sources):
    """Return the entries of ``sources`` whose backend does not need bs4.

    ``sources`` is a list of backend names (keys of ``SOURCE_BACKENDS``).
    For every name whose backend class sets ``REQUIRES_BS``, a debug
    message pointing at the missing beautifulsoup4 module is logged and
    the name is left out of the result. The input list is not modified;
    a new list is returned, in the original order.

    Raises KeyError if a name is not a key of ``SOURCE_BACKENDS``.
    """
    enabled_sources = []
    for source in sources:
        # ``source`` is a name string, so the REQUIRES_BS flag must be read
        # from the backend class it maps to, not from the string itself.
        if self.SOURCE_BACKENDS[source].REQUIRES_BS:
            self._log.debug(u'To use the %s lyrics source, you must '
                            u'install the beautifulsoup4 module. See '
                            u'the documentation for further details.'
                            % source)
        else:
            enabled_sources.append(source)

    return enabled_sources
|
||||
|
||||
def get_bing_access_token(self):
|
||||
params = {
|
||||
|
|
|
|||
|
|
@ -380,7 +380,9 @@ Fixes:
|
|||
* Templates that use ``%ifdef`` now produce the expected behavior when used in
|
||||
conjunction with non-string fields from the :doc:`/plugins/types`.
|
||||
:bug:`3852`
|
||||
|
||||
* :doc:`/plugins/lyrics`: Fix crashes when a website could not be retrieved,
|
||||
affecting at least the Genius source.
|
||||
:bug:`3970`
|
||||
|
||||
For plugin developers:
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue