lyrics: wrap BeautifulSoup() constructor to centralize error handling

Also ensure that the return value is checked for None at every call site.
wisp3rwind 2021-06-15 10:30:05 +02:00
parent b34442f5d1
commit 867d383544

@@ -33,7 +33,8 @@ import six
 from six.moves import urllib
 
 try:
-    from bs4 import SoupStrainer, BeautifulSoup
+    import bs4
+    from bs4 import SoupStrainer
     HAS_BEAUTIFUL_SOUP = True
 except ImportError:
     HAS_BEAUTIFUL_SOUP = False
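
For context, the import is split so that the module object itself is bound: the wrapper added below resolves bs4.BeautifulSoup through the module at call time, while SoupStrainer is still imported by name. A minimal standalone sketch of the same optional-dependency guard (parser_available is a hypothetical helper, not part of this commit):

    try:
        import bs4
        HAS_BEAUTIFUL_SOUP = True
    except ImportError:
        HAS_BEAUTIFUL_SOUP = False

    def parser_available():
        # Hypothetical helper: report whether the optional bs4
        # dependency could be imported.
        return HAS_BEAUTIFUL_SOUP
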
@@ -219,6 +220,17 @@ def slug(text):
     return re.sub(r'\W+', '-', unidecode(text).lower().strip()).strip('-')
 
 
+if HAS_BEAUTIFUL_SOUP:
+    def try_parse_html(html, **kwargs):
+        try:
+            return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
+        except HTMLParseError:
+            return None
+else:
+    def try_parse_html(html, **kwargs):
+        return None
+
+
 class Backend(object):
     def __init__(self, config, log):
         self._log = log
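
A minimal standalone sketch of the wrapper pattern introduced here, assuming an HTMLParseError compatibility shim modeled on the one this plugin defines near its imports (html.parser stopped raising HTMLParseError in Python 3.5, so a dummy class stands in on modern Pythons):

    import bs4

    try:
        from html.parser import HTMLParseError  # removed in Python 3.5
    except ImportError:
        class HTMLParseError(Exception):
            pass

    def try_parse_html(html, **kwargs):
        try:
            return bs4.BeautifulSoup(html, 'html.parser', **kwargs)
        except HTMLParseError:
            return None

    soup = try_parse_html('<p>hello</p>')
    if not soup:
        pass  # bail out, as every call site below now does
    else:
        print(soup.get_text())  # -> hello

Centralizing the except clause means a future change of parser or exception type touches one function instead of four call sites.
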
@@ -377,7 +389,9 @@ class Genius(Backend):
     def _scrape_lyrics_from_html(self, html):
         """Scrape lyrics from a given genius.com html"""
 
-        soup = BeautifulSoup(html, "html.parser")
+        soup = try_parse_html(html)
+        if not soup:
+            return
 
         # Remove script tags that they put in the middle of the lyrics.
         [h.extract() for h in soup('script')]
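
For reference, soup('script') is shorthand for soup.find_all('script'), and extract() detaches a tag from the parse tree. A small sketch of the idiom used above, with made-up markup:

    from bs4 import BeautifulSoup

    html = '<div>Verse one<script>var x = 1;</script>Verse two</div>'
    soup = BeautifulSoup(html, 'html.parser')

    # Detach every script tag so its contents don't leak into the lyrics.
    for tag in soup('script'):
        tag.extract()

    print(soup.get_text())  # -> Verse oneVerse two
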
@@ -440,9 +454,8 @@ class Tekstowo(Backend):
         html = _scrape_strip_cruft(html)
         html = _scrape_merge_paragraphs(html)
 
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-        except HTMLParseError:
+        soup = try_parse_html(html)
+        if not soup:
             return None
 
         song_rows = soup.find("div", class_="content"). \
@@ -464,9 +477,8 @@ class Tekstowo(Backend):
         html = _scrape_strip_cruft(html)
         html = _scrape_merge_paragraphs(html)
 
-        try:
-            soup = BeautifulSoup(html, "html.parser")
-        except HTMLParseError:
+        soup = try_parse_html(html)
+        if not soup:
             return None
 
         return soup.find("div", class_="song-text").get_text()
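
Note that find() returns the first matching tag, or None when nothing matches. A tiny sketch of the lookup this method performs, with made-up markup:

    from bs4 import BeautifulSoup

    html = '<div class="song-text">Line one</div>'
    soup = BeautifulSoup(html, 'html.parser')

    div = soup.find('div', class_='song-text')
    if div is not None:  # find() yields None on a miss
        print(div.get_text())  # -> Line one
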
@@ -527,10 +539,8 @@ def scrape_lyrics_from_html(html):
     html = _scrape_merge_paragraphs(html)
 
     # extract all long text blocks that are not code
-    try:
-        soup = BeautifulSoup(html, "html.parser",
-                             parse_only=SoupStrainer(text=is_text_notcode))
-    except HTMLParseError:
+    soup = try_parse_html(html, parse_only=SoupStrainer(text=is_text_notcode))
+    if not soup:
         return None
 
     # Get the longest text element (if any).
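
For reference, a parse_only strainer built on a text predicate keeps only the text nodes the predicate accepts, so the resulting soup holds candidate lyric blocks and nothing else. A small sketch with a simplified stand-in for is_text_notcode:

    from bs4 import BeautifulSoup, SoupStrainer

    def is_long_text(text):
        # Simplified stand-in for is_text_notcode: keep longer text runs.
        return len(text) > 10

    html = '<p>short</p><p>a much longer block of lyrics text</p>'
    soup = BeautifulSoup(html, 'html.parser',
                         parse_only=SoupStrainer(text=is_long_text))

    # Only the accepted text nodes survive the strained parse.
    for text in soup.find_all(text=True):
        print(text)  # -> a much longer block of lyrics text
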