lyrics: remove empty divs before scraping

This may result in extra \n characters being inserted, which we strip later in _scrape_strip_cruft.
2026-02-12 18:31:48 +01:00 · 2014-12-30 23:37:23 +01:00 · 2014-12-30 23:37:23 +01:00 · d4d5c085fa
commit d4d5c085fa
parent fd94094c1b
2 changed files with 11 additions and 12 deletions
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@ -90,7 +90,7 @@ def extract_text_between(html, start_marker, end_marker):
        html, _ = html.split(end_marker, 1)
    except ValueError:
        return u''
-    return _scrape_strip_cruft(html, True)
+    return html


 def extract_text_in(html, starttag):
@ -124,8 +124,7 @@ def extract_text_in(html, starttag):
    else:
        print('no closing tag found!')
        return
-    lyrics = ''.join(parts)
-    return _scrape_strip_cruft(lyrics, True)
+    return u''.join(parts)


 def search_pairs(item):
@ -221,7 +220,7 @@ def fetch_lyricswiki(artist, title):
    if not html:
        return

-    lyrics = extract_text_in(html, "<div class='lyricbox'>")
+    lyrics = extract_text_in(html, u"<div class='lyricbox'>")
    if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
        return lyrics

@ -360,13 +359,14 @@ def _scrape_strip_cruft(html, plain_text_out=False):
        html = COMMENT_RE.sub('', html)
        html = TAG_RE.sub('', html)

-    # Strip lines
    html = '\n'.join([x.strip() for x in html.strip().split('\n')])
+    html = re.sub(r'\n{3,}', r'\n\n', html)
    return html


 def _scrape_merge_paragraphs(html):
-    return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
+    html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
+    return re.sub(r'<div .*>\s*</div>', '\n', html)


 def scrape_lyrics_from_html(html):
@ -541,4 +541,4 @@ class LyricsPlugin(plugins.BeetsPlugin):
            if lyrics:
                log.debug(u'got lyrics from backend: {0}'
                          .format(backend.__name__))
-                return lyrics.strip()
+                return _scrape_strip_cruft(lyrics, True)
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@ -319,12 +319,11 @@ class LyricsGooglePluginTest(unittest.TestCase):
        for (fun, s) in zip([lyrics.fetch_lyricswiki,
                             lyrics.fetch_lyricscom,
                             lyrics.fetch_musixmatch], DEFAULT_SOURCES):
-            if os.path.isfile(url_to_filename(
-                              s['url'] + s['path'])):
+            url = s['url'] + s['path']
+            if os.path.isfile(url_to_filename(url)):
                res = fun(s['artist'], s['title'])
-                self.assertTrue(lyrics.is_lyrics(res))
-                self.assertTrue(is_lyrics_content_ok(
-                                s['title'], res))
+                self.assertTrue(lyrics.is_lyrics(res), url)
+                self.assertTrue(is_lyrics_content_ok(s['title'], res), url)

    def test_is_page_candidate_exact_match(self):
        """Test matching html page title with song infos -- when song infos are