From a6a83be434ba3b88a3ad1406152174597cc8d5f6 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte
]*)*>\s*\n?', re.I)
- html = regex.sub('\n', html) # When present,
eat up surrounding '\n'
-
- if plain_text_out: # Strip remaining HTML tags
- html = TAG_RE.sub('', html)
+ html = regex.sub('\n', html) # When present,
eat up surrounding '\n'
+
+ if plain_text_out: # Strip remaining HTML tags
+ html = TAG_RE.sub('', html)
html = COMMENT_RE.sub('', html)
-
+
# Strip lines
html = '\n'.join([x.strip() for x in html.strip().split('\n')])
return html
+
def _scrape_merge_paragraphs(html):
regex = re.compile(r'
]*)>') html = regex.sub('\n', html) return html + def scrape_lyrics_from_html(html): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ from bs4 import SoupStrainer, BeautifulSoup - def may_be_lyrics(string): - length = len(string) - return (length > 20 and - string.count(' ') > length/25 - and (string.find('=')==-1 or string.find(';')==1)) - if not html: return None - + + def is_text_notcode(string): + length = len(string) + return (length > 20 and + string.count(' ') > length / 25 + and (string.find('=') == -1 or string.find(';') == 1)) + html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - soup = BeautifulSoup(html, "html.parser", - parse_only=SoupStrainer(text=may_be_lyrics)) + + # extract all long text blocks that are not code + soup = BeautifulSoup(html, "html.parser", + parse_only=SoupStrainer(text=is_text_notcode)) soup = sorted(soup.stripped_strings, key=len)[-1] return soup + def fetch_google(artist, title): """Fetch lyrics from Google search results. """ diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 00bea692a..7bc657bf1 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -20,7 +20,6 @@ import _common from _common import unittest from beetsplug import lyrics from beets.library import Item -from beets import config from beets.util import confit @@ -150,9 +149,6 @@ class LyricsPluginTest(unittest.TestCase): "one\ntwo\nthree") - - - LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml')) definfo = dict(artist=u'The Beatles', title=u'Lady Madonna') # default query @@ -262,7 +258,7 @@ class LyricsGooglePluginTest(unittest.TestCase): __import__('bs4') except ImportError: self.skipTest('Beautiful Soup 4 not available') - + lyrics.LyricsPlugin() lyrics.fetch_url = MockFetchUrl()