diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 0977de363..08f90ddbf 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -113,6 +113,7 @@ def extract_text(html, starttag): lyrics = ''.join(parts) return _scrape_strip_cruft(lyrics, True) + def search_pairs(item): """Yield a pairs of artists and titles to search for. @@ -297,7 +298,8 @@ def is_lyrics(text, artist=None): badTriggersOcc = [] nbLines = text.count('\n') if nbLines <= 1: - log.debug(u"Ignoring too short lyrics '{0}'".format(text.decode('utf8'))) + log.debug(u"Ignoring too short lyrics '{0}'".format( + text.decode('utf8'))) return 0 elif nbLines < 5: badTriggersOcc.append('too_short') @@ -319,53 +321,59 @@ def is_lyrics(text, artist=None): return len(badTriggersOcc) < 2 + def _scrape_strip_cruft(html, plain_text_out=False): """Clean up HTML """ html = unescape(html) - # Normalize EOL - html = html.replace('\r','\n') + # Normalize EOL + html = html.replace('\r', '\n') html = re.sub(r' +', ' ', html) # Whitespaces collapse. regex = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I) - html = regex.sub('\n', html) # When present, <br> eat up surrounding '\n' - - if plain_text_out: # Strip remaining HTML tags - html = TAG_RE.sub('', html) + html = regex.sub('\n', html) # When present, <br> eat up surrounding '\n' + + if plain_text_out: # Strip remaining HTML tags + html = TAG_RE.sub('', html) html = COMMENT_RE.sub('', html) - + # Strip lines html = '\n'.join([x.strip() for x in html.strip().split('\n')]) return html + def _scrape_merge_paragraphs(html): regex = re.compile(r'</p>\s*<p(\s*[^>]*)>') html = regex.sub('\n', html) return html + def scrape_lyrics_from_html(html): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ from bs4 import SoupStrainer, BeautifulSoup - def may_be_lyrics(string): - length = len(string) - return (length > 20 and - string.count(' ') > length/25 - and (string.find('=')==-1 or string.find(';')==1)) - if not html: return None - + + def is_text_notcode(string): + length = len(string) + return (length > 20 and + string.count(' ') > length / 25 + and (string.find('=') == -1 or string.find(';') == 1)) + html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - soup = BeautifulSoup(html, "html.parser", - parse_only=SoupStrainer(text=may_be_lyrics)) + + # extract all long text blocks that are not code + soup = BeautifulSoup(html, "html.parser", + parse_only=SoupStrainer(text=is_text_notcode)) soup = sorted(soup.stripped_strings, key=len)[-1] return soup + def fetch_google(artist, title): """Fetch lyrics from Google search results. """ diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 00bea692a..7bc657bf1 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -20,7 +20,6 @@ import _common from _common import unittest from beetsplug import lyrics from beets.library import Item -from beets import config from beets.util import confit @@ -150,9 +149,6 @@ class LyricsPluginTest(unittest.TestCase): "one\ntwo\nthree") - - - LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml')) definfo = dict(artist=u'The Beatles', title=u'Lady Madonna') # default query @@ -262,7 +258,7 @@ class LyricsGooglePluginTest(unittest.TestCase): __import__('bs4') except ImportError: self.skipTest('Beautiful Soup 4 not available') - + lyrics.LyricsPlugin() lyrics.fetch_url = MockFetchUrl()