diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 0977de363..08f90ddbf 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -113,6 +113,7 @@ def extract_text(html, starttag):
lyrics = ''.join(parts)
return _scrape_strip_cruft(lyrics, True)
+
def search_pairs(item):
"""Yield a pairs of artists and titles to search for.
@@ -297,7 +298,8 @@ def is_lyrics(text, artist=None):
badTriggersOcc = []
nbLines = text.count('\n')
if nbLines <= 1:
- log.debug(u"Ignoring too short lyrics '{0}'".format(text.decode('utf8')))
+ log.debug(u"Ignoring too short lyrics '{0}'".format(
+ text.decode('utf8')))
return 0
elif nbLines < 5:
badTriggersOcc.append('too_short')
@@ -319,53 +321,59 @@ def is_lyrics(text, artist=None):
return len(badTriggersOcc) < 2
+
def _scrape_strip_cruft(html, plain_text_out=False):
"""Clean up HTML
"""
html = unescape(html)
- # Normalize EOL
- html = html.replace('\r','\n')
+ # Normalize EOL
+ html = html.replace('\r', '\n')
html = re.sub(r' +', ' ', html) # Whitespaces collapse.
regex = re.compile(r'\n?\s*
]*)*>\s*\n?', re.I)
- html = regex.sub('\n', html) # When present,
eat up surrounding '\n'
-
- if plain_text_out: # Strip remaining HTML tags
- html = TAG_RE.sub('', html)
+ html = regex.sub('\n', html) # When present,
eat up surrounding '\n'
+
+ if plain_text_out: # Strip remaining HTML tags
+ html = TAG_RE.sub('', html)
html = COMMENT_RE.sub('', html)
-
+
# Strip lines
html = '\n'.join([x.strip() for x in html.strip().split('\n')])
return html
+
def _scrape_merge_paragraphs(html):
regex = re.compile(r'
]*)>') html = regex.sub('\n', html) return html + def scrape_lyrics_from_html(html): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ from bs4 import SoupStrainer, BeautifulSoup - def may_be_lyrics(string): - length = len(string) - return (length > 20 and - string.count(' ') > length/25 - and (string.find('=')==-1 or string.find(';')==1)) - if not html: return None - + + def is_text_notcode(string): + length = len(string) + return (length > 20 and + string.count(' ') > length / 25 + and (string.find('=') == -1 or string.find(';') == 1)) + html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - soup = BeautifulSoup(html, "html.parser", - parse_only=SoupStrainer(text=may_be_lyrics)) + + # extract all long text blocks that are not code + soup = BeautifulSoup(html, "html.parser", + parse_only=SoupStrainer(text=is_text_notcode)) soup = sorted(soup.stripped_strings, key=len)[-1] return soup + def fetch_google(artist, title): """Fetch lyrics from Google search results. """ diff --git a/test/test_lyrics.py b/test/test_lyrics.py index 00bea692a..7bc657bf1 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -20,7 +20,6 @@ import _common from _common import unittest from beetsplug import lyrics from beets.library import Item -from beets import config from beets.util import confit @@ -150,9 +149,6 @@ class LyricsPluginTest(unittest.TestCase): "one\ntwo\nthree") - - - LYRICS_TEXTS = confit.load_yaml(os.path.join(_common.RSRC, 'lyricstext.yaml')) definfo = dict(artist=u'The Beatles', title=u'Lady Madonna') # default query @@ -262,7 +258,7 @@ class LyricsGooglePluginTest(unittest.TestCase): __import__('bs4') except ImportError: self.skipTest('Beautiful Soup 4 not available') - + lyrics.LyricsPlugin() lyrics.fetch_url = MockFetchUrl()