From 8ef7837d2289140f0800f8a12c2e63949eec1a63 Mon Sep 17 00:00:00 2001 From: Fabrice Laporte Date: Wed, 24 Sep 2014 16:20:55 +0200 Subject: [PATCH] merge strip_cruft() and _scrape_normalize_eol() into _scrape_strip_cruft --- beetsplug/lyrics.py | 50 ++++++++++++++++-------------------------- test/lyrics_sources.py | 2 +- test/test_lyrics.py | 22 ++++++++++++------- 3 files changed, 34 insertions(+), 40 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 1e4fd704d..cafe94a77 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -113,27 +113,7 @@ def extract_text(html, starttag): print('no closing tag found!') return lyrics = ''.join(parts) - return strip_cruft(lyrics) - - -def strip_cruft(lyrics, wscollapse=True): - """Clean up HTML from an extracted lyrics string. For example,
<br> - tags are replaced with newlines. - """ - lyrics = COMMENT_RE.sub('', lyrics) - lyrics = unescape(lyrics) - if wscollapse: - lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse. - - lyrics = re.sub(r'<(script).*?</\1>(?s)', '', lyrics) # Strip script tags. - lyrics = BREAK_RE.sub('\n', lyrics) # <br>
newlines. - lyrics = re.sub(r'\n +', '\n', lyrics) - lyrics = re.sub(r' +\n', '\n', lyrics) - lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags. - lyrics = lyrics.replace('\r', '\n') - lyrics = lyrics.strip() - return lyrics - + return _scrape_strip_cruft(lyrics, True) def search_pairs(item): """Yield a pairs of artists and titles to search for. @@ -341,13 +321,23 @@ def is_lyrics(text, artist=None): return len(badTriggersOcc) < 2 -def _scrape_normalize_eol(html): - """Return html text where the only authorized eol marker is \n +def _scrape_strip_cruft(html, plain_text_out=False): + """Clean up HTML """ - html.replace('\r','\n') - # Replace
<br> without introducing superfluous newline in the output - BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I) - html = BREAK_RE.sub('\n', html) + html = unescape(html) + + # Normalize EOL + html = html.replace('\r','\n') + html = re.sub(r' +', ' ', html) # Whitespaces collapse. + regex = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I) + html = regex.sub('\n', html) # When present, <br>
eat up surrounding '\n' + + if plain_text_out: # Strip remaining HTML tags + html = TAG_RE.sub('', html) + html = COMMENT_RE.sub('', html) + + # Strip lines + html = '\n'.join([x.strip() for x in html.strip().split('\n')]) return html def _scrape_merge_paragraphs(html): @@ -417,8 +407,8 @@ def scrape_lyrics_from_html(html): """ if not html: return None - - html = _scrape_normalize_eol(html) + + html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) soup = BeautifulSoup(html) @@ -458,8 +448,6 @@ def fetch_google(artist, title): if not lyrics: continue - lyrics = strip_cruft(lyrics, False) - if is_lyrics(lyrics, artist): log.debug(u'got lyrics from {0}'.format(item['displayLink'])) return lyrics diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py index e3e2c8b75..96658998d 100644 --- a/test/lyrics_sources.py +++ b/test/lyrics_sources.py @@ -154,7 +154,7 @@ class LyricsSourcesPluginTest(unittest.TestCase): def test_sources_ok(self): for s in self.sourcesOk: url = s['url'] + s['path'] - log.info('Trying to scrape lyrics from {0}'.format(url)) + log.info('Scraping lyrics from {0}'.format(url)) res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url)) self.assertTrue(lyrics.is_lyrics(res), url) self.assertTrue(is_lyrics_content_ok(s['title'], res), url) diff --git a/test/test_lyrics.py b/test/test_lyrics.py index d240a3370..ce53d7465 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -119,14 +119,6 @@ class LyricsPluginTest(unittest.TestCase): if the beat aint crackin""" self.assertEqual(lyrics.remove_credits(text), text) - def test_strip_cruft(self): - text = """ -