From 8ef7837d2289140f0800f8a12c2e63949eec1a63 Mon Sep 17 00:00:00 2001
From: Fabrice Laporte <kraymer@gmail.com>
Date: Wed, 24 Sep 2014 16:20:55 +0200
Subject: [PATCH] merge strip_cruft() and _scrape_normalize_eol() into
 _scrape_strip_cruft

---
 beetsplug/lyrics.py    | 50 ++++++++++++++++--------------------------
 test/lyrics_sources.py |  2 +-
 test/test_lyrics.py    | 22 ++++++++++++-------
 3 files changed, 34 insertions(+), 40 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 1e4fd704d..cafe94a77 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -113,27 +113,7 @@ def extract_text(html, starttag):
         print('no closing tag found!')
         return
     lyrics = ''.join(parts)
-    return strip_cruft(lyrics)
-
-
-def strip_cruft(lyrics, wscollapse=True):
-    """Clean up HTML from an extracted lyrics string. For example, <BR>
-    tags are replaced with newlines.
-    """
-    lyrics = COMMENT_RE.sub('', lyrics)
-    lyrics = unescape(lyrics)
-    if wscollapse:
-        lyrics = re.sub(r'\s+', ' ', lyrics)  # Whitespace collapse.
-
-    lyrics = re.sub(r'<(script).*?</\1>(?s)', '', lyrics)  # Strip script tags.
-    lyrics = BREAK_RE.sub('\n', lyrics)  # <BR> newlines.
-    lyrics = re.sub(r'\n +', '\n', lyrics)
-    lyrics = re.sub(r' +\n', '\n', lyrics)
-    lyrics = TAG_RE.sub('', lyrics)  # Strip remaining HTML tags.
-    lyrics = lyrics.replace('\r', '\n')
-    lyrics = lyrics.strip()
-    return lyrics
-
+    return _scrape_strip_cruft(lyrics, True)
 
 def search_pairs(item):
     """Yield a pairs of artists and titles to search for.
@@ -341,13 +321,23 @@ def is_lyrics(text, artist=None):
 
     return len(badTriggersOcc) < 2
 
-def _scrape_normalize_eol(html):
-    """Return html text where the only authorized eol marker is \n
+def _scrape_strip_cruft(html, plain_text_out=False):
+    """Clean up HTML: unescape, normalize EOLs and optionally strip tags.
     """
-    html.replace('\r','\n')
-    # Replace <br> without introducing superfluous newline in the output
-    BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
-    html = BREAK_RE.sub('\n', html)
+    html = unescape(html)
+
+    # Normalize EOL markers so that '\n' is the only one left.
+    html = html.replace('\r','\n')
+    html = re.sub(r' +', ' ', html)  # Collapse runs of spaces.
+    regex = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
+    html = regex.sub('\n', html)  # <br> and the whitespace around it become one '\n'
+   
+    if plain_text_out:  # Strip remaining HTML tags and comments
+        html = TAG_RE.sub('', html)  
+        html = COMMENT_RE.sub('', html)
+   
+    # Strip lines
+    html = '\n'.join([x.strip() for x in html.strip().split('\n')])
     return html
 
 def _scrape_merge_paragraphs(html):
@@ -417,8 +407,8 @@ def scrape_lyrics_from_html(html):
     """
     if not html:
         return None
-
-    html = _scrape_normalize_eol(html)
+       
+    html = _scrape_strip_cruft(html)
     html = _scrape_merge_paragraphs(html)
 
     soup = BeautifulSoup(html)
@@ -458,8 +448,6 @@ def fetch_google(artist, title):
             if not lyrics:
                 continue
 
-            lyrics = strip_cruft(lyrics, False)
-
             if is_lyrics(lyrics, artist):
                 log.debug(u'got lyrics from {0}'.format(item['displayLink']))
                 return lyrics
diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py
index e3e2c8b75..96658998d 100644
--- a/test/lyrics_sources.py
+++ b/test/lyrics_sources.py
@@ -154,7 +154,7 @@ class LyricsSourcesPluginTest(unittest.TestCase):
     def test_sources_ok(self):
         for s in self.sourcesOk:
             url = s['url'] + s['path']
-            log.info('Trying to scrape lyrics from {0}'.format(url))
+            log.info('Scraping lyrics from {0}'.format(url))
             res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
             self.assertTrue(lyrics.is_lyrics(res), url)
             self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
diff --git a/test/test_lyrics.py b/test/test_lyrics.py
index d240a3370..ce53d7465 100644
--- a/test/test_lyrics.py
+++ b/test/test_lyrics.py
@@ -119,14 +119,6 @@ class LyricsPluginTest(unittest.TestCase):
                   if the beat aint crackin"""
         self.assertEqual(lyrics.remove_credits(text), text)
 
-    def test_strip_cruft(self):
-        text = """<!--lyrics below-->
-                  <script type="javascript">
-                  &nbsp;  One<BR>\r\n
-                  <blink>Two</blink>
-               """
-        self.assertEqual(lyrics.strip_cruft(text), u"One\nTwo")
-
     def test_is_lyrics(self):
         texts = ['LyricsMania.com - Copyright (c) 2013 - All Rights Reserved']
         texts += ["""All material found on this site is property\n
@@ -138,6 +130,20 @@ class LyricsPluginTest(unittest.TestCase):
         text = u"http://site.com/çafe-au_lait(boisson)"
         self.assertEqual(lyrics.slugify(text), 'http://site.com/cafe_au_lait')
 
+    def test_scrape_strip_cruft(self):
+        text = u"""<!--lyrics below-->
+                  &nbsp;one
+                  <br class='myclass'>
+                  two  !
+                  <br><br \>
+                  <blink>four</blink>"""
+        self.assertEqual(lyrics._scrape_strip_cruft(text, True),
+                         "one\ntwo !\n\nfour")
+
+    def test_scrape_merge_paragraphs(self):
+        text = u"one</p>   <p class='myclass'>two</p><p>three"
+        self.assertEqual(lyrics._scrape_merge_paragraphs(text),
+                         "one\ntwo\nthree")
 
 def suite():
     return unittest.TestLoader().loadTestsFromName(__name__)