diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index b805ffeef..ef5ca3ed0 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -24,6 +24,8 @@ import unicodedata
import difflib
import itertools
+from bs4 import BeautifulSoup, Comment
+
from beets.plugins import BeetsPlugin
from beets import ui
from beets import config
@@ -340,25 +342,31 @@ def is_lyrics(text, artist=None):
return len(badTriggersOcc) < 2
-def scrape_lyrics_from_url(url):
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
+def _scrape_normalize_eol(html):
+ """Return html text where only authorized eol marker is \n
"""
- from bs4 import BeautifulSoup, Comment
- html = fetch_url(url)
- if not html:
- return None
-
- soup = BeautifulSoup(html)
-
- for tag in soup.findAll('br'):
- tag.replaceWith('\n')
+ html.replace('\r','\n')
+ # Replace without introducing superfluous newline in the output
+ BREAK_RE = re.compile(r'\n?\s* \s*\n?', re.I)
+ html = BREAK_RE.sub('\n', html)
+ return html
+def _scrape_filter_soup(soup):
+ """Remove sections from soup that cannot be parents of lyrics section
+ """
# Remove non relevant html parts
[s.extract() for s in soup(['head', 'script'])]
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
[s.extract() for s in comments]
+ # Remove ads now as they can interrupt the lyrics block
+ ads = soup.find_all('div', class_=re.compile('ad'))
+ [s.extract() for s in ads]
+ return soup
+
+def _scrape_streamline_soup(soup):
+ """Transform soup into a succession of
blocks
+ """
try:
for tag in soup.findAll(True):
tag.name = 'p' # keep tag contents
@@ -379,20 +387,48 @@ def scrape_lyrics_from_url(url):
pTag = soup.new_tag("p")
bodyTag.parent.insert(0, pTag)
pTag.insert(0, bodyTag)
+ return soup
+def _scrape_longest_paragraph(soup):
+ """Return longest paragraph from soup
+ """
tagTokens = []
-
+
for tag in soup.findAll('p'):
soup2 = BeautifulSoup(str(tag))
# Extract all text of
section.
tagTokens += soup2.findAll(text=True)
if tagTokens:
- # Lyrics are expected to be the longest paragraph
tagTokens = sorted(tagTokens, key=len, reverse=True)
soup = BeautifulSoup(tagTokens[0])
return unescape(tagTokens[0].strip("\n\r: "))
+def _scrape_custom_process_soup(soup):
+ """Apply custom operations on soup to handle cases for specific websites
+ """
+ # metrolyrics.com: lyrics text is splitted into multiple
+ for match in soup.find_all('p', class_='verse'):
+ match.insert_before('\n')
+ match.unwrap()
+ return soup
+
+def scrape_lyrics_from_html(html):
+ """Scrape lyrics from a URL. If no lyrics can be found, return None
+ instead.
+ """
+ if not html:
+ return None
+
+ html = _scrape_normalize_eol(html)
+ soup = BeautifulSoup(html)
+ soup = _scrape_filter_soup(soup)
+ soup = _scrape_streamline_soup(soup)
+ soup = _scrape_custom_process_soup(soup)
+ # print(soup)
+ soup = _scrape_longest_paragraph(soup)
+
+ return soup
def fetch_google(artist, title):
"""Fetch lyrics from Google search results.
@@ -416,7 +452,9 @@ def fetch_google(artist, title):
urlTitle = item['title']
if not is_page_candidate(urlLink, urlTitle, title, artist):
continue
- lyrics = scrape_lyrics_from_url(urlLink)
+
+ html = fetch_url(urlLink)
+ lyrics = scrape_lyrics_from_html(html)
if not lyrics:
continue
diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py
index c9e89df3d..c3b185a27 100644
--- a/test/lyrics_sources.py
+++ b/test/lyrics_sources.py
@@ -64,36 +64,16 @@ def is_lyrics_content_ok(title, text):
return False
-class LyricsPluginTest(unittest.TestCase):
- def setUp(self):
- """Set up configuration"""
- lyrics.LyricsPlugin()
-
- def test_default_ok(self):
- """Test each lyrics engine with the default query"""
-
- lyrics.fetch_url = MockFetchUrl()
-
- for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom):
- res = f(definfo['artist'], definfo['title'])
- self.assertTrue(lyrics.is_lyrics(res))
- self.assertTrue(is_lyrics_content_ok(definfo['title'], res))
-
- def test_missing_lyrics(self):
- self.assertFalse(lyrics.is_lyrics(LYRICS_TEXTS['missing_texts']))
-
-
-class LyricsScrapingPluginTest(unittest.TestCase):
-
+class LyricsSourcesPluginTest(unittest.TestCase):
# Every source entered in default beets google custom search engine
# must be listed below.
# Use default query when possible, or override artist and title field
# if website don't have lyrics for default query.
sourcesOk = [
- dict(definfo, url='http://www.songlyrics.com',
- path=u'/the-beatles/lady-madonna-lyrics'),
- dict(definfo, url=u'http://www.elyricsworld.com',
- path=u'/lady_madonna_lyrics_beatles.html'),
+ # dict(definfo, url='http://www.songlyrics.com',
+ # path=u'/the-beatles/lady-madonna-lyrics'),
+ # dict(definfo, url=u'http://www.elyricsworld.com',
+ # path=u'/lady_madonna_lyrics_beatles.html'),
dict(artist=u'Beres Hammond', title=u'I could beat myself',
url=u'http://www.reggaelyrics.info',
path=u'/beres-hammond/i-could-beat-myself'),
@@ -111,61 +91,65 @@ class LyricsScrapingPluginTest(unittest.TestCase):
path=u'/lyrics/beatles/ladymadonna.html'),
dict(definfo, url=u'http://www.chartlyrics.com',
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
- ]
-
- # Websites that can't be scraped yet and whose results must be
- # flagged as invalid lyrics.
- sourcesFail = [
- dict(definfo, url=u'http://www.smartlyrics.com',
- path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
- dict(definfo, url='http://www.metrolyrics.com/',
- path='best-for-last-lyrics-adele.html'),
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
url=u'http://www.lyricsmania.com',
path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
dict(definfo, url=u'http://www.lyrics007.com',
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
+ dict(definfo, url=u'http://www.smartlyrics.com',
+ path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(definfo, url='http://www.releaselyrics.com',
- path=u'/e35f/the-beatles-lady-madonna'),
+ path=u'/e35f/the-beatles-lady-madonna'),
+ dict(definfo, url='http://www.metrolyrics.com/',
+ path='lady-madonna-lyrics-beatles.html'),
]
- # Websites that return truncated lyrics because of scraping issues, and
- # thus should not be included as sources to Google CSE.
- # They are good candidates for later inclusion after improvement
- # iterations of the scraping algorithm.
- sourcesIncomplete = [
- dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
- url=u'http://www.lacoccinelle.net',
- path=u'/paroles-officielles/550512.html'),
+ # Websites that can't be scraped yet.
+ # The reason why the scraping fail is indicated before each source dict.
+ sourcesFail = [
+
+
+ # Lyrics consist in multiple small
sections instead of a long one
+ #dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
+ # url=u'http://www.lacoccinelle.net',
+ # path=u'/paroles-officielles/550512.html'),
]
+ def setUp(self):
+ """Set up configuration"""
+ lyrics.LyricsPlugin()
+ lyrics.fetch_url = MockFetchUrl()
+
+ def test_default_ok(self):
+ """Test each lyrics engine with the default query"""
+
+ for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom):
+ res = f(definfo['artist'], definfo['title'])
+ self.assertTrue(lyrics.is_lyrics(res))
+ self.assertTrue(is_lyrics_content_ok(definfo['title'], res))
+
+ def test_missing_lyrics(self):
+ self.assertFalse(lyrics.is_lyrics(LYRICS_TEXTS['missing_texts']))
+
def test_sources_ok(self):
for s in self.sourcesOk:
url = s['url'] + s['path']
- res = lyrics.scrape_lyrics_from_url(url)
+ res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url))
self.assertTrue(lyrics.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
def test_sources_fail(self):
for s in self.sourcesFail:
- url = s['url'] + s['path']
- res = lyrics.scrape_lyrics_from_url(url)
- # very unlikely these sources pass if the scraping algo is not
- # tweaked on purpose for these cases
- self.assertFalse(lyrics.is_lyrics(res), "%s => %s" % (url, res))
-
- def test_sources_incomplete(self):
- for s in self.sourcesIncomplete:
- url = s['url'] + s['path']
- res = lyrics.scrape_lyrics_from_url(url)
-
- self.assertTrue(lyrics.is_lyrics(res))
- # these sources may pass if the html source evolve or after
- # a random improvement in the scraping algo: we want to
- # be noticed if it's the case.
- if is_lyrics_content_ok(s['title'], res):
- log.debug(u'Source {0} actually return valid lyrics!'
- .format(s['url']))
+ url = s['url'] + s['path']
+ html = lyrics.fetch_url(url)
+ res = lyrics.scrape_lyrics_from_html(html)
+ if lyrics.is_lyrics(res):
+ if is_lyrics_content_ok(s['title'], res):
+ log.info(u'{0} can be added to sources :\n{1}'
+ .format(s['url'], res))
+ else:
+ log.info(u'{0} return invalid lyrics:\n{1}'.
+ format(s['url'], res))
def test_is_page_candidate(self):
for s in self.sourcesOk:
diff --git a/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt b/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt
index 8dfb31053..841ad9891 100644
--- a/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt
+++ b/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt
@@ -1,155 +1,154 @@
-
+
+
+
+
+
+
-The Beatles - Lady Madonna Lyrics
-
-
-
+
+
+
-
-
-
-
-
+
+The Beatles Lady Madonna Lyrics | Lyrics007
+
+
+
+
-
-
Writer(s):LENNON, JOHN WINSTON / MCCARTNEY, PAUL JAMES Artist: The Beatles Lyrics
- Popularity: 9957 users have visited this page. Album: Track 22 on The Beatles Collection, Volume 5: Sgt. Pepper's Lonely Hearts Club Band
-
Rate:
-Lady Madonna gets avg. rating
-
-7.3
- out of 10
- based on 3 ratings. Rate the song now!!!
+Artist: The Beatles Lyrics
+ Popularity : 2898 users have visited this page.
+ Album: Track 22 on The Beatles Collection, Volume 5: Sgt. Pepper's Lonely Hearts Club Band
+ Recorded: 3 and 6 February 1968, EMI Studios, London
+ Writer(s): Lennon–McCartney
+ Genre: Rock and roll
+ Producer(s): George Martin
+ Length: 2:16
+ Certification: Platinum (RIAA) Format: 7" single
+ Label: Parlophone (UK), Capitol (US)
+ Released: 15 March 1968
+
Lady Madonna, children at your feet
+
+Wonder how you manage to make ends meet
+
+Who find the money when you pay the rent
+
+Did you think that money was heaven sent
+
-
-
-Lady Madonna, children at your feet
-Wonder how you manage to make ends meet
-Who find the money when you pay the rent
-Did you think that money was heaven sent
-
-Friday night arrives without a suitcase
-Sunday morning creeping like a nun
-Monday's child has learned to tie his bootlegs
-See how they run
-
-Lady Madonna, baby at your breast
-Wonders how you manage to feed the rest
-Pa pa pa pa,
-See how they run
-
-Lady Madonna lying on the bed
-Listen to the music playing in your head
-
-Tuesday afternoon is never ending
-Wednesday morning papers didn't come
-Thursday night you stocking needed mending
-See how they run
-
-Lady Madonna, children at your feet
-Wonder how you manage to make ends meet
-
-
-
-
- If you believe the lyrics are not correct you can Submit Corrections to us Add a comment and share what The Beatles Lady Madonna Lyrics means to you with your friends:
-
-Featured lyrics:
-
+
+
+
+See how they run
+
+
+
+Lady Madonna lying on the bed
+
+Listen to the music playing in your head
+
+
+
+Tuesday afternoon is never ending
+
+Wednesday morning papers didn't come
+
+Thursday night your stocking needed mending
+
+See how they run
+
+
+
+Lady Madonna, children at your feet
+
+Wonder how you manage to make ends meet
+