diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index b805ffeef..ef5ca3ed0 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -24,6 +24,8 @@ import unicodedata import difflib import itertools +from bs4 import BeautifulSoup, Comment + from beets.plugins import BeetsPlugin from beets import ui from beets import config @@ -340,25 +342,31 @@ def is_lyrics(text, artist=None): return len(badTriggersOcc) < 2 -def scrape_lyrics_from_url(url): - """Scrape lyrics from a URL. If no lyrics can be found, return None - instead. +def _scrape_normalize_eol(html): + """Return html text where only authorized eol marker is \n """ - from bs4 import BeautifulSoup, Comment - html = fetch_url(url) - if not html: - return None - - soup = BeautifulSoup(html) - - for tag in soup.findAll('br'): - tag.replaceWith('\n') + html.replace('\r','\n') + # Replace
without introducing superfluous newline in the output + BREAK_RE = re.compile(r'\n?\s*\s*\n?', re.I) + html = BREAK_RE.sub('\n', html) + return html +def _scrape_filter_soup(soup): + """Remove sections from soup that cannot be parents of lyrics section + """ # Remove non relevant html parts [s.extract() for s in soup(['head', 'script'])] comments = soup.findAll(text=lambda text: isinstance(text, Comment)) [s.extract() for s in comments] + # Remove ads now as they can interrupt the lyrics block + ads = soup.find_all('div', class_=re.compile('ad')) + [s.extract() for s in ads] + return soup + +def _scrape_streamline_soup(soup): + """Transform soup into a succession of

blocks + """ try: for tag in soup.findAll(True): tag.name = 'p' # keep tag contents @@ -379,20 +387,48 @@ def scrape_lyrics_from_url(url): pTag = soup.new_tag("p") bodyTag.parent.insert(0, pTag) pTag.insert(0, bodyTag) + return soup +def _scrape_longest_paragraph(soup): + """Return longest paragraph from soup + """ tagTokens = [] - + for tag in soup.findAll('p'): soup2 = BeautifulSoup(str(tag)) # Extract all text of

section. tagTokens += soup2.findAll(text=True) if tagTokens: - # Lyrics are expected to be the longest paragraph tagTokens = sorted(tagTokens, key=len, reverse=True) soup = BeautifulSoup(tagTokens[0]) return unescape(tagTokens[0].strip("\n\r: ")) +def _scrape_custom_process_soup(soup): + """Apply custom operations on soup to handle cases for specific websites + """ + # metrolyrics.com: lyrics text is splitted into multiple

+ for match in soup.find_all('p', class_='verse'): + match.insert_before('\n') + match.unwrap() + return soup + +def scrape_lyrics_from_html(html): + """Scrape lyrics from a URL. If no lyrics can be found, return None + instead. + """ + if not html: + return None + + html = _scrape_normalize_eol(html) + soup = BeautifulSoup(html) + soup = _scrape_filter_soup(soup) + soup = _scrape_streamline_soup(soup) + soup = _scrape_custom_process_soup(soup) + # print(soup) + soup = _scrape_longest_paragraph(soup) + + return soup def fetch_google(artist, title): """Fetch lyrics from Google search results. @@ -416,7 +452,9 @@ def fetch_google(artist, title): urlTitle = item['title'] if not is_page_candidate(urlLink, urlTitle, title, artist): continue - lyrics = scrape_lyrics_from_url(urlLink) + + html = fetch_url(urlLink) + lyrics = scrape_lyrics_from_html(html) if not lyrics: continue diff --git a/test/lyrics_sources.py b/test/lyrics_sources.py index c9e89df3d..c3b185a27 100644 --- a/test/lyrics_sources.py +++ b/test/lyrics_sources.py @@ -64,36 +64,16 @@ def is_lyrics_content_ok(title, text): return False -class LyricsPluginTest(unittest.TestCase): - def setUp(self): - """Set up configuration""" - lyrics.LyricsPlugin() - - def test_default_ok(self): - """Test each lyrics engine with the default query""" - - lyrics.fetch_url = MockFetchUrl() - - for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom): - res = f(definfo['artist'], definfo['title']) - self.assertTrue(lyrics.is_lyrics(res)) - self.assertTrue(is_lyrics_content_ok(definfo['title'], res)) - - def test_missing_lyrics(self): - self.assertFalse(lyrics.is_lyrics(LYRICS_TEXTS['missing_texts'])) - - -class LyricsScrapingPluginTest(unittest.TestCase): - +class LyricsSourcesPluginTest(unittest.TestCase): # Every source entered in default beets google custom search engine # must be listed below. # Use default query when possible, or override artist and title field # if website don't have lyrics for default query. sourcesOk = [ - dict(definfo, url='http://www.songlyrics.com', - path=u'/the-beatles/lady-madonna-lyrics'), - dict(definfo, url=u'http://www.elyricsworld.com', - path=u'/lady_madonna_lyrics_beatles.html'), + # dict(definfo, url='http://www.songlyrics.com', + # path=u'/the-beatles/lady-madonna-lyrics'), + # dict(definfo, url=u'http://www.elyricsworld.com', + # path=u'/lady_madonna_lyrics_beatles.html'), dict(artist=u'Beres Hammond', title=u'I could beat myself', url=u'http://www.reggaelyrics.info', path=u'/beres-hammond/i-could-beat-myself'), @@ -111,61 +91,65 @@ class LyricsScrapingPluginTest(unittest.TestCase): path=u'/lyrics/beatles/ladymadonna.html'), dict(definfo, url=u'http://www.chartlyrics.com', path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'), - ] - - # Websites that can't be scraped yet and whose results must be - # flagged as invalid lyrics. - sourcesFail = [ - dict(definfo, url=u'http://www.smartlyrics.com', - path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), - dict(definfo, url='http://www.metrolyrics.com/', - path='best-for-last-lyrics-adele.html'), dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", url=u'http://www.lyricsmania.com', path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'), dict(definfo, url=u'http://www.lyrics007.com', path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'), + dict(definfo, url=u'http://www.smartlyrics.com', + path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'), dict(definfo, url='http://www.releaselyrics.com', - path=u'/e35f/the-beatles-lady-madonna'), + path=u'/e35f/the-beatles-lady-madonna'), + dict(definfo, url='http://www.metrolyrics.com/', + path='lady-madonna-lyrics-beatles.html'), ] - # Websites that return truncated lyrics because of scraping issues, and - # thus should not be included as sources to Google CSE. - # They are good candidates for later inclusion after improvement - # iterations of the scraping algorithm. - sourcesIncomplete = [ - dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", - url=u'http://www.lacoccinelle.net', - path=u'/paroles-officielles/550512.html'), + # Websites that can't be scraped yet. + # The reason why the scraping fail is indicated before each source dict. + sourcesFail = [ + + + # Lyrics consist in multiple small

sections instead of a long one + #dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok", + # url=u'http://www.lacoccinelle.net', + # path=u'/paroles-officielles/550512.html'), ] + def setUp(self): + """Set up configuration""" + lyrics.LyricsPlugin() + lyrics.fetch_url = MockFetchUrl() + + def test_default_ok(self): + """Test each lyrics engine with the default query""" + + for f in (lyrics.fetch_lyricswiki, lyrics.fetch_lyricscom): + res = f(definfo['artist'], definfo['title']) + self.assertTrue(lyrics.is_lyrics(res)) + self.assertTrue(is_lyrics_content_ok(definfo['title'], res)) + + def test_missing_lyrics(self): + self.assertFalse(lyrics.is_lyrics(LYRICS_TEXTS['missing_texts'])) + def test_sources_ok(self): for s in self.sourcesOk: url = s['url'] + s['path'] - res = lyrics.scrape_lyrics_from_url(url) + res = lyrics.scrape_lyrics_from_html(lyrics.fetch_url(url)) self.assertTrue(lyrics.is_lyrics(res), url) self.assertTrue(is_lyrics_content_ok(s['title'], res), url) def test_sources_fail(self): for s in self.sourcesFail: - url = s['url'] + s['path'] - res = lyrics.scrape_lyrics_from_url(url) - # very unlikely these sources pass if the scraping algo is not - # tweaked on purpose for these cases - self.assertFalse(lyrics.is_lyrics(res), "%s => %s" % (url, res)) - - def test_sources_incomplete(self): - for s in self.sourcesIncomplete: - url = s['url'] + s['path'] - res = lyrics.scrape_lyrics_from_url(url) - - self.assertTrue(lyrics.is_lyrics(res)) - # these sources may pass if the html source evolve or after - # a random improvement in the scraping algo: we want to - # be noticed if it's the case. - if is_lyrics_content_ok(s['title'], res): - log.debug(u'Source {0} actually return valid lyrics!' - .format(s['url'])) + url = s['url'] + s['path'] + html = lyrics.fetch_url(url) + res = lyrics.scrape_lyrics_from_html(html) + if lyrics.is_lyrics(res): + if is_lyrics_content_ok(s['title'], res): + log.info(u'{0} can be added to sources :\n{1}' + .format(s['url'], res)) + else: + log.info(u'{0} return invalid lyrics:\n{1}'. + format(s['url'], res)) def test_is_page_candidate(self): for s in self.sourcesOk: diff --git a/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt b/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt index 8dfb31053..841ad9891 100644 --- a/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt +++ b/test/rsrc/lyrics/lyrics007com/Lady20Madonna20Lyricshtml.txt @@ -1,155 +1,154 @@ - + + + + + + -The Beatles - Lady Madonna Lyrics - - - + + + - - - - - + +The Beatles Lady Madonna Lyrics | Lyrics007 + + + + - -

- -
- -
-

The Beatles - Lady Madonna Lyrics

-
-
-
Writer(s):LENNON, JOHN WINSTON / MCCARTNEY, PAUL JAMES
Artist: The Beatles Lyrics -
Popularity: 9957 users have visited this page.
Album: Track 22 on The Beatles Collection, Volume 5: Sgt. Pepper's Lonely Hearts Club Band
-
Rate: -Lady Madonna gets avg. rating - -7.3 - out of 10 - based on 3 ratings. Rate the song now!!!
-


+
+
+ + + +
+Home >> The Beatles Lyrics >> The Beatles - Lady Madonna Lyrics
+

The Beatles Lady Madonna Lyrics

+Artist: The Beatles Lyrics +
Popularity : 2898 users have visited this page. +
Album: Track 22 on The Beatles Collection, Volume 5: Sgt. Pepper's Lonely Hearts Club Band +
Recorded: 3 and 6 February 1968, EMI Studios, London +
Writer(s): Lennon–McCartney +
Genre: Rock and roll +
Producer(s): George Martin +
Length: 2:16 +
Certification: Platinum (RIAA)
Format: 7" single +
Label: Parlophone (UK), Capitol (US) +
Released: 15 March 1968 +


Lady Madonna, children at your feet +
+Wonder how you manage to make ends meet +
+Who find the money when you pay the rent +
+Did you think that money was heaven sent +
- - -Lady Madonna, children at your feet
-Wonder how you manage to make ends meet
-Who find the money when you pay the rent
-Did you think that money was heaven sent
-
-Friday night arrives without a suitcase
-Sunday morning creeping like a nun
-Monday's child has learned to tie his bootlegs
-See how they run
-
-Lady Madonna, baby at your breast
-Wonders how you manage to feed the rest
-Pa pa pa pa,
-See how they run
-
-Lady Madonna lying on the bed
-Listen to the music playing in your head
-
-Tuesday afternoon is never ending
-Wednesday morning papers didn't come
-Thursday night you stocking needed mending
-See how they run
-
-Lady Madonna, children at your feet
-Wonder how you manage to make ends meet
- - - -
-
- -

If you believe the lyrics are not correct you can Submit Corrections to us
Add a comment and share what The Beatles Lady Madonna Lyrics means to you with your friends:
- -Featured lyrics: -Lyrics © Sony/ATV Music Publishing LLC -
-
- +
+ +
+See how they run +
+ +
+Lady Madonna lying on the bed +
+Listen to the music playing in your head +
+ +
+Tuesday afternoon is never ending +
+Wednesday morning papers didn't come +
+Thursday night your stocking needed mending +
+See how they run +
+ +
+Lady Madonna, children at your feet +
+Wonder how you manage to make ends meet +

Thanks to Quidam for the correction +
The Beatles's Lady Madonna album cover
- -

+
+
+ + + comments powered by Disqus +
The hottest lyrics from The Beatles + +
+ +
-
-
Lyrics007 gets licensed to display lyrics and pay the lyrics writers through LyricFind. The most of song titles are calibrated according to wikipedia
- -
+
 ©COPYRIGHT 2014, LYRICS007.COM, ALL RIGHTS RESERVED.
+ +
-