diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 00954e2c3..47e299f82 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -23,6 +23,7 @@ import json import unicodedata import difflib import itertools +from HTMLParser import HTMLParseError from beets.plugins import BeetsPlugin from beets import ui @@ -271,8 +272,9 @@ def is_page_candidate(urlLink, urlTitle, title, artist): tokens = [by + '_' + artist for by in BY_TRANS] + \ [artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle) + songTitle = songTitle.strip('_|') + typoRatio = .9 - typoRatio = .8 return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio @@ -364,8 +366,12 @@ def scrape_lyrics_from_html(html): html = _scrape_merge_paragraphs(html) # extract all long text blocks that are not code - soup = BeautifulSoup(html, "html.parser", - parse_only=SoupStrainer(text=is_text_notcode)) + try: + soup = BeautifulSoup(html, "html.parser", + parse_only=SoupStrainer(text=is_text_notcode)) + except HTMLParseError: + return None + soup = sorted(soup.stripped_strings, key=len)[-1] return soup diff --git a/setup.py b/setup.py index 180374d0c..df035341d 100755 --- a/setup.py +++ b/setup.py @@ -85,12 +85,13 @@ setup( + (['ordereddict'] if sys.version_info < (2, 7, 0) else []), tests_require=[ - 'responses', - 'pyechonest', - 'mock', + 'beautifulsoup4', 'flask', - 'rarfile', + 'mock', + 'pyechonest', 'pylast', + 'rarfile', + 'responses', ], # Plugin (optional) dependencies: diff --git a/test/rsrc/lyrics/onelyricsnet/benellenharpercityofdreamslyrics.txt b/test/rsrc/lyrics/onelyricsnet/benellenharpercityofdreamslyrics.txt new file mode 100644 index 000000000..373e1f29d --- /dev/null +++ b/test/rsrc/lyrics/onelyricsnet/benellenharpercityofdreamslyrics.txt @@ -0,0 +1,341 @@ + + + + + + + Ben & Ellen Harper City Of Dreams Lyrics - Onelyrics.net + + + +
+
+ +
+ +
+ + +
+
+ +
+ + + + +
+
+

+ + Onelyricsnet-NewSongLyrics +

+ + +
+ + + +
+ + + + +
+
+ +
+ + +

+ Ben & Ellen Harper City Of Dreams Lyrics

+
+
+ + Day breaks over the city of my childhood +
Daybreak over the city I called home +
Where the sage met the sea and the groves were sweet and green +
It's a city that lives only in my dreams +
+
The groves where we played when we were children +
The groves where we fooled around as teens +
Those green groves are paved from la to santa fe +
That city lives only in my dreams +
+
Landmarks lost to parking lots in the city I called home +
Looking back I see what used to be +
Now freeways crawl though the suburban sprawl +
As far as the eye can see +
And the city lives only in my dreams +
+
Twilight shades the valley of my memory +
When citrus groves still perfumed the sky +
But I guess those orange blossoms weren't so special after all +
Now it's a city of days gone by +
+
Landmarks lost to parking lots in the city I called home +
Looking back I see what used to be +
Now freeways crawl through the suburban sprawl +
As far as the eye can see +
And the city lives only in my dreams + + + +
+
+ +
+ May 12, 2014 +
+
+ 126 hits +
+ +
+ +
+ +
+ + + + +
+ +
+ + + +
+ +
+ + + +
+
+
+
+
+
+ +
+
+
+ +
+
+ + +
+ + diff --git a/test/rsrc/lyricstext.yaml b/test/rsrc/lyricstext.yaml index 66435fcc1..ea005dd86 100644 --- a/test/rsrc/lyricstext.yaml +++ b/test/rsrc/lyricstext.yaml @@ -215,6 +215,34 @@ Hey_it_s_ok: | Hey It's OK, I'ts Ok Cause I've found what i wanted +City_of_dreams: | + Day breaks over the city of my childhood + Daybreak over the city I called home + Where the sage met the sea and the groves were sweet and green + It's a city that lives only in my dreams + + The groves where we played when we were children + The groves where we fooled around as teens + Those green groves are paved from la to santa fe + That city lives only in my dreams + + Landmarks lost to parking lots in the city I called home + Looking back I see what used to be + Now freeways crawl though the suburban sprawl + As far as the eye can see + And the city lives only in my dreams + + Twilight shades the valley of my memory + When citrus groves still perfumed the sky + But I guess those orange blossoms weren't so special after all + Now it's a city of days gone by + + Landmarks lost to parking lots in the city I called home + Looking back I see what used to be + Now freeways crawl through the suburban sprawl + As far as the eye can see + And the city lives only in my dreams + missing_texts: | Lyricsmania staff is working hard for you to add $TITLE lyrics as soon as they'll be released by $ARTIST, check back soon! diff --git a/test/test_lyrics.py b/test/test_lyrics.py index c911e1dcb..6b2929565 100644 --- a/test/test_lyrics.py +++ b/test/test_lyrics.py @@ -17,6 +17,7 @@ import os import _common +import sys from _common import unittest from beetsplug import lyrics from beets.library import Item @@ -163,8 +164,7 @@ class MockFetchUrl(object): url = url.replace('http://', '').replace('www.', '') fn = "".join(x for x in url if (x.isalnum() or x == '/')) fn = fn.split('/') - fn = os.path.join('rsrc', 'lyrics', fn[0], fn[-1]) + '.txt' - + fn = os.path.join(_common.RSRC, 'lyrics', fn[0], fn[-1]) + '.txt' with open(fn, 'r') as f: content = f.read() return content @@ -186,7 +186,7 @@ def is_lyrics_content_ok(title, text): class LyricsGooglePluginTest(unittest.TestCase): # Every source entered in default beets google custom search engine # must be listed below. - # Use default query when possible, or override artist and title field + # Use default query when possible, or override artist and title fields # if website don't have lyrics for default query. sourcesOk = [ dict(definfo, @@ -227,6 +227,10 @@ class LyricsGooglePluginTest(unittest.TestCase): dict(definfo, url='http://www.metrolyrics.com/', path='lady-madonna-lyrics-beatles.html'), + dict(definfo, + url=u'http://www.onelyrics.net/', + artist=u'Ben & Ellen Harper', title=u'City of dreams', + path='ben-ellen-harper-city-of-dreams-lyrics'), dict(definfo, url=u'http://www.paroles.net/', artist=u'Lilly Wood & the prick', title=u"Hey it's ok", @@ -258,7 +262,8 @@ class LyricsGooglePluginTest(unittest.TestCase): __import__('bs4') except ImportError: self.skipTest('Beautiful Soup 4 not available') - + if sys.version_info[:3] < (2, 7, 3): + self.skipTest("Python’s built-in HTML parser is not good enough") lyrics.LyricsPlugin() lyrics.fetch_url = MockFetchUrl() @@ -280,7 +285,7 @@ class LyricsGooglePluginTest(unittest.TestCase): self.assertTrue(lyrics.is_lyrics(res), url) self.assertTrue(is_lyrics_content_ok(s['title'], res), url) - def test_is_page_candidate(self): + def test_is_page_candidate_exact_match(self): from bs4 import SoupStrainer, BeautifulSoup for s in self.sourcesOk: @@ -292,6 +297,23 @@ class LyricsGooglePluginTest(unittest.TestCase): s['title'], s['artist']), True, url) + def test_is_page_candidate_fuzzy_match(self): + url = u'http://www.example.com/lazy_madonna_beatles' + urlTitle = u'example.com | lazy madonna lyrics by the beatles' + title = u'Lady Madonna' + artist = u'The Beatles' + # very small diffs (typo) are ok + self.assertEqual(lyrics.is_page_candidate(url, urlTitle, title, + artist), True, url) + # reject different title + urlTitle = u'example.com | busy madonna lyrics by the beatles' + self.assertEqual(lyrics.is_page_candidate(url, urlTitle, title, + artist), False, url) + # (title, artist) != (artist, title) + urlTitle = u'example.com | the beatles lyrics by Lazy Madonna' + self.assertEqual(lyrics.is_page_candidate(url, urlTitle, title, + artist), False, url) + def suite(): return unittest.TestLoader().loadTestsFromName(__name__) diff --git a/tox.ini b/tox.ini index 186dfb36f..bc1d3dfd9 100644 --- a/tox.ini +++ b/tox.ini @@ -8,13 +8,14 @@ envlist = py26, py27, pypy, docs, flake8 [testenv] deps = - nose - mock - pylast + beautifulsoup4 flask - responses + mock + nose pyechonest + pylast rarfile + responses commands = nosetests {posargs}