Merge pull request #992 from KraYmer/lyrics-coverage

Improve lyrics coverage
This commit is contained in:
Fabrice L. 2014-10-09 22:11:28 +02:00
commit e6bf8c21d8
6 changed files with 415 additions and 16 deletions

View file

@ -23,6 +23,7 @@ import json
import unicodedata
import difflib
import itertools
from HTMLParser import HTMLParseError
from beets.plugins import BeetsPlugin
from beets import ui
@ -271,8 +272,9 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
tokens = [by + '_' + artist for by in BY_TRANS] + \
[artist, sitename, sitename.replace('www.', '')] + LYRICS_TRANS
songTitle = re.sub(u'(%s)' % u'|'.join(tokens), u'', urlTitle)
songTitle = songTitle.strip('_|')
typoRatio = .9
typoRatio = .8
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
@ -364,8 +366,12 @@ def scrape_lyrics_from_html(html):
html = _scrape_merge_paragraphs(html)
# extract all long text blocks that are not code
soup = BeautifulSoup(html, "html.parser",
parse_only=SoupStrainer(text=is_text_notcode))
try:
soup = BeautifulSoup(html, "html.parser",
parse_only=SoupStrainer(text=is_text_notcode))
except HTMLParseError:
return None
soup = sorted(soup.stripped_strings, key=len)[-1]
return soup

View file

@ -85,12 +85,13 @@ setup(
+ (['ordereddict'] if sys.version_info < (2, 7, 0) else []),
tests_require=[
'responses',
'pyechonest',
'mock',
'beautifulsoup4',
'flask',
'rarfile',
'mock',
'pyechonest',
'pylast',
'rarfile',
'responses',
],
# Plugin (optional) dependencies:

View file

@ -0,0 +1,341 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head id="ctl00_Head1"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" /><title>
Ben & Ellen Harper City Of Dreams Lyrics - Onelyrics.net
</title><meta name="keywords" content="Ben &amp;amp; Ellen Harper City Of Dreams, Ben &amp;amp; Ellen Harper City Of Dreams Lyrics, Ben &amp;amp; Ellen Harper City Of Dreams Song Lyrics, Ben &amp;amp; Ellen Harper City Of Dreams Song Text, Ben &amp;amp; Ellen Harper City Of Dreams Şarkı Sözü" /><meta name="description" content="Ben &amp; Ellen Harper City Of Dreams Lyrics. Day breaks over the city of my childhood Daybreak over the city I called home Where the sage met the..." /><meta name="robots" content="index,follow" /><meta name="robots" content="NOODP" /><link rel="icon" href="/favicon.ico" type="image/x-icon" /><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><link href="/App_Themes/default/css/Interface.css" rel="stylesheet" type="text/css" /><link rel="search" type="application/opensearchdescription+xml" title="Onelyrics.net - Search Lyrics" href="/xml/search.xml" /><link rel="alternate" type="application/rss+xml" title="Onelyrics.net - New Song Lyrics" href="/xml/new_lyrics.xml" />
<link href="http://www.onelyrics.net/ben-ellen-harper-city-of-dreams-lyrics" rel="canonical" /></head>
<body>
<form name="aspnetForm" method="post" action="/ben-ellen-harper-city-of-dreams-lyrics" id="aspnetForm">
<div>
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUKMTExNDA4MzEzOGRkBfqlfuuQKa7LybXM/GprsFkj1mSg9qg8V6w++Om/EM0=" />
</div>
<div>
<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="67C2BABB" />
</div>
<div class="center">
<div id="header">
<div class="logo">
<a href="/" title="lyrics">ONE<strong>LYRICS</strong>
</a>
</div>
<div class="header_alt">
<ul class="ortamenu">
<li class='ilk'><a href="/"
title="lyrics">
<img src="/App_Themes/default/img/home.gif" alt="lyrics" />
</a></li>
<li><a href="/new-song-lyrics"
title="new song lyrics">New Song Lyrics</a></li>
<li class="son"><a href="/populer-song-lyrics"
title="populer song lyrics">Populer Song Lyrics</a></li>
</ul>
<div class="arama">
<script>
(function () {
var cx = '004449022147236721955:rwdr_-ykwwg';
var gcse = document.createElement('script');
gcse.type = 'text/javascript';
gcse.async = true;
gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
'//www.google.com/cse/cse.js?cx=' + cx;
var s = document.getElementsByTagName('script')[0];
s.parentNode.insertBefore(gcse, s);
})();
</script>
<gcse:search></gcse:search>
</div>
<div class="clr"></div>
</div>
<div class="alfabe">
<a href="/a-song-lyrics.html" title="A" class="ilk">A</a>
<a href="/b-song-lyrics.html" title="B">B</a>
<a href="/c-song-lyrics.html" title="C">C</a>
<a href="/d-song-lyrics.html" title="D">D</a>
<a href="/e-song-lyrics.html" title="E">E</a>
<a href="/f-song-lyrics.html" title="F">F</a>
<a href="/g-song-lyrics.html" title="G">G</a>
<a href="/h-song-lyrics.html" title="H">H</a>
<a href="/i-song-lyrics.html" title="I">I</a>
<a href="/j-song-lyrics.html" title="J">J</a>
<a href="/k-song-lyrics.html" title="K">K</a>
<a href="/l-song-lyrics.html" title="L">L</a>
<a href="/m-song-lyrics.html" title="M">M</a>
<a href="/n-song-lyrics.html" title="N">N</a>
<a href="/o-song-lyrics.html" title="O">O</a>
<a href="/p-song-lyrics.html" title="P">P</a>
<a href="/q-song-lyrics.html" title="Q">Q</a>
<a href="/r-song-lyrics.html" title="R">R</a>
<a href="/s-song-lyrics.html" title="S">S</a>
<a href="/t-song-lyrics.html" title="T">T</a>
<a href="/u-song-lyrics.html" title="U">U</a>
<a href="/v-song-lyrics.html" title="V">V</a>
<a href="/w-song-lyrics.html" title="W">W</a>
<a href="/x-song-lyrics.html" title="X">X</a>
<a href="/y-song-lyrics.html" title="Y">Y</a>
<a href="/z-song-lyrics.html" title="Z">Z</a>
</div>
</div>
<div class="txtcenter">
<script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<!-- 728x90 Resim -->
<ins class="adsbygoogle"
style="display: inline-block; width: 728px; height: 90px"
data-ad-client="ca-pub-3379066124362506"
data-ad-slot="6181228355"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
</div>
<div id="main">
<p>
<a href="http://feeds.feedburner.com/Onelyricsnet-NewSongLyrics" target="_blank">
<img src="http://feeds.feedburner.com/~fc/Onelyricsnet-NewSongLyrics?bg=ea5f24&amp;fg=ffffff&amp;anim=0" height="26" width="88" style="border: 0" alt="Onelyricsnet-NewSongLyrics" /></a>
</p>
<div id="sag">
<div class="reklam">
<script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<!-- 336x280ResimMetin -->
<ins class="adsbygoogle"
style="display: inline-block; width: 336px; height: 280px"
data-ad-client="ca-pub-3379066124362506"
data-ad-slot="3608404357"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
</div>
<div class="anabaslik">
Latest Updates Lyrics
</div>
<ul class="populer_lyrics">
<li>
<a href="jason-aldean-dont-change-gone-lyrics" title="Jason Aldean Don't Change Gone Lyrics">
Jason Aldean Don't Change Gone Lyrics
</a>
</li>
<li>
<a href="rapsody-godzilla-lyrics" title="Rapsody Godzilla Lyrics">
Rapsody Godzilla Lyrics
</a>
</li>
<li>
<a href="jason-aldean-i-took-it-with-me-lyrics" title="Jason Aldean I Took It With Me Lyrics">
Jason Aldean I Took It With Me Lyrics
</a>
</li>
<li>
<a href="david-archuleta-glorious-lyrics" title="David Archuleta Glorious Lyrics">
David Archuleta Glorious Lyrics
</a>
</li>
<li>
<a href="jason-aldean-if-my-truck-could-talk-lyrics" title="Jason Aldean If My Truck Could Talk Lyrics">
Jason Aldean If My Truck Could Talk Lyrics
</a>
</li>
<li>
<a href="tydi-perfect-crush-lyrics" title="TyDi Perfect Crush Lyrics">
TyDi Perfect Crush Lyrics
</a>
</li>
<li>
<a href="jason-aldean-laid-back-lyrics" title="Jason Aldean Laid Back Lyrics">
Jason Aldean Laid Back Lyrics
</a>
</li>
<li>
<a href="jason-aldean-miss-that-girl-lyrics" title="Jason Aldean Miss That Girl Lyrics">
Jason Aldean Miss That Girl Lyrics
</a>
</li>
<li>
<a href="childish-gambino-go-dj-lyrics" title="Childish Gambino Go DJ Lyrics">
Childish Gambino Go DJ Lyrics
</a>
</li>
<li>
<a href="vybz-kartel-well-make-it-lyrics" title="Vybz Kartel We'll Make It Lyrics">
Vybz Kartel We'll Make It Lyrics
</a>
</li>
<li class="hepsi">
<a href="/new-song-lyrics" title="all wiev updates song lyrics">all wiev updates song lyrics>
</a>
</li>
</ul>
</div>
<div id="sol">
<div class="solreklam">
<script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>
<!-- 300x600ResimMetin -->
<ins class="adsbygoogle"
style="display: inline-block; width: 300px; height: 600px"
data-ad-client="ca-pub-3379066124362506"
data-ad-slot="6547325555"></ins>
<script>
(adsbygoogle = window.adsbygoogle || []).push({});
</script>
</div>
<div class="sag_sarki">
<div class="baslik">
<div class="breadcrumb" itemprop="breadcrumb" itemscope itemtype="http://schema.org/WebPage">
<a href="/" title="Lyrics">Lyrics</a> > <a href="ben-ellen-harper-city-of-dreams-lyrics" title="Ben & Ellen Harper City Of Dreams Lyrics"
class="breadcrumb_aktif">
Ben & Ellen Harper City Of Dreams Lyrics</a>
</div>
<h1 title="Ben & Ellen Harper City Of Dreams Lyrics">
Ben & Ellen Harper City Of Dreams Lyrics</h1>
</div>
<div class="icerik">
Day breaks over the city of my childhood
<br>Daybreak over the city I called home
<br>Where the sage met the sea and the groves were sweet and green
<br>It's a city that lives only in my dreams
<br>
<br>The groves where we played when we were children
<br>The groves where we fooled around as teens
<br>Those green groves are paved from la to santa fe
<br>That city lives only in my dreams
<br>
<br>Landmarks lost to parking lots in the city I called home
<br>Looking back I see what used to be
<br>Now freeways crawl though the suburban sprawl
<br>As far as the eye can see
<br>And the city lives only in my dreams
<br>
<br>Twilight shades the valley of my memory
<br>When citrus groves still perfumed the sky
<br>But I guess those orange blossoms weren't so special after all
<br>Now it's a city of days gone by
<br>
<br>Landmarks lost to parking lots in the city I called home
<br>Looking back I see what used to be
<br>Now freeways crawl through the suburban sprawl
<br>As far as the eye can see
<br>And the city lives only in my dreams
<div class='tags'><b>Tags</b><br/><a href='/lyrics.aspx?q=Ben+%26amp%3b+Ellen+Harper+City+Of+Dreams' title='Ben &amp; Ellen Harper City Of Dreams'>Ben &amp; Ellen Harper City Of Dreams</a> <a href='/lyrics.aspx?q=Ben+%26amp%3b+Ellen+Harper+City+Of+Dreams+Lyrics' title='Ben &amp; Ellen Harper City Of Dreams Lyrics'>Ben &amp; Ellen Harper City Of Dreams Lyrics</a> <a href='/lyrics.aspx?q=Ben+%26amp%3b+Ellen+Harper+City+Of+Dreams+Song+Lyrics' title='Ben &amp; Ellen Harper City Of Dreams Song Lyrics'>Ben &amp; Ellen Harper City Of Dreams Song Lyrics</a> <a href='/lyrics.aspx?q=Ben+%26amp%3b+Ellen+Harper+City+Of+Dreams+Song+Text' title='Ben &amp; Ellen Harper City Of Dreams Song Text'>Ben &amp; Ellen Harper City Of Dreams Song Text</a> <a href='/lyrics.aspx?q=Ben+%26amp%3b+Ellen+Harper+City+Of+Dreams+%c5%9eark%c4%b1+S%c3%b6z%c3%bc' title='Ben &amp; Ellen Harper City Of Dreams Şarkı Sözü'>Ben &amp; Ellen Harper City Of Dreams Şarkı Sözü</a> </div>
<div class="clr"></div>
<div class="sarki_alt">
<div class="icerik_tarih">
May 12, 2014
</div>
<div class="icerik_hit">
126 hits
</div>
<div class="clr"></div>
<div class="sosyalaglar">
<div class="sosyalaglar_li">
<!-- Google +1 Butonu -->
<script type="text/javascript" src="https://apis.google.com/js/plusone.js" async="true"> { lang: 'en' }</script>
<g:plusone size="tall"></g:plusone>
<!-- Google +1 Butonu End-->
</div>
<div class="sosyalaglar_li">
<!--Twitter-->
<a href="http://twitter.com/share" class="twitter-share-button" data-count="vertical">Tweet</a><script type="text/javascript" src="http://platform.twitter.com/widgets.js"></script>
<!--Twitter End-->
</div>
<div class="sosyalaglar_li">
<!--Facebook-->
<iframe src="http://www.facebook.com/plugins/like.php?href=http://www.onelyrics.net/ben-ellen-harper-city-of-dreams-lyrics&amp;layout=box_count&amp;show_faces=true&amp;width=450&amp;action=like&amp;colorscheme=light&amp;height=65"
scrolling="no" frameborder="0" style="border: none; overflow: hidden; width: 62px; height: 65px;"
allowtransparency="true"></iframe>
<!--Facebook End-->
</div>
</div>
<div class="clr"></div>
</div>
</div>
</div>
</div>
<div class="clr">
</div>
</div>
</div>
<div id="footer">
<div class="center footer">
<div class="copyright">
Copyright © 2014 Onelyrics.net / All rights reserved.
</div>
<div class="clr"></div>
<ul class="altmenu2">
<li>
<a href="/" title="lyrics"><strong>lyrics</strong></a>
</li>
<li>
<a href="/new-song-lyrics" title="song lyrics"><em>song lyrics</em></a>
</li>
<li>
<a href="/" title="şarkı sözleri">şarkı sözleri</a>
</li>
<li class="son">
<a href="https://plus.google.com/+OnelyricsNet-SongLyrics?rel=author" title="Google" target="_blank">Google</a>
</li>
</ul>
<div class="clr">
</div>
</div>
</div>
<script type="text/javascript">
(function (i, s, o, g, r, a, m) {
i['GoogleAnalyticsObject'] = r; i[r] = i[r] || function () {
(i[r].q = i[r].q || []).push(arguments)
}, i[r].l = 1 * new Date(); a = s.createElement(o),
m = s.getElementsByTagName(o)[0]; a.async = 1; a.src = g; m.parentNode.insertBefore(a, m)
})(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');
ga('create', 'UA-29965928-11', 'onelyrics.net');
ga('send', 'pageview');
</script>
</form>
</body>
</html>

View file

@ -215,6 +215,34 @@ Hey_it_s_ok: |
Hey It's OK, I'ts Ok
Cause I've found what i wanted
City_of_dreams: |
Day breaks over the city of my childhood
Daybreak over the city I called home
Where the sage met the sea and the groves were sweet and green
It's a city that lives only in my dreams
The groves where we played when we were children
The groves where we fooled around as teens
Those green groves are paved from la to santa fe
That city lives only in my dreams
Landmarks lost to parking lots in the city I called home
Looking back I see what used to be
Now freeways crawl though the suburban sprawl
As far as the eye can see
And the city lives only in my dreams
Twilight shades the valley of my memory
When citrus groves still perfumed the sky
But I guess those orange blossoms weren't so special after all
Now it's a city of days gone by
Landmarks lost to parking lots in the city I called home
Looking back I see what used to be
Now freeways crawl through the suburban sprawl
As far as the eye can see
And the city lives only in my dreams
missing_texts: |
Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
as they'll be released by $ARTIST, check back soon!

View file

@ -17,6 +17,7 @@
import os
import _common
import sys
from _common import unittest
from beetsplug import lyrics
from beets.library import Item
@ -163,8 +164,7 @@ class MockFetchUrl(object):
url = url.replace('http://', '').replace('www.', '')
fn = "".join(x for x in url if (x.isalnum() or x == '/'))
fn = fn.split('/')
fn = os.path.join('rsrc', 'lyrics', fn[0], fn[-1]) + '.txt'
fn = os.path.join(_common.RSRC, 'lyrics', fn[0], fn[-1]) + '.txt'
with open(fn, 'r') as f:
content = f.read()
return content
@ -186,7 +186,7 @@ def is_lyrics_content_ok(title, text):
class LyricsGooglePluginTest(unittest.TestCase):
# Every source entered in default beets google custom search engine
# must be listed below.
# Use default query when possible, or override artist and title field
# Use default query when possible, or override artist and title fields
# if a website doesn't have lyrics for the default query.
sourcesOk = [
dict(definfo,
@ -227,6 +227,10 @@ class LyricsGooglePluginTest(unittest.TestCase):
dict(definfo,
url='http://www.metrolyrics.com/',
path='lady-madonna-lyrics-beatles.html'),
dict(definfo,
url=u'http://www.onelyrics.net/',
artist=u'Ben & Ellen Harper', title=u'City of dreams',
path='ben-ellen-harper-city-of-dreams-lyrics'),
dict(definfo,
url=u'http://www.paroles.net/',
artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
@ -258,7 +262,8 @@ class LyricsGooglePluginTest(unittest.TestCase):
__import__('bs4')
except ImportError:
self.skipTest('Beautiful Soup 4 not available')
if sys.version_info[:3] < (2, 7, 3):
self.skipTest("Pythons built-in HTML parser is not good enough")
lyrics.LyricsPlugin()
lyrics.fetch_url = MockFetchUrl()
@ -280,7 +285,7 @@ class LyricsGooglePluginTest(unittest.TestCase):
self.assertTrue(lyrics.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
def test_is_page_candidate(self):
def test_is_page_candidate_exact_match(self):
from bs4 import SoupStrainer, BeautifulSoup
for s in self.sourcesOk:
@ -292,6 +297,23 @@ class LyricsGooglePluginTest(unittest.TestCase):
s['title'], s['artist']),
True, url)
def test_is_page_candidate_fuzzy_match(self):
    """Exercise is_page_candidate with page titles that only nearly match.

    The same URL, song title and artist are checked against three page
    titles: one with a small typo (expected True), one with a different
    song title (expected False), and one where title and artist are
    swapped (expected False).
    """
    url = u'http://www.example.com/lazy_madonna_beatles'
    title = u'Lady Madonna'
    artist = u'The Beatles'
    cases = (
        # very small diffs (typo) are ok
        (u'example.com | lazy madonna lyrics by the beatles', True),
        # reject different title
        (u'example.com | busy madonna lyrics by the beatles', False),
        # (title, artist) != (artist, title)
        (u'example.com | the beatles lyrics by Lazy Madonna', False),
    )
    for urlTitle, expected in cases:
        verdict = lyrics.is_page_candidate(url, urlTitle, title, artist)
        self.assertEqual(verdict, expected, url)
def suite():
    """Return a test suite loaded by name from this module."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromName(__name__)

View file

@ -8,13 +8,14 @@ envlist = py26, py27, pypy, docs, flake8
[testenv]
deps =
nose
mock
pylast
beautifulsoup4
flask
responses
mock
nose
pyechonest
pylast
rarfile
responses
commands =
nosetests {posargs}