no html entities in _scrape_streamline_soup output

This commit is contained in:
Fabrice Laporte 2014-09-24 00:25:50 +02:00
parent da6bcda4af
commit 333591fd78
2 changed files with 29 additions and 29 deletions

View file

@ -347,20 +347,19 @@ def _scrape_normalize_eol(html):
html.replace('\r','\n')
# Replace <br> without introducing superfluous newline in the output
BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
html = BREAK_RE.sub('\n', html)
html = BREAK_RE.sub('\n', html)
return html
def _scrape_merge_paragraphs(html):
regex = re.compile(r'</p>\s*<p(\s*[^>]*)>')
html = regex.sub('\n', html)
html = regex.sub('\n', html)
return html
def _scrape_filter_soup(soup):
"""Remove sections from soup that cannot be parents of lyrics section
"""
# Remove non relevant html parts
[s.extract() for s in soup(['head', 'script'])]
[s.extract() for s in soup(['head', 'script', 'iframe', 'a'])]
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
[s.extract() for s in comments]
@ -385,9 +384,8 @@ def _scrape_streamline_soup(soup):
.format(e, exc_info=True))
# Make better soup from current soup! The previous unclosed <p> sections
# are now closed. Use str() rather than prettify() as it's more
# conservative concerning EOL
soup = BeautifulSoup(str(soup))
# are now closed.
soup = BeautifulSoup(soup.prettify(formatter=None))
# Insert the whole body in a <p> in case lyrics are nested in no markup but
# <body>
@ -419,12 +417,15 @@ def scrape_lyrics_from_html(html):
"""
if not html:
return None
html = _scrape_normalize_eol(html)
html = _scrape_merge_paragraphs(html)
soup = BeautifulSoup(html)
soup = _scrape_filter_soup(soup)
soup = _scrape_streamline_soup(soup)
soup = _scrape_longest_paragraph(soup)
return soup

View file

@ -46,7 +46,7 @@ class MockFetchUrl(object):
fn = "".join(x for x in url if (x.isalnum() or x == '/'))
fn = fn.split('/')
fn = os.path.join('rsrc', 'lyrics', fn[0], fn[-1]) + '.txt'
with open(fn, 'r') as f:
content = f.read()
return content
@ -71,42 +71,42 @@ class LyricsSourcesPluginTest(unittest.TestCase):
# Use default query when possible, or override artist and title fields
# if the website doesn't have lyrics for the default query.
sourcesOk = [
dict(definfo,
dict(definfo,
url=u'http://www.absolutelyrics.com',
path=u'/lyrics/view/the_beatles/lady_madonna'),
dict(definfo,
dict(definfo,
url=u'http://www.azlyrics.com',
path=u'/lyrics/beatles/ladymadonna.html'),
dict(definfo,
dict(definfo,
url=u'http://www.chartlyrics.com',
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
dict(definfo,
dict(definfo,
url=u'http://www.elyricsworld.com',
path=u'/lady_madonna_lyrics_beatles.html'),
dict(definfo,
dict(definfo,
url=u'http://www.lacoccinelle.net',
artist=u'Jacques Brel', title=u"Amsterdam",
path=u'/paroles-officielles/275679.html'),
dict(definfo,
dict(definfo,
url=u'http://www.lyrics007.com',
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
dict(definfo,
dict(definfo,
url='http://www.lyrics.com/',
path=u'lady-madonna-lyrics-the-beatles.html'),
dict(definfo,
dict(definfo,
url='http://www.lyricsmania.com/',
path='lady_madonna_lyrics_the_beatles.html'),
dict(definfo,
url=u'http://www.lyrics.net',
dict(definfo,
url=u'http://www.lyrics.net',
path=u'/lyric/17547916'),
dict(definfo,
dict(definfo,
url=u'http://www.lyricsontop.com',
artist=u'Amy Winehouse', title=u"Jazz'n'blues",
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
dict(definfo,
dict(definfo,
url=u'http://lyrics.wikia.com/',
path=u'The_Beatles:Lady_Madonna'),
dict(definfo,
dict(definfo,
url='http://www.metrolyrics.com/',
path='lady-madonna-lyrics-beatles.html'),
dict(definfo,
@ -117,25 +117,24 @@ class LyricsSourcesPluginTest(unittest.TestCase):
url=u'http://www.reggaelyrics.info',
artist=u'Beres Hammond', title=u'I could beat myself',
path=u'/beres-hammond/i-could-beat-myself'),
dict(definfo,
dict(definfo,
url='http://www.releaselyrics.com',
path=u'/e35f/the-beatles-lady-madonna'),
dict(definfo,
dict(definfo,
url=u'http://www.smartlyrics.com',
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
dict(definfo,
dict(definfo,
url='http://www.songlyrics.com',
path=u'/the-beatles/lady-madonna-lyrics'),
dict(definfo,
url=u'http://www.stlyrics.com',
dict(definfo,
url=u'http://www.stlyrics.com',
path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
dict(definfo,
dict(definfo,
url=u'http://www.sweetslyrics.com',
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
]
def setUp(self):
"""Set up configuration"""
lyrics.LyricsPlugin()