lyrics: remove empty divs before scraping

Removing them may insert extra \n characters, which are stripped later in
_scrape_strip_cruft.
This commit is contained in:
Fabrice Laporte 2014-12-30 23:37:23 +01:00
parent fd94094c1b
commit d4d5c085fa
2 changed files with 11 additions and 12 deletions

View file

@ -90,7 +90,7 @@ def extract_text_between(html, start_marker, end_marker):
html, _ = html.split(end_marker, 1)
except ValueError:
return u''
return _scrape_strip_cruft(html, True)
return html
def extract_text_in(html, starttag):
@ -124,8 +124,7 @@ def extract_text_in(html, starttag):
else:
print('no closing tag found!')
return
lyrics = ''.join(parts)
return _scrape_strip_cruft(lyrics, True)
return u''.join(parts)
def search_pairs(item):
@ -221,7 +220,7 @@ def fetch_lyricswiki(artist, title):
if not html:
return
lyrics = extract_text_in(html, "<div class='lyricbox'>")
lyrics = extract_text_in(html, u"<div class='lyricbox'>")
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
return lyrics
@ -360,13 +359,14 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = COMMENT_RE.sub('', html)
html = TAG_RE.sub('', html)
# Strip lines
html = '\n'.join([x.strip() for x in html.strip().split('\n')])
html = re.sub(r'\n{3,}', r'\n\n', html)
return html
def _scrape_merge_paragraphs(html):
return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
return re.sub(r'<div .*>\s*</div>', '\n', html)
def scrape_lyrics_from_html(html):
@ -541,4 +541,4 @@ class LyricsPlugin(plugins.BeetsPlugin):
if lyrics:
log.debug(u'got lyrics from backend: {0}'
.format(backend.__name__))
return lyrics.strip()
return _scrape_strip_cruft(lyrics, True)

View file

@ -319,12 +319,11 @@ class LyricsGooglePluginTest(unittest.TestCase):
for (fun, s) in zip([lyrics.fetch_lyricswiki,
lyrics.fetch_lyricscom,
lyrics.fetch_musixmatch], DEFAULT_SOURCES):
if os.path.isfile(url_to_filename(
s['url'] + s['path'])):
url = s['url'] + s['path']
if os.path.isfile(url_to_filename(url)):
res = fun(s['artist'], s['title'])
self.assertTrue(lyrics.is_lyrics(res))
self.assertTrue(is_lyrics_content_ok(
s['title'], res))
self.assertTrue(lyrics.is_lyrics(res), url)
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
def test_is_page_candidate_exact_match(self):
"""Test matching html page title with song infos -- when song infos are