mirror of
https://github.com/beetbox/beets.git
synced 2025-12-29 03:52:51 +01:00
lyrics: remove empty divs before scraping
Removing them may insert extra \n characters, which are then stripped in _scrape_strip_cruft
This commit is contained in:
parent
fd94094c1b
commit
d4d5c085fa
2 changed files with 11 additions and 12 deletions
|
|
@ -90,7 +90,7 @@ def extract_text_between(html, start_marker, end_marker):
|
|||
html, _ = html.split(end_marker, 1)
|
||||
except ValueError:
|
||||
return u''
|
||||
return _scrape_strip_cruft(html, True)
|
||||
return html
|
||||
|
||||
|
||||
def extract_text_in(html, starttag):
|
||||
|
|
@ -124,8 +124,7 @@ def extract_text_in(html, starttag):
|
|||
else:
|
||||
print('no closing tag found!')
|
||||
return
|
||||
lyrics = ''.join(parts)
|
||||
return _scrape_strip_cruft(lyrics, True)
|
||||
return u''.join(parts)
|
||||
|
||||
|
||||
def search_pairs(item):
|
||||
|
|
@ -221,7 +220,7 @@ def fetch_lyricswiki(artist, title):
|
|||
if not html:
|
||||
return
|
||||
|
||||
lyrics = extract_text_in(html, "<div class='lyricbox'>")
|
||||
lyrics = extract_text_in(html, u"<div class='lyricbox'>")
|
||||
if lyrics and 'Unfortunately, we are not licensed' not in lyrics:
|
||||
return lyrics
|
||||
|
||||
|
|
@ -360,13 +359,14 @@ def _scrape_strip_cruft(html, plain_text_out=False):
|
|||
html = COMMENT_RE.sub('', html)
|
||||
html = TAG_RE.sub('', html)
|
||||
|
||||
# Strip lines
|
||||
html = '\n'.join([x.strip() for x in html.strip().split('\n')])
|
||||
html = re.sub(r'\n{3,}', r'\n\n', html)
|
||||
return html
|
||||
|
||||
|
||||
def _scrape_merge_paragraphs(html):
|
||||
return re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
|
||||
html = re.sub(r'</p>\s*<p(\s*[^>]*)>', '\n', html)
|
||||
return re.sub(r'<div .*>\s*</div>', '\n', html)
|
||||
|
||||
|
||||
def scrape_lyrics_from_html(html):
|
||||
|
|
@ -541,4 +541,4 @@ class LyricsPlugin(plugins.BeetsPlugin):
|
|||
if lyrics:
|
||||
log.debug(u'got lyrics from backend: {0}'
|
||||
.format(backend.__name__))
|
||||
return lyrics.strip()
|
||||
return _scrape_strip_cruft(lyrics, True)
|
||||
|
|
|
|||
|
|
@ -319,12 +319,11 @@ class LyricsGooglePluginTest(unittest.TestCase):
|
|||
for (fun, s) in zip([lyrics.fetch_lyricswiki,
|
||||
lyrics.fetch_lyricscom,
|
||||
lyrics.fetch_musixmatch], DEFAULT_SOURCES):
|
||||
if os.path.isfile(url_to_filename(
|
||||
s['url'] + s['path'])):
|
||||
url = s['url'] + s['path']
|
||||
if os.path.isfile(url_to_filename(url)):
|
||||
res = fun(s['artist'], s['title'])
|
||||
self.assertTrue(lyrics.is_lyrics(res))
|
||||
self.assertTrue(is_lyrics_content_ok(
|
||||
s['title'], res))
|
||||
self.assertTrue(lyrics.is_lyrics(res), url)
|
||||
self.assertTrue(is_lyrics_content_ok(s['title'], res), url)
|
||||
|
||||
def test_is_page_candidate_exact_match(self):
|
||||
"""Test matching html page title with song infos -- when song infos are
|
||||
|
|
|
|||
Loading…
Reference in a new issue