mirror of
https://github.com/beetbox/beets.git
synced 2026-01-30 03:54:21 +01:00
no html entities in _scrape_streamline_soup output
This commit is contained in:
parent
da6bcda4af
commit
333591fd78
2 changed files with 29 additions and 29 deletions
|
|
@ -347,20 +347,19 @@ def _scrape_normalize_eol(html):
|
|||
html.replace('\r','\n')
|
||||
# Replace <br> without introducing superfluous newline in the output
|
||||
BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I)
|
||||
html = BREAK_RE.sub('\n', html)
|
||||
html = BREAK_RE.sub('\n', html)
|
||||
return html
|
||||
|
||||
def _scrape_merge_paragraphs(html):
|
||||
regex = re.compile(r'</p>\s*<p(\s*[^>]*)>')
|
||||
html = regex.sub('\n', html)
|
||||
|
||||
html = regex.sub('\n', html)
|
||||
return html
|
||||
|
||||
def _scrape_filter_soup(soup):
|
||||
"""Remove sections from soup that cannot be parents of lyrics section
|
||||
"""
|
||||
# Remove non relevant html parts
|
||||
[s.extract() for s in soup(['head', 'script'])]
|
||||
[s.extract() for s in soup(['head', 'script', 'iframe', 'a'])]
|
||||
comments = soup.findAll(text=lambda text: isinstance(text, Comment))
|
||||
[s.extract() for s in comments]
|
||||
|
||||
|
|
@ -385,9 +384,8 @@ def _scrape_streamline_soup(soup):
|
|||
.format(e, exc_info=True))
|
||||
|
||||
# Make better soup from current soup! The previous unclosed <p> sections
|
||||
# are now closed. Use str() rather than prettify() as it's more
|
||||
# conservative concerning EOL
|
||||
soup = BeautifulSoup(str(soup))
|
||||
# are now closed.
|
||||
soup = BeautifulSoup(soup.prettify(formatter=None))
|
||||
|
||||
# Insert the whole body in a <p> in case lyrics are nested in no markup but
|
||||
# <body>
|
||||
|
|
@ -419,12 +417,15 @@ def scrape_lyrics_from_html(html):
|
|||
"""
|
||||
if not html:
|
||||
return None
|
||||
|
||||
|
||||
html = _scrape_normalize_eol(html)
|
||||
html = _scrape_merge_paragraphs(html)
|
||||
|
||||
soup = BeautifulSoup(html)
|
||||
|
||||
soup = _scrape_filter_soup(soup)
|
||||
soup = _scrape_streamline_soup(soup)
|
||||
|
||||
soup = _scrape_longest_paragraph(soup)
|
||||
|
||||
return soup
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ class MockFetchUrl(object):
|
|||
fn = "".join(x for x in url if (x.isalnum() or x == '/'))
|
||||
fn = fn.split('/')
|
||||
fn = os.path.join('rsrc', 'lyrics', fn[0], fn[-1]) + '.txt'
|
||||
|
||||
|
||||
with open(fn, 'r') as f:
|
||||
content = f.read()
|
||||
return content
|
||||
|
|
@ -71,42 +71,42 @@ class LyricsSourcesPluginTest(unittest.TestCase):
|
|||
# Use default query when possible, or override artist and title field
|
||||
# if website don't have lyrics for default query.
|
||||
sourcesOk = [
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.absolutelyrics.com',
|
||||
path=u'/lyrics/view/the_beatles/lady_madonna'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.azlyrics.com',
|
||||
path=u'/lyrics/beatles/ladymadonna.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.chartlyrics.com',
|
||||
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.elyricsworld.com',
|
||||
path=u'/lady_madonna_lyrics_beatles.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.lacoccinelle.net',
|
||||
artist=u'Jacques Brel', title=u"Amsterdam",
|
||||
path=u'/paroles-officielles/275679.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.lyrics007.com',
|
||||
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url='http://www.lyrics.com/',
|
||||
path=u'lady-madonna-lyrics-the-beatles.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url='http://www.lyricsmania.com/',
|
||||
path='lady_madonna_lyrics_the_beatles.html'),
|
||||
dict(definfo,
|
||||
url=u'http://www.lyrics.net',
|
||||
dict(definfo,
|
||||
url=u'http://www.lyrics.net',
|
||||
path=u'/lyric/17547916'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.lyricsontop.com',
|
||||
artist=u'Amy Winehouse', title=u"Jazz'n'blues",
|
||||
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://lyrics.wikia.com/',
|
||||
path=u'The_Beatles:Lady_Madonna'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url='http://www.metrolyrics.com/',
|
||||
path='lady-madonna-lyrics-beatles.html'),
|
||||
dict(definfo,
|
||||
|
|
@ -117,25 +117,24 @@ class LyricsSourcesPluginTest(unittest.TestCase):
|
|||
url=u'http://www.reggaelyrics.info',
|
||||
artist=u'Beres Hammond', title=u'I could beat myself',
|
||||
path=u'/beres-hammond/i-could-beat-myself'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url='http://www.releaselyrics.com',
|
||||
path=u'/e35f/the-beatles-lady-madonna'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.smartlyrics.com',
|
||||
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url='http://www.songlyrics.com',
|
||||
path=u'/the-beatles/lady-madonna-lyrics'),
|
||||
dict(definfo,
|
||||
url=u'http://www.stlyrics.com',
|
||||
dict(definfo,
|
||||
url=u'http://www.stlyrics.com',
|
||||
path=u'/songs/r/richiehavens48961/ladymadonna2069109.html'),
|
||||
dict(definfo,
|
||||
dict(definfo,
|
||||
url=u'http://www.sweetslyrics.com',
|
||||
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
|
||||
]
|
||||
|
||||
|
||||
|
||||
def setUp(self):
|
||||
"""Set up configuration"""
|
||||
lyrics.LyricsPlugin()
|
||||
|
|
|
|||
Loading…
Reference in a new issue