mirror of
https://github.com/beetbox/beets.git
synced 2026-01-28 19:16:10 +01:00
commit
ea89cf32eb
3 changed files with 39 additions and 47 deletions
|
|
@ -33,10 +33,10 @@ from beets import config
|
|||
|
||||
log = logging.getLogger('beets')
|
||||
|
||||
DIV_RE = re.compile(r'<(/?)div>?')
|
||||
DIV_RE = re.compile(r'<(/?)div>?', re.I)
|
||||
COMMENT_RE = re.compile(r'<!--.*-->', re.S)
|
||||
TAG_RE = re.compile(r'<[^>]*>')
|
||||
BREAK_RE = re.compile(r'<br\s*/?>')
|
||||
BREAK_RE = re.compile(r'<br\s*/?>', re.I)
|
||||
URL_CHARACTERS = {
|
||||
u'\u2018': u"'",
|
||||
u'\u2019': u"'",
|
||||
|
|
@ -122,6 +122,7 @@ def strip_cruft(lyrics, wscollapse=True):
|
|||
lyrics = unescape(lyrics)
|
||||
if wscollapse:
|
||||
lyrics = re.sub(r'\s+', ' ', lyrics) # Whitespace collapse.
|
||||
|
||||
lyrics = re.sub(r'<(script).*?</\1>(?s)', '', lyrics) # Strip script tags.
|
||||
lyrics = BREAK_RE.sub('\n', lyrics) # <BR> newlines.
|
||||
lyrics = re.sub(r'\n +', '\n', lyrics)
|
||||
|
|
@ -294,36 +295,6 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
|
|||
return difflib.SequenceMatcher(None, songTitle, title).ratio() >= typoRatio
|
||||
|
||||
|
||||
def insert_line_feeds(text):
    """Insert a newline between a lower-case ASCII letter and an
    upper-case ASCII letter that immediately follows it.

    Only ``[a-z][A-Z]`` boundaries are split (e.g. ``'oneTwo'`` becomes
    ``'one\\nTwo'``); an upper-case letter preceded by anything else
    (space, digit, punctuation, another capital) is left untouched.

    Returns the text with the newlines inserted; text without such a
    boundary is returned unchanged.
    """
    # One substitution replaces the former split / patch-odd-tokens / join
    # sequence; both operate on the same non-overlapping matches.
    return re.sub(r'([a-z])([A-Z])', r'\1\n\2', text)
|
||||
|
||||
|
||||
def sanitize_lyrics(text):
    """Clean scraped text and normalise its line breaks.

    The text is first passed through ``strip_cruft`` with whitespace
    collapsing disabled. If the result contains no newline at all, line
    breaks are re-created with ``insert_line_feeds``. Runs of consecutive
    newlines are then shortened until blank lines are no longer
    over-represented, and finally capped at two newlines in a row.

    Always returns the cleaned string; deciding whether the text really
    is lyrics is left to the caller.
    """
    text = strip_cruft(text, False)

    # Restore \n in input text
    if '\n' not in text:
        text = insert_line_feeds(text)

    # While '\n\n' occurs more often than a quarter of all newlines, the
    # text is treated as over-spaced and every run is shortened by one.
    while text.count('\n\n') > text.count('\n') // 4:
        # Remove first occurrence of \n for each sequence of \n.
        # Raw string: '\g' is not a valid escape in a normal literal.
        text = re.sub(r'\n(\n+)', r'\g<1>', text)

    text = re.sub(r'\n\n+', '\n\n', text)  # keep at most two \n in a row

    return text
|
||||
|
||||
|
||||
def remove_credits(text):
|
||||
"""Remove first/last line of text if it contains the word 'lyrics'
|
||||
eg 'Lyrics by songsdatabase.com'
|
||||
|
|
@ -343,7 +314,6 @@ def is_lyrics(text, artist=None):
|
|||
"""
|
||||
if not text:
|
||||
return
|
||||
|
||||
badTriggersOcc = []
|
||||
nbLines = text.count('\n')
|
||||
if nbLines <= 1:
|
||||
|
|
@ -356,7 +326,7 @@ def is_lyrics(text, artist=None):
|
|||
# down
|
||||
text = remove_credits(text)
|
||||
|
||||
badTriggers = ['lyrics', 'copyright', 'property']
|
||||
badTriggers = ['lyrics', 'copyright', 'property', 'links']
|
||||
if artist:
|
||||
badTriggersOcc += [artist]
|
||||
|
||||
|
|
@ -450,7 +420,7 @@ def fetch_google(artist, title):
|
|||
if not lyrics:
|
||||
continue
|
||||
|
||||
lyrics = sanitize_lyrics(lyrics)
|
||||
lyrics = strip_cruft(lyrics, False)
|
||||
|
||||
if is_lyrics(lyrics, artist):
|
||||
log.debug(u'got lyrics from {0}'.format(item['displayLink']))
|
||||
|
|
|
|||
|
|
@ -90,16 +90,13 @@ class LyricsScrapingPluginTest(unittest.TestCase):
|
|||
# Use default query when possible, or override artist and title field
|
||||
# if website don't have lyrics for default query.
|
||||
sourcesOk = [
|
||||
dict(definfo, url=u'http://www.smartlyrics.com',
|
||||
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
|
||||
dict(definfo, url='http://www.songlyrics.com',
|
||||
path=u'/the-beatles/lady-madonna-lyrics'),
|
||||
dict(definfo, url=u'http://www.elyricsworld.com',
|
||||
path=u'/lady_madonna_lyrics_beatles.html'),
|
||||
dict(artist=u'Beres Hammond', title=u'I could beat myself',
|
||||
url=u'http://www.reggaelyrics.info',
|
||||
path=u'/beres-hammond/i-could-beat-myself'),
|
||||
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
|
||||
url=u'http://www.lyricsmania.com',
|
||||
path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
|
||||
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
|
||||
url=u'http://www.paroles.net/',
|
||||
path=u'lilly-wood-the-prick/paroles-hey-it-s-ok'),
|
||||
|
|
@ -108,25 +105,28 @@ class LyricsScrapingPluginTest(unittest.TestCase):
|
|||
path=u'/amy-winehouse-songs/jazz-n-blues-lyrics.html'),
|
||||
dict(definfo, url=u'http://www.sweetslyrics.com',
|
||||
path=u'/761696.The%20Beatles%20-%20Lady%20Madonna.html'),
|
||||
dict(definfo, url=u'http://www.lyrics007.com',
|
||||
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
|
||||
dict(definfo, url=u'http://www.absolutelyrics.com',
|
||||
path=u'/lyrics/view/the_beatles/lady_madonna'),
|
||||
dict(definfo, url=u'http://www.azlyrics.com/',
|
||||
path=u'/lyrics/beatles/ladymadonna.html'),
|
||||
dict(definfo, url=u'http://www.chartlyrics.com',
|
||||
path=u'/_LsLsZ7P4EK-F-LD4dJgDQ/Lady+Madonna.aspx'),
|
||||
dict(definfo, url='http://www.releaselyrics.com',
|
||||
path=u'/e35f/the-beatles-lady-madonna'),
|
||||
]
|
||||
|
||||
# Websites that can't be scraped yet and whose results must be
|
||||
# flagged as invalid lyrics.
|
||||
sourcesFail = [
|
||||
dict(definfo, url='http://www.songlyrics.com',
|
||||
path=u'/the-beatles/lady-madonna-lyrics'),
|
||||
dict(definfo, url=u'http://www.smartlyrics.com',
|
||||
path=u'/Song18148-The-Beatles-Lady-Madonna-lyrics.aspx'),
|
||||
dict(definfo, url='http://www.metrolyrics.com/',
|
||||
path='best-for-last-lyrics-adele.html')
|
||||
path='best-for-last-lyrics-adele.html'),
|
||||
dict(definfo, artist=u'Lilly Wood & the prick', title=u"Hey it's ok",
|
||||
url=u'http://www.lyricsmania.com',
|
||||
path=u'/hey_its_ok_lyrics_lilly_wood_and_the_prick.html'),
|
||||
dict(definfo, url=u'http://www.lyrics007.com',
|
||||
path=u'/The%20Beatles%20Lyrics/Lady%20Madonna%20Lyrics.html'),
|
||||
dict(definfo, url='http://www.releaselyrics.com',
|
||||
path=u'/e35f/the-beatles-lady-madonna'),
|
||||
]
|
||||
|
||||
# Websites that return truncated lyrics because of scraping issues, and
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This file is part of beets.
|
||||
# Copyright 2014, Fabrice Laporte.
|
||||
#
|
||||
|
|
@ -111,11 +112,32 @@ class LyricsPluginTest(unittest.TestCase):
|
|||
lyrics.remove_credits("""Lyrics brought by example.com"""),
|
||||
""
|
||||
)
|
||||
|
||||
# don't remove 2nd verse for the only reason it contains 'lyrics' word
|
||||
text = """Look at all the shit that i done bought her
|
||||
See lyrics ain't nothin
|
||||
if the beat aint crackin"""
|
||||
self.assertEqual(lyrics.remove_credits(text), text)
|
||||
|
||||
def test_strip_cruft(self):
    """Check strip_cruft on a snippet holding an HTML comment, a script
    tag, an upper-case <BR> and a <blink> element.
    """
    markup = """<!--lyrics below-->
<script type="javascript">
One<BR>\r\n
<blink>Two</blink>
"""
    cleaned = lyrics.strip_cruft(markup)
    self.assertEqual(cleaned, u"One\nTwo")
|
||||
|
||||
def test_is_lyrics(self):
    """is_lyrics must give a falsy result for each site-boilerplate
    sample (copyright and ownership notices).
    """
    boilerplate = [
        'LyricsMania.com - Copyright (c) 2013 - All Rights Reserved',
        """All material found on this site is property\n
of mywickedsongtext brand""",
    ]
    for sample in boilerplate:
        self.assertFalse(lyrics.is_lyrics(sample))
|
||||
|
||||
def test_slugify(self):
    """Compare slugify's output for a URL containing an accented letter,
    a hyphen and a parenthesised suffix against the expected slug.
    """
    url = u"http://site.com/çafe-au_lait(boisson)"
    expected = 'http://site.com/cafe_au_lait'
    self.assertEqual(lyrics.slugify(url), expected)
|
||||
|
||||
|
||||
def suite():
    """Build and return the test suite loaded from this module by name."""
    loader = unittest.TestLoader()
    module_tests = loader.loadTestsFromName(__name__)
    return module_tests
|
||||
|
|
|
|||
Loading…
Reference in a new issue