mirror of
https://github.com/beetbox/beets.git
synced 2026-02-28 10:15:23 +01:00
Merge pull request #325 from KraYmer/master
lyrics: google backend should turn up more results
This commit is contained in:
commit
02c305e20c
1 changed files with 39 additions and 56 deletions
|
|
@ -121,6 +121,7 @@ def strip_cruft(lyrics, wscollapse=True):
|
|||
lyrics = re.sub(r'\n +', '\n', lyrics)
|
||||
lyrics = re.sub(r' +\n', '\n', lyrics)
|
||||
lyrics = TAG_RE.sub('', lyrics) # Strip remaining HTML tags.
|
||||
lyrics = lyrics.replace('\r','\n')
|
||||
lyrics = lyrics.strip()
|
||||
return lyrics
|
||||
|
||||
|
|
@ -204,13 +205,17 @@ def slugify(text):
|
|||
log.exception("Failing to normalize '%s'" % (text))
|
||||
return urllib.quote(text)
|
||||
|
||||
|
||||
BY_TRANS = ['by', 'par']
|
||||
LYRICS_TRANS = ['lyrics', 'paroles']
|
||||
|
||||
def is_page_candidate(urlLink, urlTitle, title, artist):
|
||||
"""Return True if the URL title makes it a good candidate to be a
|
||||
page that contains lyrics of title by artist.
|
||||
"""
|
||||
title = slugify(title.lower())
|
||||
artist = slugify(artist.lower())
|
||||
urlLink = slugify(urlLink.lower())
|
||||
sitename = re.search(u"//([^/]+)/.*", slugify(urlLink.lower())).group(1)
|
||||
urlTitle = slugify(urlTitle.lower())
|
||||
|
||||
# Check if URL title contains song title (exact match)
|
||||
|
|
@ -218,8 +223,10 @@ def is_page_candidate(urlLink, urlTitle, title, artist):
|
|||
return True
|
||||
# or try extracting song title from URL title and check if
|
||||
# they are close enough
|
||||
songTitle = urlTitle.replace('lyrics', '') \
|
||||
.replace(artist, '').strip('%20')
|
||||
tokens = [by+'%20'+artist for by in BY_TRANS] + \
|
||||
[artist, sitename, sitename.replace('www.','')] + LYRICS_TRANS
|
||||
songTitle = re.sub(u'(%s)' % u'|'.join(tokens) ,u'', urlTitle).strip('%20')
|
||||
|
||||
if songTitle:
|
||||
log.debug("Match ratio of '%s' with title: %s" %
|
||||
(songTitle,
|
||||
|
|
@ -237,17 +244,6 @@ def insert_line_feeds(text):
|
|||
tokensStr[idx] = ltoken[0] + '\n' + ltoken[1]
|
||||
return ''.join(tokensStr)
|
||||
|
||||
def decimate_line_feeds(text):
|
||||
"""Decimate newline characters. By default use only one newline as
|
||||
an end-of-line marker. Keep at most two newlines in a row (e.g., to
|
||||
separate verses).
|
||||
"""
|
||||
# Remove first occurrence of \n for each sequence of \n
|
||||
text = re.sub(r'\n(\n+)', '\g<1>', text)
|
||||
# Keep at most two \n in a row
|
||||
text = re.sub(r'\n\n+', '\n\n', text)
|
||||
return text.strip('\n')
|
||||
|
||||
def sanitize_lyrics(text):
|
||||
"""Clean text, returning raw lyrics as output or None if it happens
|
||||
that input text is actually not lyrics content. Clean (x)html tags
|
||||
|
|
@ -255,32 +251,26 @@ def sanitize_lyrics(text):
|
|||
"""
|
||||
text = strip_cruft(text, False)
|
||||
|
||||
# Suppress advertisements.
|
||||
# Match lines with an opening bracket but no ending one, ie lines that
|
||||
# contained html link that has been wiped out when scraping.
|
||||
LINK1_RE = re.compile(r'(\(|\[).*[^\)\]]$')
|
||||
# Match lines containing url between brackets
|
||||
LINK2_RE = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
|
||||
text = LINK1_RE.sub('', text)
|
||||
text = LINK2_RE.sub('', text)
|
||||
|
||||
# Restore \n in input text
|
||||
if '\n' not in text:
|
||||
text = insert_line_feeds(text)
|
||||
|
||||
# Suppress advertisements.
|
||||
textLines = text.splitlines(True)
|
||||
# Match lines with an opening bracket but no ending one, ie lines that
|
||||
# contained html link that has been wiped out when scraping.
|
||||
reAdHtml = re.compile(r'(\(|\[).*[^\)\]]$')
|
||||
# Match lines containing url between brackets
|
||||
reAdTxt = re.compile(r'(\(|\[).*[http|www].*(\]|\))')
|
||||
for line in textLines:
|
||||
if re.match(reAdHtml, line) or re.match(reAdTxt, line):
|
||||
textLines.remove(line)
|
||||
while text.count('\n\n') > text.count('\n')/4:
|
||||
# Remove first occurrence of \n for each sequence of \n
|
||||
text = re.sub(r'\n(\n+)', '\g<1>', text)
|
||||
|
||||
# \n might have been duplicated during the scraping.
|
||||
# decimate \n while number of \n represent more than half the number of
|
||||
# lines
|
||||
while len([x for x in textLines if x == '\n']) >= len(textLines) / 2 - 1:
|
||||
if len(textLines) <= 3:
|
||||
break
|
||||
text = ''.join(textLines)
|
||||
text = decimate_line_feeds(text)
|
||||
textLines = [line.strip(' ') for line in text.splitlines(True)]
|
||||
text = re.sub(r'\n\n+', '\n\n', text) # keep at most two \n in a row
|
||||
|
||||
return ''.join(textLines)
|
||||
return text
|
||||
|
||||
def is_lyrics(text, artist):
|
||||
"""Determine whether the text seems to be valid lyrics.
|
||||
|
|
@ -305,32 +295,25 @@ def scrape_lyrics_from_url(url):
|
|||
"""Scrape lyrics from a URL. If no lyrics can be found, return None
|
||||
instead.
|
||||
"""
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from bs4 import BeautifulSoup, Tag, Comment
|
||||
html = fetch_url(url)
|
||||
soup = BeautifulSoup(html)
|
||||
|
||||
# Simplify the code by replacing some markers by the <p> marker
|
||||
try:
|
||||
for tag in soup.findAll(['center', 'blockquote']):
|
||||
pTag = Tag(soup, "p")
|
||||
pTag.contents = tag.contents
|
||||
tag.replaceWith(pTag)
|
||||
|
||||
for tag in soup.findAll(['script', 'a', 'font']):
|
||||
tag.replaceWith('<p>')
|
||||
|
||||
except Exception, e:
|
||||
log.debug('Error %s when replacing containing marker by p marker' % e,
|
||||
exc_info=True)
|
||||
|
||||
for tag in soup.findAll('br'):
|
||||
tag.replaceWith('\n')
|
||||
|
||||
# Keep only tags that can possibly be parent tags and eol
|
||||
for tag in soup.findAll(True):
|
||||
containers = ['div', 'p', 'html', 'body', 'table', 'tr', 'td', 'br']
|
||||
if tag.name not in containers:
|
||||
tag.extract()
|
||||
# Remove non relevant html parts
|
||||
[s.extract() for s in soup(['head', 'script'])]
|
||||
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
|
||||
[s.extract() for s in comments]
|
||||
|
||||
try:
|
||||
for tag in soup.findAll(True):
|
||||
tag.name = 'p' # keep tag contents
|
||||
|
||||
except Exception, e:
|
||||
log.debug('Error %s when replacing containing marker by p marker' % e,
|
||||
exc_info=True)
|
||||
|
||||
# Make better soup from current soup! The previous unclosed <p> sections
|
||||
# are now closed. Use str() rather than prettify() as it's more
|
||||
|
|
@ -346,6 +329,7 @@ def scrape_lyrics_from_url(url):
|
|||
pTag.insert(0, bodyTag)
|
||||
|
||||
tagTokens = []
|
||||
|
||||
for tag in soup.findAll('p'):
|
||||
soup2 = BeautifulSoup(str(tag))
|
||||
# Extract all text of <p> section.
|
||||
|
|
@ -355,8 +339,6 @@ def scrape_lyrics_from_url(url):
|
|||
# Lyrics are expected to be the longest paragraph
|
||||
tagTokens = sorted(tagTokens, key=len, reverse=True)
|
||||
soup = BeautifulSoup(tagTokens[0])
|
||||
if soup.findAll(['div', 'a']):
|
||||
return None
|
||||
return unescape(tagTokens[0].strip("\n\r: "))
|
||||
|
||||
def fetch_google(artist, title):
|
||||
|
|
@ -459,6 +441,7 @@ class LyricsPlugin(BeetsPlugin):
|
|||
(item.artist, item.title))
|
||||
|
||||
item.lyrics = lyrics
|
||||
|
||||
if write:
|
||||
item.write()
|
||||
lib.store(item)
|
||||
|
|
|
|||
Loading…
Reference in a new issue