diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index ef5ca3ed0..9f70e110d 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -143,7 +143,7 @@ def search_pairs(item): In addition to the artist and title obtained from the `item` the method tries to strip extra information like paranthesized suffixes - and featured artists from the strings and add them as caniddates. + and featured artists from the strings and add them as candidates. The method also tries to split multiple titles separated with `/`. """ @@ -319,7 +319,7 @@ def is_lyrics(text, artist=None): badTriggersOcc = [] nbLines = text.count('\n') if nbLines <= 1: - log.debug(u"Ignoring too short lyrics '{0}'".format(text)) + log.debug(u"Ignoring too short lyrics '{0}'".format(text.decode('utf8'))) return 0 elif nbLines < 5: badTriggersOcc.append('too_short') @@ -341,16 +341,21 @@ def is_lyrics(text, artist=None): return len(badTriggersOcc) < 2 - def _scrape_normalize_eol(html): - """Return html text where only authorized eol marker is \n + """Return html text where the only authorized eol marker is \n """ html.replace('\r','\n') # Replace
<br> without introducing superfluous newline in the output - BREAK_RE = re.compile(r'\n?\s*<br>\s*\n?', re.I) + BREAK_RE = re.compile(r'\n?\s*<br([\s|/][^>]*)*>\s*\n?', re.I) html = BREAK_RE.sub('\n', html) return html +def _scrape_merge_paragraphs(html): + regex = re.compile(r'
</p>
\s*<p(\s*[^>]*)>') + html = regex.sub('\n', html) + + return html + def _scrape_filter_soup(soup): """Remove sections from soup that cannot be parents of lyrics section """ @@ -360,13 +365,17 @@ def _scrape_filter_soup(soup): [s.extract() for s in comments] # Remove ads now as they can interrupt the lyrics block - ads = soup.find_all('div', class_=re.compile('ad')) + ads = soup.find_all('div', class_=re.compile('^ad')) [s.extract() for s in ads] return soup def _scrape_streamline_soup(soup): """Transform soup into a succession of
<p>
blocks """ + for tag in ['em','i','b','strong']: + for match in soup.find_all(tag): + match.unwrap() + try: for tag in soup.findAll(True): tag.name = 'p' # keep tag contents @@ -380,8 +389,8 @@ def _scrape_streamline_soup(soup): # conservative concerning EOL soup = BeautifulSoup(str(soup)) - # In case lyrics are nested in no markup but <br> - # Insert the whole body in a <p>

+ # Insert the whole body in a <p>

in case lyrics are nested in no markup but + # <br> bodyTag = soup.find('body') if bodyTag: pTag = soup.new_tag("p") @@ -393,7 +402,7 @@ def _scrape_longest_paragraph(soup): """Return longest paragraph from soup """ tagTokens = [] - + for tag in soup.findAll('p'): soup2 = BeautifulSoup(str(tag)) # Extract all text of
<p>
section. @@ -404,28 +413,18 @@ def _scrape_longest_paragraph(soup): soup = BeautifulSoup(tagTokens[0]) return unescape(tagTokens[0].strip("\n\r: ")) -def _scrape_custom_process_soup(soup): - """Apply custom operations on soup to handle cases for specific websites - """ - # metrolyrics.com: lyrics text is splitted into multiple
<p>
- for match in soup.find_all('p', class_='verse'): - match.insert_before('\n') - match.unwrap() - return soup - def scrape_lyrics_from_html(html): """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ if not html: return None - + html = _scrape_normalize_eol(html) + html = _scrape_merge_paragraphs(html) soup = BeautifulSoup(html) soup = _scrape_filter_soup(soup) soup = _scrape_streamline_soup(soup) - soup = _scrape_custom_process_soup(soup) - # print(soup) soup = _scrape_longest_paragraph(soup) return soup