mirror of
https://github.com/beetbox/beets.git
synced 2026-01-07 16:34:45 +01:00
Address failing google sources tests
Two Google sources failed to return the expected output. I looked into why parsing failed in each case: - lyrics on musica.com contain <aside> Google Ads - each lyrics line on lacoccinelle.net is wrapped within alternating <em> and <strong> tags. Thus, remove these tags as part of the HTML cleanup logic.
This commit is contained in:
parent
e99d457c9d
commit
3b73a26002
1 changed file with 2 additions and 0 deletions
|
|
@@ -536,6 +536,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
|
|||
html = BREAK_RE.sub("\n", html) # <br> eats up surrounding '\n'.
|
||||
html = re.sub(r"(?s)<(script).*?</\1>", "", html) # Strip script tags.
|
||||
html = re.sub("\u2005", " ", html) # replace unicode with regular space
|
||||
html = re.sub("<aside .+?</aside>", "", html) # remove Google Ads tags
|
||||
html = re.sub(r"</?(em|strong)[^>]*>", "", html) # remove italics / bold
|
||||
|
||||
if plain_text_out: # Strip remaining HTML tags
|
||||
html = COMMENT_RE.sub("", html)
|
||||
|
|
|
|||
Loading…
Reference in a new issue