Leave a single chef in the kitchen

This commit is contained in:
Šarūnas Nejus 2024-09-11 07:51:43 +01:00
parent cb29605bfd
commit 7c2fb31136
No known key found for this signature in database
GPG key ID: DD28F6704DBE3435
2 changed files with 23 additions and 62 deletions

View file

@ -44,8 +44,7 @@ if TYPE_CHECKING:
from beets.library import Item
try:
import bs4
from bs4 import SoupStrainer
from bs4 import BeautifulSoup
HAS_BEAUTIFUL_SOUP = True
except ImportError:
@ -246,17 +245,6 @@ def slug(text):
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
if HAS_BEAUTIFUL_SOUP:
def try_parse_html(html, **kwargs):
return bs4.BeautifulSoup(html, "html.parser", **kwargs)
else:
def try_parse_html(html, **kwargs):
return None
class RequestHandler:
_log: beets.logging.Logger
@ -565,9 +553,7 @@ class Genius(SearchBackend):
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
return self._scrape_lyrics_from_html(
self.fetch_text(result["url"])
)
return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
return None
@ -584,17 +570,9 @@ class Genius(SearchBackend):
headers=self.headers,
)
def replace_br(self, lyrics_div):
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
def _scrape_lyrics_from_html(self, html: str) -> str | None:
def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = try_parse_html(html)
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in soup("script")]
soup = get_soup(html)
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
@ -609,7 +587,6 @@ class Genius(SearchBackend):
)
lyrics = ""
for lyrics_div in lyrics_divs:
self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
@ -633,7 +610,6 @@ class Genius(SearchBackend):
return None
lyrics_div = verse_div.parent
self.replace_br(lyrics_div)
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
@ -665,17 +641,14 @@ class Tekstowo(DirectBackend):
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
return self.extract_lyrics(
return self.scrape_lyrics(
self.fetch_text(self.build_url(artist, title))
)
return None
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
soup = try_parse_html(html)
def scrape_lyrics(self, html: str) -> str | None:
soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@ -723,33 +696,11 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"<div .*>\s*</div>", "\n", html)
def scrape_lyrics_from_html(html: str) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
def is_text_notcode(text):
if not text:
return False
length = len(text)
return (
length > 20
and text.count(" ") > length / 25
and (text.find("{") == -1 or text.find(";") == -1)
)
def get_soup(html: str) -> BeautifulSoup:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
# extract all long text blocks that are not code
soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
if strings:
return strings[0]
else:
return None
return BeautifulSoup(html, "html.parser")
class Google(SearchBackend):
@ -757,6 +708,16 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
@staticmethod
def scrape_lyrics(html: str) -> str | None:
soup = get_soup(html)
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
if strings:
return strings[0]
return None
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@ -843,7 +804,7 @@ class Google(SearchBackend):
if not check_candidate(url_link, item.get("title", "")):
continue
with self.handle_request():
lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
lyrics = self.scrape_lyrics(self.fetch_text(url_link))
if not lyrics:
continue

View file

@ -328,7 +328,7 @@ class TestGoogleLyrics(LyricsBackendTest):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
result = backend.scrape_lyrics(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
@ -390,7 +390,7 @@ class TestGeniusLyrics(LyricsBackendTest):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
result = backend._scrape_lyrics_from_html(lyrics_html) or ""
result = backend.scrape_lyrics(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@ -411,7 +411,7 @@ class TestTekstowoLyrics(LyricsBackendTest):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950