Fix incorrect soup usage in various recipes

Also make SoupStrainer available in calibre.ebooks.BeautifulSoup
2026-05-05 23:33:47 +02:00 · 2019-03-25 10:17:27 +05:30 · 2019-03-25 10:17:27 +05:30 · ba59ac679d
commit ba59ac679d
parent de9d97d688
6 changed files with 10 additions and 12 deletions
--- a/recipes/calcalist.recipe
+++ b/recipes/calcalist.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import re
+import re


 class AdvancedUserRecipe1283848012(BasicNewsRecipe):
--- a/recipes/globes_co_il.recipe
+++ b/recipes/globes_co_il.recipe
@@ -1,5 +1,5 @@
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import re
+import re


 class AdvancedUserRecipe1283848012(BasicNewsRecipe):
--- a/recipes/metro_news_nl.recipe
+++ b/recipes/metro_news_nl.recipe
@@ -3,7 +3,6 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
 from calibre.utils.magick import Image
-from calibre.ebooks.BeautifulSoup import BeautifulSoup

 ''' Version 1.2, updated cover image to match the changed website.
 added info date on title
@@ -163,7 +162,7 @@ def safeRemovePart(self, killingSoup, soupIsArray):
        return killingSoup


-class MerryProcess(BeautifulSoup):
+class MerryProcess(object):
    myKiller = MerryExtract()
    myPrepare = MerryPreProcess()

--- a/recipes/roger_ebert.recipe
+++ b/recipes/roger_ebert.recipe
@@ -1,7 +1,6 @@
 import re
 import urllib2
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer


 class Ebert(BasicNewsRecipe):
@@ -78,8 +77,8 @@ def parse_index(self):
                    description = match.group(2)

                self.log(thislink)
-
-                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
+                soup = self.index_to_soup(thislink)
+                for link in soup.findAll('a', href=True):
                    thisurl = self.PREFIX + link['href']
                    thislinktext = self.tag_to_string(link)

@@ -91,7 +90,7 @@ def parse_index(self):
                    if thistitle == '':
                        thistitle = 'Ebert Journal Post'

-                    """
+                    r"""
                    pattern2 = r'AID=\/(.*?)\/'
                    reg2 = re.compile(pattern2, re.IGNORECASE|re.DOTALL)
                    match2 = reg2.search(thisurl)
--- a/recipes/roger_ebert_blog.recipe
+++ b/recipes/roger_ebert_blog.recipe
@@ -2,7 +2,6 @@
 import urllib2
 import time
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, SoupStrainer
 from calibre import strftime

 '''
@@ -94,8 +93,9 @@ def parse_index(self):
                    description = match.group(2)

                self.log(thislink)
+                soup = self.index_to_soup(thislink)

-                for link in BeautifulSoup(thislink, parseOnlyThese=SoupStrainer('a')):
+                for link in soup.findAll('a', href=True):
                    thisurl = self.PREFIX + link['href']
                    thislinktext = self.tag_to_string(link)

--- a/src/calibre/ebooks/BeautifulSoup.py
+++ b/src/calibre/ebooks/BeautifulSoup.py
@@ -6,8 +6,8 @@

 import bs4
 from bs4 import (  # noqa
-    CData, Comment, Declaration, NavigableString, ProcessingInstruction, Tag,
-    __version__
+    CData, Comment, Declaration, NavigableString, ProcessingInstruction,
+    SoupStrainer, Tag, __version__
 )

 from polyglot.builtins import unicode_type