mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-02 03:48:40 +02:00
Fix "Get URLs from Page" for poorly formed HTML: parse the page twice (double soup), as base_adapter does.
This commit is contained in:
parent
96da2eab89
commit
f8132eb14b
1 changed file with 3 additions and 1 deletion
|
|
@@ -86,7 +86,9 @@ def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrict
|
|||
if not configuration:
|
||||
configuration = Configuration(["test1.com"],"EPUB",lightweight=True)
|
||||
|
||||
soup = BeautifulSoup(data,"html5lib")
|
||||
## soup and re-soup because BS4/html5lib is more forgiving of
|
||||
## incorrectly nested tags that way.
|
||||
soup = BeautifulSoup(unicode(BeautifulSoup(data,"html5lib")),"html5lib")
|
||||
if restrictsearch:
|
||||
soup = soup.find(*restrictsearch)
|
||||
#logger.debug("restrict search:%s"%soup)
|
||||
|
|
|
|||
Loading…
Reference in a new issue