Fix Get URLs from Page for poorly formed HTML: parse the page twice (double soup), as base_adapter does.

2026-05-02 03:48:40 +02:00 · 2016-10-31 12:42:41 -05:00 · 2016-10-31 12:42:41 -05:00 · f8132eb14b
commit f8132eb14b
parent 96da2eab89
1 changed files with 3 additions and 1 deletions
--- a/fanficfare/geturls.py
+++ b/fanficfare/geturls.py
@@ -86,7 +86,9 @@ def get_urls_from_html(data,url=None,configuration=None,normalize=False,restrict
    if not configuration:
        configuration = Configuration(["test1.com"],"EPUB",lightweight=True)

-    soup = BeautifulSoup(data,"html5lib")
+    ## soup and re-soup because BS4/html5lib is more forgiving of
+    ## incorrectly nested tags that way.
+    soup = BeautifulSoup(unicode(BeautifulSoup(data,"html5lib")),"html5lib")
    if restrictsearch:
        soup = soup.find(*restrictsearch)
        #logger.debug("restrict search:%s"%soup)