Better handling of &<> entities with stripHTML() and chapter titles. #1019

2026-01-05 23:56:08 +01:00 · 2023-11-26 11:32:41 -06:00 · 2023-11-26 11:32:41 -06:00 · 84d6106a30
commit 84d6106a30
parent 6761cae9c1
3 changed files with 7 additions and 7 deletions
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@@ -171,12 +171,6 @@ class BaseSiteAdapter(Requestable):
            meta = defaultdict(unicode,othermeta) # copy othermeta
            if title:
                title = stripHTML(title,remove_all_entities=False)
-                # Put the basic 3 html entities back in.
-                # bs4 is 'helpfully' removing them.
-                ## Now with more checking because bs4 is apparently
-                ## not *always* removing them now.
-                if '&' in title and '&amp;' not in title:
-                    title = title.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
            else:
                ## A default value for when there's no chapter
                ## title. Cropped up once with adapter_novelonlinefullcom
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@@ -406,7 +406,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                if after:
                    # logger.debug("AFTER "*10)
                    after=False
-                url,name = atag['href'],stripHTML(atag)
+                url,name = atag['href'],stripHTML(atag,remove_all_entities=False)
                date = self.get_threadmark_date(tm_item)
                words,kwords = self.get_threadmark_words(tm_item)
                if 'http' not in url:
--- a/fanficfare/htmlcleanup.py
+++ b/fanficfare/htmlcleanup.py
@@ -69,6 +69,12 @@ def stripHTML(soup, remove_all_entities=True):
    else:
        # bs4 already converts all the entities to UTF8 chars.
        retval = soup.get_text(strip=True)
+        if not remove_all_entities:
+            # put basic 3 entities back
+            if '&' in retval and '&amp;' not in retval:
+                # check in case called more than once.
+                retval = retval.replace('&','&amp;')
+            retval = retval.replace('<','&lt;').replace('>','&gt;')
    # some change in the python3 branch started making &nbsp; '\xc2\xa0'
    # instead of ' '
    return ensure_text(retval).replace(u'\xc2\xa0',' ').strip()