From 84d6106a30d48633b908dbd68d8796edda35eca9 Mon Sep 17 00:00:00 2001
From: Jim Miller <retiefjimm@gmail.com>
Date: Sun, 26 Nov 2023 11:32:41 -0600
Subject: [PATCH] Better handling of &<> entities with stripHTML() and chapter
 titles. #1019

---
 fanficfare/adapters/base_adapter.py              | 6 ------
 fanficfare/adapters/base_xenforoforum_adapter.py | 2 +-
 fanficfare/htmlcleanup.py                        | 6 ++++++
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py
index 88a56d0b..793f7607 100644
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@@ -171,12 +171,6 @@ class BaseSiteAdapter(Requestable):
             meta = defaultdict(unicode,othermeta) # copy othermeta
             if title:
                 title = stripHTML(title,remove_all_entities=False)
-                # Put the basic 3 html entities back in.
-                # bs4 is 'helpfully' removing them.
-                ## Now with more checking because bs4 is apparently
-                ## not *always* removing them now.
-                if '&' in title and '&amp;' not in title:
-                    title = title.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
             else:
                 ## A default value for when there's no chapter
                 ## title. Cropped up once with adapter_novelonlinefullcom
diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py
index bec569bc..80494c27 100644
--- a/fanficfare/adapters/base_xenforoforum_adapter.py
+++ b/fanficfare/adapters/base_xenforoforum_adapter.py
@@ -406,7 +406,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
                 if after:
                     # logger.debug("AFTER "*10)
                     after=False
-                url,name = atag['href'],stripHTML(atag)
+                url,name = atag['href'],stripHTML(atag,remove_all_entities=False)
                 date = self.get_threadmark_date(tm_item)
                 words,kwords = self.get_threadmark_words(tm_item)
                 if 'http' not in url:
diff --git a/fanficfare/htmlcleanup.py b/fanficfare/htmlcleanup.py
index 4b2caef2..5e19ba81 100644
--- a/fanficfare/htmlcleanup.py
+++ b/fanficfare/htmlcleanup.py
@@ -69,6 +69,12 @@ def stripHTML(soup, remove_all_entities=True):
     else:
         # bs4 already converts all the entities to UTF8 chars.
         retval = soup.get_text(strip=True)
+        if not remove_all_entities:
+            # Put the basic 3 entities (&amp;, &lt;, &gt;) back; get_text() above returns them as plain characters.
+            if '&' in retval and '&amp;' not in retval:
+                # Only escape '&' when no '&amp;' is present, in case this is called more than once on the same text.
+                retval = retval.replace('&','&amp;')
+            retval = retval.replace('<','&lt;').replace('>','&gt;')
     # some change in the python3 branch started making &nbsp; '\xc2\xa0'
     # instead of ' '
     return ensure_text(retval).replace(u'\xc2\xa0',' ').strip()