From 84d6106a30d48633b908dbd68d8796edda35eca9 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 26 Nov 2023 11:32:41 -0600 Subject: [PATCH] Better handling of &<> entities with stripHTML() and chapter titles. #1019 --- fanficfare/adapters/base_adapter.py | 6 ------ fanficfare/adapters/base_xenforoforum_adapter.py | 2 +- fanficfare/htmlcleanup.py | 6 ++++++ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index 88a56d0b..793f7607 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -171,12 +171,6 @@ class BaseSiteAdapter(Requestable): meta = defaultdict(unicode,othermeta) # copy othermeta if title: title = stripHTML(title,remove_all_entities=False) - # Put the basic 3 html entities back in. - # bs4 is 'helpfully' removing them. - ## Now with more checking because bs4 is apparently - ## not *always* removing them now. - if '&' in title and '&amp;' not in title: - title = title.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;') else: ## A default value for when there's no chapter ## title. 
Cropped up once with adapter_novelonlinefullcom diff --git a/fanficfare/adapters/base_xenforoforum_adapter.py b/fanficfare/adapters/base_xenforoforum_adapter.py index bec569bc..80494c27 100644 --- a/fanficfare/adapters/base_xenforoforum_adapter.py +++ b/fanficfare/adapters/base_xenforoforum_adapter.py @@ -406,7 +406,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter): if after: # logger.debug("AFTER "*10) after=False - url,name = atag['href'],stripHTML(atag) + url,name = atag['href'],stripHTML(atag,remove_all_entities=False) date = self.get_threadmark_date(tm_item) words,kwords = self.get_threadmark_words(tm_item) if 'http' not in url: diff --git a/fanficfare/htmlcleanup.py b/fanficfare/htmlcleanup.py index 4b2caef2..5e19ba81 100644 --- a/fanficfare/htmlcleanup.py +++ b/fanficfare/htmlcleanup.py @@ -69,6 +69,12 @@ def stripHTML(soup, remove_all_entities=True): else: # bs4 already converts all the entities to UTF8 chars. retval = soup.get_text(strip=True) + if not remove_all_entities: + # put basic 3 entities back + if '&' in retval and '&amp;' not in retval: + # check in case called more than once. + retval = retval.replace('&','&amp;') + retval = retval.replace('<','&lt;').replace('>','&gt;') # some change in the python3 branch started making &nbsp; '\xc2\xa0' # instead of ' ' return ensure_text(retval).replace(u'\xc2\xa0',' ').strip()