Better handling of &<> entities with stripHTML() and chapter titles. #1019

This commit is contained in:
Jim Miller 2023-11-26 11:32:41 -06:00
parent 6761cae9c1
commit 84d6106a30
3 changed files with 7 additions and 7 deletions

View file

@@ -171,12 +171,6 @@ class BaseSiteAdapter(Requestable):
meta = defaultdict(unicode,othermeta) # copy othermeta
if title:
title = stripHTML(title,remove_all_entities=False)
# Put the basic 3 html entities back in.
# bs4 is 'helpfully' removing them.
## Now with more checking because bs4 is apparently
## not *always* removing them now.
if '&' in title and '&amp;' not in title:
title = title.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
else:
## A default value for when there's no chapter
## title. Cropped up once with adapter_novelonlinefullcom

View file

@@ -406,7 +406,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
if after:
# logger.debug("AFTER "*10)
after=False
url,name = atag['href'],stripHTML(atag)
url,name = atag['href'],stripHTML(atag,remove_all_entities=False)
date = self.get_threadmark_date(tm_item)
words,kwords = self.get_threadmark_words(tm_item)
if 'http' not in url:

View file

@@ -69,6 +69,12 @@ def stripHTML(soup, remove_all_entities=True):
else:
# bs4 already converts all the entities to UTF8 chars.
retval = soup.get_text(strip=True)
if not remove_all_entities:
# put basic 3 entities back
if '&' in retval and '&amp;' not in retval:
# check in case called more than once.
retval = retval.replace('&','&amp;')
retval = retval.replace('<','&lt;').replace('>','&gt;')
# some change in the python3 branch started making &nbsp; '\xc2\xa0'
# instead of ' '
return ensure_text(retval).replace(u'\xc2\xa0',' ').strip()