mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-01-05 23:56:08 +01:00
Better handling of &<> entities with stripHTML() and chapter titles. #1019
This commit is contained in:
parent
6761cae9c1
commit
84d6106a30
3 changed files with 7 additions and 7 deletions
|
|
@@ -171,12 +171,6 @@ class BaseSiteAdapter(Requestable):
|
|||
meta = defaultdict(unicode,othermeta) # copy othermeta
|
||||
if title:
|
||||
title = stripHTML(title,remove_all_entities=False)
|
||||
# Put the basic 3 html entities back in.
|
||||
# bs4 is 'helpfully' removing them.
|
||||
## Now with more checking because bs4 is apparently
|
||||
## not *always* removing them now.
|
||||
if '&' in title and '&amp;' not in title:
|
||||
title = title.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
|
||||
else:
|
||||
## A default value for when there's no chapter
|
||||
## title. Cropped up once with adapter_novelonlinefullcom
|
||||
|
|
|
|||
|
|
@@ -406,7 +406,7 @@ class BaseXenForoForumAdapter(BaseSiteAdapter):
|
|||
if after:
|
||||
# logger.debug("AFTER "*10)
|
||||
after=False
|
||||
url,name = atag['href'],stripHTML(atag)
|
||||
url,name = atag['href'],stripHTML(atag,remove_all_entities=False)
|
||||
date = self.get_threadmark_date(tm_item)
|
||||
words,kwords = self.get_threadmark_words(tm_item)
|
||||
if 'http' not in url:
|
||||
|
|
|
|||
|
|
@@ -69,6 +69,12 @@ def stripHTML(soup, remove_all_entities=True):
|
|||
else:
|
||||
# bs4 already converts all the entities to UTF8 chars.
|
||||
retval = soup.get_text(strip=True)
|
||||
if not remove_all_entities:
|
||||
# put basic 3 entities back
|
||||
if '&' in retval and '&amp;' not in retval:
|
||||
# check in case called more than once.
|
||||
retval = retval.replace('&','&amp;')
|
||||
retval = retval.replace('<','&lt;').replace('>','&gt;')
|
||||
# some change in the python3 branch started making '\xc2\xa0'
|
||||
# instead of ' '
|
||||
return ensure_text(retval).replace(u'\xc2\xa0',' ').strip()
|
||||
|
|
|
|||
Loading…
Reference in a new issue