mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-05 11:00:47 +02:00
Fix for &<> entities in chapter titles.
This commit is contained in:
parent
c386df4e48
commit
2779e15961
3 changed files with 31 additions and 26 deletions
|
|
@ -39,7 +39,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
from ..story import Story
|
||||
from ..configurable import Configurable
|
||||
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
|
||||
from ..htmlcleanup import stripHTML
|
||||
from ..exceptions import InvalidStoryURL
|
||||
|
||||
# quick convenience class
|
||||
|
|
@ -156,7 +156,7 @@ class BaseSiteAdapter(Configurable):
|
|||
self.ignore_chapter_url_list = [ self.normalize_chapterurl(u) for u in self.getConfig('ignore_chapter_url_list').splitlines() ]
|
||||
if self.normalize_chapterurl(url) not in self.ignore_chapter_url_list:
|
||||
meta = defaultdict(unicode,othermeta) # copy othermeta
|
||||
meta.update({'title':stripHTML(title),'url':url}) # after other to make sure they are set
|
||||
meta.update({'title':stripHTML(title,remove_all_entities=False),'url':url}) # after other to make sure they are set
|
||||
self.chapterUrls.append(meta)
|
||||
self.story.setMetadata('numChapters', self.num_chapters())
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -62,9 +62,10 @@ def _replaceNotEntities(data):
|
|||
p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
|
||||
return p.sub(r'&\1', data)
|
||||
|
||||
def stripHTML(soup):
|
||||
def stripHTML(soup, remove_all_entities=True):
|
||||
if isinstance(soup,basestring):
|
||||
retval = removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()
|
||||
retval = removeEntities(re.sub(r'<[^>]+>','',"%s" % soup),
|
||||
remove_all_entities=remove_all_entities).strip()
|
||||
else:
|
||||
# bs4 already converts all the entities to UTF8 chars.
|
||||
retval = soup.get_text(strip=True)
|
||||
|
|
@ -77,48 +78,48 @@ def conditionalRemoveEntities(value):
|
|||
return removeEntities(value).strip()
|
||||
else:
|
||||
return value
|
||||
|
||||
def removeAllEntities(text):
|
||||
# Remove &lt; &lt; and &amp;
|
||||
return removeEntities(text).replace('<', '<').replace('>', '>').replace('&', '&')
|
||||
|
||||
def removeEntities(text, space_only=False):
|
||||
def removeAllEntities(text):
|
||||
# Remove &lt; &gt; and &amp; also
|
||||
return removeEntities(text, remove_all_entities=True)
|
||||
|
||||
def removeEntities(text, space_only=False, remove_all_entities=False):
|
||||
# keeps &amp;, &lt; and &gt; when remove_all_entities=False
|
||||
if text is None:
|
||||
return u""
|
||||
|
||||
|
||||
if not isinstance(text,basestring):
|
||||
return unicode(text)
|
||||
|
||||
text = unicode(text)
|
||||
|
||||
try:
|
||||
t = text
|
||||
except (UnicodeEncodeError,UnicodeDecodeError) as e:
|
||||
try:
|
||||
t = text.encode ('ascii', 'xmlcharrefreplace')
|
||||
t = text.encode ('ascii', 'xmlcharrefreplace')
|
||||
except (UnicodeEncodeError,UnicodeDecodeError) as e:
|
||||
t = text
|
||||
text = t
|
||||
text = t
|
||||
# replace numeric versions of [&<>] with named versions,
|
||||
# then replace named versions with actual characters,
|
||||
text = re.sub(r'�*38;','&',text)
|
||||
text = re.sub(r'�*60;','<',text)
|
||||
text = re.sub(r'�*62;','>',text)
|
||||
|
||||
|
||||
# replace remaining &#000; entities with unicode value, such as &#039; -> '
|
||||
text = _replaceNumberEntities(text)
|
||||
|
||||
# replace several named entities with character, such as &mdash; -> -
|
||||
# see constants.py for the list.
|
||||
# reverse sort will put entities with ; before the same one without, when valid.
|
||||
for e in reversed(sorted(entities.keys())):
|
||||
v = entities[e]
|
||||
if space_only and re.match(r"^[^\s]$", v, re.UNICODE | re.S):
|
||||
# if not space
|
||||
continue
|
||||
try:
|
||||
text = text.replace(e, v)
|
||||
except UnicodeDecodeError as ex:
|
||||
# for the pound symbol in constants.py
|
||||
text = text.replace(e, v.decode('utf-8'))
|
||||
# try:
|
||||
text = text.replace(e, v)
|
||||
# except UnicodeDecodeError as ex:
|
||||
# # for the pound symbol
|
||||
# text = text.replace(e, v.decode('utf-8'))
|
||||
|
||||
# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
|
||||
# entities terribly well and inserts (;) after something that
|
||||
|
|
@ -128,9 +129,14 @@ def removeEntities(text, space_only=False):
|
|||
# this point, there should be *no* real entities left, so find
|
||||
# these not-entities and removing them here should be safe.
|
||||
text = _replaceNotEntities(text)
|
||||
|
||||
# &lt; &lt; and &amp; are the only html entities allowed in xhtml, put those back.
|
||||
return text.replace('&', '&').replace('&lt', '<').replace('&gt', '>')
|
||||
|
||||
if remove_all_entities:
|
||||
text = text.replace('<', '<').replace('>', '>').replace('&', '&')
|
||||
else:
|
||||
# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
|
||||
# They come out as &lt because _replaceNotEntities removes the ';'.
|
||||
text = text.replace('&', '&').replace('&lt', '<').replace('&gt', '>')
|
||||
return text
|
||||
|
||||
## Currently used(optionally) by adapter_lightnovelgatecom and
|
||||
## adapter_wwwnovelallcom only. I hesitate to put the option in
|
||||
|
|
|
|||
|
|
@ -1021,7 +1021,6 @@ class Story(Configurable):
|
|||
def addChapter(self, chap, newchap=False):
|
||||
# logger.debug("addChapter(%s,%s)"%(chap,newchap))
|
||||
chapter = defaultdict(unicode,chap) # default unknown to empty string
|
||||
chapter['title'] = removeEntities(chapter['title'])
|
||||
chapter['html'] = removeEntities(chapter['html'])
|
||||
if self.getConfig('strip_chapter_numbers') and \
|
||||
self.getConfig('chapter_title_strip_pattern'):
|
||||
|
|
@ -1039,7 +1038,7 @@ class Story(Configurable):
|
|||
self.chapters.append(chapter)
|
||||
|
||||
def getChapters(self,fortoc=False):
|
||||
"Chapters will be dicts"
|
||||
"Chapters will be defaultdicts(unicode)"
|
||||
retval = []
|
||||
|
||||
## only add numbers if more than one chapter. Ditto (new) marks.
|
||||
|
|
|
|||
Loading…
Reference in a new issue