diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index c6bae9c4..5f9102ba 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -698,29 +698,49 @@ class BaseSiteAdapter(Requestable): # logger.info("Parsing for normalize_text_links failed...") try: - for t in soup.findAll(recursive=True): - for attr in self.get_attr_keys(t): - if attr not in acceptable_attributes: - del t[attr] ## strip all tag attributes except acceptable_attributes + # python doesn't have a do-while loop. + found_empty=True + do_resoup=False + while found_empty==True: + found_empty=False + if do_resoup: + # re-soup when empty tags removed before looking + # for more because multiple 'whitespace' strings + # show up differently and doing stripHTML() also + # catches
etc. + soup = BeautifulSoup(unicode(soup),'html5lib') + for t in soup.findAll(recursive=True): + for attr in self.get_attr_keys(t): + if attr not in acceptable_attributes: + del t[attr] ## strip all tag attributes except acceptable_attributes - # these are not acceptable strict XHTML. But we do already have - # CSS classes of the same names defined - if t and hasattr(t,'name') and t.name is not None: - if t.name in self.getConfigList('replace_tags_with_spans',['u']): - t['class']=t.name - t.name='span' - if t.name in ('center'): - t['class']=t.name - t.name='div' - # removes paired, but empty non paragraph tags. - if t.name not in self.getConfigList('keep_empty_tags',['p','td','th']) and t.string != None and len(t.string.strip()) == 0 : - t.decompose() + if t and hasattr(t,'name') and t.name is not None: + # remove script tags cross the board. + # epub readers (Moon+, FBReader & Aldiko at least) + # don't like