mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-23 09:14:18 +01:00
Fix efpfanfic.net when the author includes story URLs in the description; add heuristics for poor HTML.
This commit is contained in:
parent
875c894f91
commit
91dba79bff
1 changed files with 21 additions and 6 deletions
|
|
@ -171,10 +171,11 @@ class EFPFanFicNet(BaseSiteAdapter):
|
|||
|
||||
storya = None
|
||||
authsoup = None
|
||||
storyblock = None
|
||||
authurl = self.story.getMetadata('authorUrl')
|
||||
|
||||
## author can have more than one page of stories.
|
||||
while storya == None:
|
||||
while storyblock == None:
|
||||
|
||||
# no storya, but do have authsoup--we're looping on author pages.
|
||||
if authsoup != None:
|
||||
|
|
@ -186,10 +187,14 @@ class EFPFanFicNet(BaseSiteAdapter):
|
|||
logger.debug("fetching author page: (%s)"%authurl)
|
||||
authsoup = bs.BeautifulSoup(self._fetchUrl(authurl))
|
||||
#print("authsoup:%s"%authsoup)
|
||||
|
||||
storya = authsoup.find('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$'))
|
||||
|
||||
storyblock = storya.parent.parent.parent
|
||||
|
||||
storyas = authsoup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r'&i=1$'))
|
||||
for storya in storyas:
|
||||
#print("======storya:%s"%storya)
|
||||
storyblock = storya.findParent('div',{'class':'storybloc'})
|
||||
#print("======storyblock:%s"%storyblock)
|
||||
if storyblock != None:
|
||||
continue
|
||||
|
||||
self.setDescription(url,storyblock.find('div', {'class':'introbloc'}))
|
||||
|
||||
|
|
@ -293,4 +298,14 @@ class EFPFanFicNet(BaseSiteAdapter):
|
|||
if None == div:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
|
||||
return self.utf8FromSoup(url,div)
|
||||
# remove any header and 'o:p' tags.
|
||||
for tag in div.findAll("head") + div.findAll("o:p"):
|
||||
tag.extract()
|
||||
|
||||
# change any html and body tags to div.
|
||||
for tag in div.findAll("html") + div.findAll("body"):
|
||||
tag.name='div'
|
||||
|
||||
# remove extra bogus doctype.
|
||||
#<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
|
||||
return re.sub(r"<!DOCTYPE[^>]+>","",self.utf8FromSoup(url,div))
|
||||
|
|
|
|||
Loading…
Reference in a new issue