Allow for stories without series in adapter_silmarillionwritersguildorg, clean up whitespace.

Jim Miller 2020-05-27 09:54:39 -05:00
parent cbc7c4b64b
commit 23f93bde24
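
The crux of the fix: BeautifulSoup's find() returns None when nothing matches, so on a story that belongs to no series the old code fell over as soon as it touched the result; the new code wraps the series handling in an `if a:` guard and downgrades the except clause from re-raising to a logged warning. The pattern in isolation, as a minimal standalone sketch with hypothetical markup:

    import re
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h6>story header, no series link</h6>', 'html.parser')
    a = soup.find('a', href=re.compile(r'viewseries.php\?seriesid=\d+'))
    if a:  # find() returns None when the story is not in a series
        print('series:', a.string)
    else:
        print('story is not part of a series')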


@@ -88,71 +88,71 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
         # Now go hunting for all the meta data and the chapter list.
         ## Title and author
         # find story header
         a = soup.find('h6')
         titleLinks = a.find_all('a')
         authorLink= titleLinks[1]
         self.story.setMetadata('authorId',authorLink['href'].split('=')[1])
         self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+authorLink['href'])
         self.story.setMetadata('author',authorLink.string)
         self.story.setMetadata('title',a.find('strong').find('a').get_text())
         # Site does some weird stuff with pagination on the series view and will only display 25 stories per page of results.
         # Therefore, to get an accurate index for the series, we fetch all sub-pages of the series, parse them for valid story urls and add those to a list,
         # then find the first instance of the current story url and use the number of loop iterations as the index.
-        # This is pretty slow but ehh it works
+        # This is pretty slow but ehh it works
         try:
             # Find Series name from series URL.
             a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
-            seriesName = a.string
-            seriesUrl = 'https://'+self.host+'/archive/home/'+a['href']
-            self.story.setMetadata('seriesUrl',seriesUrl)
-            #logger.debug("Series Url: "+seriesUrl)
-            # Get Series page and convert to soup
-            seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl))
-            # Find Series page sub-pages
-            seriesPageUrlList = []
-            for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")):
-                # Don't include url from next button, is another http request and parse + could cause more bugs!
-                if i.string != '[Next]':
-                    seriesPageUrlList.append(i)
-            #get urls from all subpages and append to list
-            seriesStoryList = []
-            for seriesPagePageUrl in seriesPageUrlList:
-                seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
-                seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
-                for seriesPagePageStoryUrl in seriesPagePageStoryList:
-                    seriesStoryList.append(seriesPagePageStoryUrl)
+            if a:
+                seriesName = a.string
+                seriesUrl = 'https://'+self.host+'/archive/home/'+a['href']
+                self.story.setMetadata('seriesUrl',seriesUrl)
+                #logger.debug("Series Url: "+seriesUrl)
+                # Get Series page and convert to soup
+                seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl))
+                # Find Series page sub-pages
+                seriesPageUrlList = []
+                for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")):
+                    # Don't include url from next button, is another http request and parse + could cause more bugs!
+                    if i.string != '[Next]':
+                        seriesPageUrlList.append(i)
+                #get urls from all subpages and append to list
+                seriesStoryList = []
+                for seriesPagePageUrl in seriesPageUrlList:
+                    seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
+                    seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+                    for seriesPagePageStoryUrl in seriesPagePageStoryList:
+                        seriesStoryList.append(seriesPagePageStoryUrl)
-            # Find series index for story
-            i=1
-            for seriesStoriesUrl in seriesStoryList:
-                if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(seriesName, i)
-                    #logger.debug("Series Name: "+ seriesName)
-                    #logger.debug("Series Index: "+i)
-                    break
-                i+=1
+                # Find series index for story
+                i=1
+                for seriesStoriesUrl in seriesStoryList:
+                    if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(seriesName, i)
+                        #logger.debug("Series Name: "+ seriesName)
+                        #logger.debug("Series Index: "+i)
+                        break
+                    i+=1
         except Exception as e:
-            raise e
             # I find it hard to care if the series parsing fails
-            #pass
+            logger.warn("series parsing failed(%s)"%e)
+            pass
         # Find the chapters by regexing urls
         chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$"))
         if len(chapters)==1:
             self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href'])
         else:
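
The series-index workaround in the hunk above reduces to a simple idea: walk every sub-page of the series in order, collect each viewstory.php?sid=N link, and the 1-based position of the current story's link is its index in the series. A minimal standalone sketch of that idea, assuming the sub-pages are already fetched as HTML strings (the fetch step, sample markup, and helper name here are hypothetical stand-ins, not the adapter's code):

    import re
    from bs4 import BeautifulSoup

    def series_index(pages_html, story_id):
        # Count story links across all sub-pages, in page order;
        # return the 1-based position of the matching story, else None.
        story_href = re.compile(r'^viewstory.php\?sid=\d+$')
        index = 0
        for html in pages_html:
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a', href=story_href):
                index += 1
                if link['href'] == 'viewstory.php?sid=%s' % story_id:
                    return index
        return None

    # hypothetical example: two sub-pages, three stories in the series
    pages = ['<a href="viewstory.php?sid=10">A</a><a href="viewstory.php?sid=11">B</a>',
             '<a href="viewstory.php?sid=12">C</a>']
    print(series_index(pages, '12'))  # -> 3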
@@ -161,9 +161,9 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
                 self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href'])
         # find the details section for the work, will hopefully make parsing metadata a bit easier
         workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'})
         # some metadata can be retrieved through regexes so will do that to try and avoid a janky mess.
         #get characters
@@ -171,30 +171,30 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
             charList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=characters'+"&charid=\d+$"))
             for char in charList:
                 self.story.addToList('characters',char.string)
         except Exception as e:
             logger.warn("character parsing failed(%s)"%e)
         #get warnings
         try:
             warnList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'+"&classid=\d+$"))
             for warn in warnList:
                 self.story.addToList('warnings', warn.string)
         except Exception as e:
             logger.warn("warning parsing failed(%s)"%e)
         #get genres
         try:
             genresList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'+"&classid=\d+$"))
             for genre in genresList:
                 self.story.addToList('genre', genre.string)
         except Exception as e:
-            logger.warn("genre parsing failed(%s)"%e)
+            logger.warn("genre parsing failed(%s)"%e)
         # no convenient way to extract the remaining metadata, so bodge it by finding the relevant identifier string and using the next element as the data source
         #get summary by finding its identifier, then iterating until the next identifier is found and using the data between the two as the summary
         try:
             summaryStart = workDetails.find('strong',text='Summary: ')
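
Both metadata techniques in this adapter are easy to see in miniature: list-valued fields (characters, warnings, genres) are anchors matched by the query string in their href, while scalar fields (rating, word count, dates) are found by locating a strong-tag label and reading the text node right after it. A standalone sketch, using made-up markup in place of the site's real details block:

    import re
    from bs4 import BeautifulSoup

    # hypothetical stand-in for the story-details block
    html = ('<div><a href="browse.php?type=characters&charid=7">Maglor</a>'
            '<a href="browse.php?type=class&type_id=1&classid=3">Drama</a>'
            '<strong>Rated:</strong> Teens <strong>Word count:</strong> 4200</div>')
    details = BeautifulSoup(html, 'html.parser')

    # list-valued fields: match anchors by their browse.php query string
    chars = [a.string for a in details.find_all('a', href=re.compile(r'browse.php\?type=characters&charid=\d+$'))]
    genres = [a.string for a in details.find_all('a', href=re.compile(r'browse.php\?type=class&type_id=1&classid=\d+$'))]

    # scalar fields: find the label, then read the text node that follows it
    rating = str(details.find('strong', text='Rated:').next_sibling).strip()
    words = str(details.find('strong', text='Word count:').next_sibling).strip()
    print(chars, genres, rating, words)  # ['Maglor'] ['Drama'] Teens 4200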
@@ -208,67 +208,67 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
         except Exception as e:
             logger.warn("summary parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get rating
         try:
             rating = workDetails.find('strong',text='Rated:').next_sibling.string
             self.story.setMetadata('rating', rating)
         except Exception as e:
             logger.warn("rating parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get completion status and correct for consistency with other adapters
         try:
             if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes":
                 status="Completed"
             else:
                 status="In-Progress"
             self.story.setMetadata('status', status)
         except Exception as e:
             logger.warn("status parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get wordcount
         try:
             wordCount = workDetails.find('strong',text='Word count:').next_sibling.string
             self.story.setMetadata('numWords', wordCount)
         except Exception as e:
             logger.warn("wordcount parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get published date; for some reason this only matches with the surrounding spaces kept in the search text
         try:
             datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string
             self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat))
         except Exception as e:
             logger.warn("datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get updated date
         try:
             dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string
             self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat))
         except Exception as e:
             logger.warn("dateUpdated parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
     # grab the text for an individual chapter.
     def getChapterText(self, url):
         logger.debug('Getting chapter text from: %s' % url)
         data = self._fetchUrl(url)
         soup = self.make_soup(data)
         # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections
         contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'})
-        contentParent.find('p').decompose() # remove page header
+        contentParent.find('p').decompose() # remove page header
         contentParent.find_all('div',id='general')[2].decompose() #remove page footer
         contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc.
         contentParent.name='div'
         #error on failure
         if None == contentParent:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
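
getChapterText's approach of fetching the whole page and then decompose()-ing the header, footer, and navigation nodes is straightforward to demonstrate. A minimal standalone sketch, with made-up markup standing in for the site's real layout; note that, unlike the code above (where the None check can only run after contentParent has already been dereferenced), this version guards before touching the element:

    from bs4 import BeautifulSoup

    html = """<div id="maincontent"><div id="general">
    <p>Story header</p>
    <div id="general">chapter select</div>
    <div id="general">Once upon a time, the actual story text.</div>
    <div id="general">page footer</div>
    </div></div>"""

    soup = BeautifulSoup(html, 'html.parser')
    main = soup.find('div', {'id': 'maincontent'})
    content = main.find('div', {'id': 'general'}) if main else None
    if content is None:  # guard first, then strip
        raise ValueError('missing required element')
    content.find('p').decompose()                          # remove page header
    content.find_all('div', id='general')[2].decompose()  # remove page footer
    content.find_all('div', id='general')[0].decompose()  # remove chapter select etc.
    print(content.get_text().strip())  # -> Once upon a time, the actual story text.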