From 23f93bde24541446d0dfd90f207a0cebe6a6bb01 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Wed, 27 May 2020 09:54:39 -0500 Subject: [PATCH] Allow for stories without series in adapter_silmarillionwritersguildorg, clean up whitespace. --- .../adapter_silmarillionwritersguildorg.py | 142 +++++++++--------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py index 91101743..990757a8 100644 --- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py +++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py @@ -88,71 +88,71 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): # Now go hunting for all the meta data and the chapter list. ## Title and author - + # find story header a = soup.find('h6') titleLinks = a.find_all('a') authorLink= titleLinks[1] - + self.story.setMetadata('authorId',authorLink['href'].split('=')[1]) self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+authorLink['href']) self.story.setMetadata('author',authorLink.string) self.story.setMetadata('title',a.find('strong').find('a').get_text()) - + # Site does some weird stuff with pagination on series view and will only display 25 stories per page of results # Therefor to get accurate index for series, we fetch all sub-pages of series and parse for valid story urls and add to a list, # Then find first instance of current story url and use the number of loop itteration for index - - # This is pretty slow but ehh it works - + + # This is pretty slow but ehh it works + try: # Find Series name from series URL. a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+")) - seriesName = a.string - seriesUrl = 'https://'+self.host+'/archive/home/'+a['href'] - - self.story.setMetadata('seriesUrl',seriesUrl) - - #logger.debug("Series Url: "+seriesUrl) - - # Get Series page and convert to soup - seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl)) - # Find Series page sub-pages - seriesPageUrlList = [] - for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")): - # Don't include url from next button, is another http request and parse + could cause more bugs! - if i.string != '[Next]': - seriesPageUrlList.append(i) - - #get urls from all subpages and append to list - seriesStoryList = [] - for seriesPagePageUrl in seriesPageUrlList: - seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href'])) - seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) - - for seriesPagePageStoryUrl in seriesPagePageStoryList: - seriesStoryList.append(seriesPagePageStoryUrl) + if a: + seriesName = a.string + seriesUrl = 'https://'+self.host+'/archive/home/'+a['href'] + + self.story.setMetadata('seriesUrl',seriesUrl) + + #logger.debug("Series Url: "+seriesUrl) + + # Get Series page and convert to soup + seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl)) + # Find Series page sub-pages + seriesPageUrlList = [] + for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")): + # Don't include url from next button, is another http request and parse + could cause more bugs! + if i.string != '[Next]': + seriesPageUrlList.append(i) + + #get urls from all subpages and append to list + seriesStoryList = [] + for seriesPagePageUrl in seriesPageUrlList: + seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href'])) + seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$')) + + for seriesPagePageStoryUrl in seriesPagePageStoryList: + seriesStoryList.append(seriesPagePageStoryUrl) + + # Find series index for story + i=1 + for seriesStoriesUrl in seriesStoryList: + if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): + self.setSeries(seriesName, i) + #logger.debug("Series Name: "+ seriesName) + #logger.debug("Series Index: "+i) + break + i+=1 - # Find series index for story - i=1 - for seriesStoriesUrl in seriesStoryList: - if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')): - self.setSeries(seriesName, i) - #logger.debug("Series Name: "+ seriesName) - #logger.debug("Series Index: "+i) - break - i+=1 - except Exception as e: - raise e - # I find it hard to care if the series parsing fails - #pass + logger.warn("series parsing failed(%s)"%e) + pass # Find the chapters by regexing urls chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$")) - + if len(chapters)==1: self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href']) else: @@ -161,9 +161,9 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href']) # find the details section for the work, will hopefully make parsing metadata a bit easier - + workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'}) - + # some metadata can be retrieved through regexes so will do that to try and avoid a janky mess. #get characters @@ -171,30 +171,30 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): charList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=characters'+"&charid=\d+$")) for char in charList: self.story.addToList('characters',char.string) - + except Exception as e: logger.warn("character parsing failed(%s)"%e) - + #get warnings try: warnList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'+"&classid=\d+$")) for warn in warnList: self.story.addToList('warnings', warn.string) - + except Exception as e: logger.warn("warning parsing failed(%s)"%e) - + #get genres try: genresList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'+"&classid=\d+$")) for genre in genresList: self.story.addToList('genre', genre.string) - + except Exception as e: - logger.warn("genre parsing failed(%s)"%e) - + logger.warn("genre parsing failed(%s)"%e) + # no convenient way to extract remaining metadata so bodge it by finding relevant identifier string and using next element as the data source - + #get summary by finding identifier, then itterating until next identifier is found and using data between the two as the summary try: summaryStart = workDetails.find('strong',text='Summary: ') @@ -208,67 +208,67 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter): except Exception as e: logger.warn("summary parsing failed(%s) -- This can be caused by bad HTML in story description."%e) - + #get rating try: rating = workDetails.find('strong',text='Rated:').next_sibling.string self.story.setMetadata('rating', rating) except Exception as e: logger.warn("rating parsing failed(%s) -- This can be caused by bad HTML in story description."%e) - + #get completion status and correct for consistency with other adapters try: if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes": status="Completed" - + else: status="In-Progress" - + self.story.setMetadata('status', status) except Exception as e: logger.warn("status parsing failed(%s) -- This can be caused by bad HTML in story description."%e) - + #get wordcount try: wordCount = workDetails.find('strong',text='Word count:').next_sibling.string self.story.setMetadata('numWords', wordCount) except Exception as e: logger.warn("wordcount parsing failed(%s) -- This can be caused by bad HTML in story description."%e) - + #get published date, this works for some reason yet doesn't without the spaces in it try: datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat)) - + except Exception as e: logger.warn("datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e) - + #get updated date try: dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat)) - + except Exception as e: logger.warn("dateUpdated parsing failed(%s) -- This can be caused by bad HTML in story description."%e) # grab the text for an individual chapter. def getChapterText(self, url): - + logger.debug('Getting chapter text from: %s' % url) - + data = self._fetchUrl(url) soup = self.make_soup(data) - + # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections - + contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'}) - - contentParent.find('p').decompose() # remove page header + + contentParent.find('p').decompose() # remove page header contentParent.find_all('div',id='general')[2].decompose() #remove page footer contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc. - + contentParent.name='div' - + #error on failure if None == contentParent: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)