Allow for stories without series in adapter_silmarillionwritersguildorg, clean up whitespace.

Jim Miller 2020-05-27 09:54:39 -05:00
parent cbc7c4b64b
commit 23f93bde24
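
The crux of the fix: BeautifulSoup's find() returns None when nothing matches, so on a story that belongs to no series the old code fell over as soon as it touched the result; the new code wraps the series handling in an `if a:` guard and downgrades the except clause from re-raising to a logged warning. The pattern in isolation, as a minimal standalone sketch with hypothetical markup:

    import re
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<h6>story header, no series link</h6>', 'html.parser')
    a = soup.find('a', href=re.compile(r'viewseries.php\?seriesid=\d+'))
    if a:  # find() returns None when the story is not in a series
        print('series:', a.string)
    else:
        print('story is not part of a series')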


@@ -88,71 +88,71 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
         # Now go hunting for all the meta data and the chapter list.
         ## Title and author
         # find story header
         a = soup.find('h6')
         titleLinks = a.find_all('a')
         authorLink= titleLinks[1]
         self.story.setMetadata('authorId',authorLink['href'].split('=')[1])
         self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+authorLink['href'])
         self.story.setMetadata('author',authorLink.string)
         self.story.setMetadata('title',a.find('strong').find('a').get_text())
         # Site does some weird stuff with pagination on the series view and will only display 25 stories per page of results.
         # Therefore, to get an accurate index for the series, we fetch all sub-pages of the series, parse them for valid story urls and add those to a list,
         # then find the first instance of the current story url and use the number of loop iterations as the index.
-        # This is pretty slow but ehh it works
+        # This is pretty slow but ehh it works
         try:
             # Find Series name from series URL.
             a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
-            seriesName = a.string
-            seriesUrl = 'https://'+self.host+'/archive/home/'+a['href']
-            self.story.setMetadata('seriesUrl',seriesUrl)
-            #logger.debug("Series Url: "+seriesUrl)
-            # Get Series page and convert to soup
-            seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl))
-            # Find Series page sub-pages
-            seriesPageUrlList = []
-            for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")):
-                # Don't include url from next button, is another http request and parse + could cause more bugs!
-                if i.string != '[Next]':
-                    seriesPageUrlList.append(i)
-            #get urls from all subpages and append to list
-            seriesStoryList = []
-            for seriesPagePageUrl in seriesPageUrlList:
-                seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
-                seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
-                for seriesPagePageStoryUrl in seriesPagePageStoryList:
-                    seriesStoryList.append(seriesPagePageStoryUrl)
+            if a:
+                seriesName = a.string
+                seriesUrl = 'https://'+self.host+'/archive/home/'+a['href']
+                self.story.setMetadata('seriesUrl',seriesUrl)
+                #logger.debug("Series Url: "+seriesUrl)
+                # Get Series page and convert to soup
+                seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl))
+                # Find Series page sub-pages
+                seriesPageUrlList = []
+                for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")):
+                    # Don't include url from next button, is another http request and parse + could cause more bugs!
+                    if i.string != '[Next]':
+                        seriesPageUrlList.append(i)
+                #get urls from all subpages and append to list
+                seriesStoryList = []
+                for seriesPagePageUrl in seriesPageUrlList:
+                    seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
+                    seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
+                    for seriesPagePageStoryUrl in seriesPagePageStoryList:
+                        seriesStoryList.append(seriesPagePageStoryUrl)
-            # Find series index for story
-            i=1
-            for seriesStoriesUrl in seriesStoryList:
-                if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(seriesName, i)
-                    #logger.debug("Series Name: "+ seriesName)
-                    #logger.debug("Series Index: "+i)
-                    break
-                i+=1
+                # Find series index for story
+                i=1
+                for seriesStoriesUrl in seriesStoryList:
+                    if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        self.setSeries(seriesName, i)
+                        #logger.debug("Series Name: "+ seriesName)
+                        #logger.debug("Series Index: "+i)
+                        break
+                    i+=1
         except Exception as e:
-            raise e
             # I find it hard to care if the series parsing fails
-            #pass
+            logger.warn("series parsing failed(%s)"%e)
+            pass
         # Find the chapters by regexing urls
         chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+"&chapter=\d+$"))
         if len(chapters)==1:
             self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href'])
         else:
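
The series-index workaround in the hunk above reduces to a simple idea: walk every sub-page of the series in order, collect each viewstory.php?sid=N link, and the 1-based position of the current story's link is its index in the series. A minimal standalone sketch of that idea, assuming the sub-pages are already fetched as HTML strings (the fetch step, sample markup, and helper name here are hypothetical stand-ins, not the adapter's code):

    import re
    from bs4 import BeautifulSoup

    def series_index(pages_html, story_id):
        # Count story links across all sub-pages, in page order;
        # return the 1-based position of the matching story, else None.
        story_href = re.compile(r'^viewstory.php\?sid=\d+$')
        index = 0
        for html in pages_html:
            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all('a', href=story_href):
                index += 1
                if link['href'] == 'viewstory.php?sid=%s' % story_id:
                    return index
        return None

    # hypothetical example: two sub-pages, three stories in the series
    pages = ['<a href="viewstory.php?sid=10">A</a><a href="viewstory.php?sid=11">B</a>',
             '<a href="viewstory.php?sid=12">C</a>']
    print(series_index(pages, '12'))  # -> 3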
@@ -161,9 +161,9 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
                 self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href'])
         # find the details section for the work, will hopefully make parsing metadata a bit easier
         workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'})
         # some metadata can be retrieved through regexes so will do that to try and avoid a janky mess.
         #get characters
@@ -171,30 +171,30 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
             charList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=characters'+"&charid=\d+$"))
             for char in charList:
                 self.story.addToList('characters',char.string)
         except Exception as e:
             logger.warn("character parsing failed(%s)"%e)
         #get warnings
         try:
             warnList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'+"&classid=\d+$"))
             for warn in warnList:
                 self.story.addToList('warnings', warn.string)
         except Exception as e:
             logger.warn("warning parsing failed(%s)"%e)
         #get genres
         try:
             genresList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'+"&classid=\d+$"))
             for genre in genresList:
                 self.story.addToList('genre', genre.string)
         except Exception as e:
-            logger.warn("genre parsing failed(%s)"%e)
+            logger.warn("genre parsing failed(%s)"%e)
         # no convenient way to extract the remaining metadata, so bodge it by finding the relevant identifier string and using the next element as the data source
         #get summary by finding its identifier, then iterating until the next identifier is found and using the data between the two as the summary
         try:
             summaryStart = workDetails.find('strong',text='Summary: ')
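
Both metadata techniques in this adapter are easy to see in miniature: list-valued fields (characters, warnings, genres) are anchors matched by the query string in their href, while scalar fields (rating, word count, dates) are found by locating a strong-tag label and reading the text node right after it. A standalone sketch, using made-up markup in place of the site's real details block:

    import re
    from bs4 import BeautifulSoup

    # hypothetical stand-in for the story-details block
    html = ('<div><a href="browse.php?type=characters&charid=7">Maglor</a>'
            '<a href="browse.php?type=class&type_id=1&classid=3">Drama</a>'
            '<strong>Rated:</strong> Teens <strong>Word count:</strong> 4200</div>')
    details = BeautifulSoup(html, 'html.parser')

    # list-valued fields: match anchors by their browse.php query string
    chars = [a.string for a in details.find_all('a', href=re.compile(r'browse.php\?type=characters&charid=\d+$'))]
    genres = [a.string for a in details.find_all('a', href=re.compile(r'browse.php\?type=class&type_id=1&classid=\d+$'))]

    # scalar fields: find the label, then read the text node that follows it
    rating = str(details.find('strong', text='Rated:').next_sibling).strip()
    words = str(details.find('strong', text='Word count:').next_sibling).strip()
    print(chars, genres, rating, words)  # ['Maglor'] ['Drama'] Teens 4200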
@@ -208,67 +208,67 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
         except Exception as e:
             logger.warn("summary parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get rating
         try:
             rating = workDetails.find('strong',text='Rated:').next_sibling.string
             self.story.setMetadata('rating', rating)
         except Exception as e:
             logger.warn("rating parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get completion status and correct for consistency with other adapters
         try:
             if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes":
                 status="Completed"
             else:
                 status="In-Progress"
             self.story.setMetadata('status', status)
         except Exception as e:
             logger.warn("status parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get wordcount
         try:
             wordCount = workDetails.find('strong',text='Word count:').next_sibling.string
             self.story.setMetadata('numWords', wordCount)
         except Exception as e:
             logger.warn("wordcount parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get published date; for some reason this only matches with the surrounding spaces kept in the search text
         try:
             datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string
             self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat))
         except Exception as e:
             logger.warn("datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
         #get updated date
         try:
             dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string
             self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat))
         except Exception as e:
             logger.warn("dateUpdated parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
     # grab the text for an individual chapter.
     def getChapterText(self, url):
         logger.debug('Getting chapter text from: %s' % url)
         data = self._fetchUrl(url)
         soup = self.make_soup(data)
         # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections
         contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'})
-        contentParent.find('p').decompose() # remove page header
+        contentParent.find('p').decompose() # remove page header
         contentParent.find_all('div',id='general')[2].decompose() #remove page footer
         contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc.
         contentParent.name='div'
         #error on failure
         if None == contentParent:
             raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
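
getChapterText's approach of fetching the whole page and then decompose()-ing the header, footer, and navigation nodes is straightforward to demonstrate. A minimal standalone sketch, with made-up markup standing in for the site's real layout; note that, unlike the code above (where the None check can only run after contentParent has already been dereferenced), this version guards before touching the element:

    from bs4 import BeautifulSoup

    html = """<div id="maincontent"><div id="general">
    <p>Story header</p>
    <div id="general">chapter select</div>
    <div id="general">Once upon a time, the actual story text.</div>
    <div id="general">page footer</div>
    </div></div>"""

    soup = BeautifulSoup(html, 'html.parser')
    main = soup.find('div', {'id': 'maincontent'})
    content = main.find('div', {'id': 'general'}) if main else None
    if content is None:  # guard first, then strip
        raise ValueError('missing required element')
    content.find('p').decompose()                          # remove page header
    content.find_all('div', id='general')[2].decompose()  # remove page footer
    content.find_all('div', id='general')[0].decompose()  # remove chapter select etc.
    print(content.get_text().strip())  # -> Once upon a time, the actual story text.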