From a14c97d335fc660f86c8f421ee02ff32c5147cfe Mon Sep 17 00:00:00 2001
From: Jim Miller
Date: Wed, 27 May 2020 10:37:55 -0500
Subject: [PATCH] Tweak series parsing to save fetches in
 adapter_silmarillionwritersguildorg

---
 .../adapter_silmarillionwritersguildorg.py | 54 ++++++++++++-------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py
index 990757a8..eb778fec 100644
--- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py
+++ b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py
@@ -68,6 +68,13 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
     def getSiteURLPattern(self):
         return r"https?://"+re.escape(self.getSiteDomain()+"/archive/home/viewstory.php?sid=")+r"\d+$"
 
+    def use_pagecache(self):
+        '''
+        adapters that will work with the page cache need to implement
+        this and change it to True.
+        '''
+        return True
+
     ## Getting the chapter list and the meta data
     def extractChapterUrlsAndMetadata(self):
 
@@ -119,33 +126,40 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
             #logger.debug("Series Url: "+seriesUrl)
 
             # Get Series page and convert to soup
-            seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl))
+            seriesPageSoup = self.make_soup(self._fetchUrl(seriesUrl+"&offset=0"))
+            ## &offset=0 is the same as the first page, by adding
+            ## that, the page cache will save us from fetching it
+            ## twice in the loop below.
+
             # Find Series page sub-pages
             seriesPageUrlList = []
+            seriesStoryList = []
             for i in seriesPageSoup.findAll('a', href=re.compile("viewseries.php\?seriesid=\d+&offset=\d+$")):
-                # Don't include url from next button, is another http request and parse + could cause more bugs!
-                if i.string != '[Next]':
-                    seriesPageUrlList.append(i)
+                # Don't include url from next button, is another http request and parse + could cause more bugs!
+                if i.string != '[Next]':
+                    seriesPageUrlList.append(i)
 
             #get urls from all subpages and append to list
-            seriesStoryList = []
+            i=1
             for seriesPagePageUrl in seriesPageUrlList:
                 seriesPagePageSoup = self.make_soup(self._fetchUrl('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
-                seriesPagePageStoryList = seriesPagePageSoup.findAll('a', href=re.compile(r'^viewstory.php\?sid=\d+$'))
-
-                for seriesPagePageStoryUrl in seriesPagePageStoryList:
-                    seriesStoryList.append(seriesPagePageStoryUrl)
-
-            # Find series index for story
-            i=1
-            for seriesStoriesUrl in seriesStoryList:
-                if seriesStoriesUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                    self.setSeries(seriesName, i)
-                    #logger.debug("Series Name: "+ seriesName)
-                    #logger.debug("Series Index: "+i)
-                    break
-                i+=1
+                storyHeaders = seriesPagePageSoup.findAll('h5')
+                ## can't just search for story URLs, some story
+                ## descs also contain story URLs.  Looks like only
+                ## story titles are <h5>.
+                for storyHeader in storyHeaders:
+                    seriesPagePageStoryUrl = storyHeader.find('a',href=re.compile(r'^viewstory.php\?sid=\d+$'))
+                    if seriesPagePageStoryUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
+                        #logger.debug("Series Name: "+ seriesName)
+                        #logger.debug("Series Index: "+i)
+                        self.setSeries(seriesName, i)
+                        raise StopIteration("Break out of series parsing loops")
+                    i+=1
+        except StopIteration:
+            # break out of both loops, don't need to fetch further
+            # pages after story found.
+            pass
         except Exception as e:
             logger.warn("series parsing failed(%s)"%e)
             pass
 
@@ -157,7 +171,7 @@ class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
             self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href'])
         else:
             for chapter in chapters:
-                logger.debug("Added Chapter: "+chapter.string)
+                # logger.debug("Added Chapter: "+chapter.string)
                 self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href'])
 
     # find the details section for the work, will hopefully make parsing metadata a bit easier
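
Note on the control flow in the second hunk: the rewritten series search exits both the sub-page loop and the per-story loop as soon as the story is found, by raising StopIteration and catching it in a dedicated handler placed ahead of the generic except Exception. Because the exception is raised before later sub-pages are fetched, no further HTTP requests are made once the series index is known. A minimal standalone sketch of the same pattern, using hypothetical stand-ins (pages, find_index, target) rather than the adapter's real names:

    # Early exit from nested loops via an exception, as in the patch above.
    def find_index(pages, target):
        index = 1
        try:
            for page in pages:          # outer loop: one (cached) fetch per sub-page
                for story in page:      # inner loop: stories on that sub-page
                    if story == target:
                        # Raising here unwinds both loops at once; no flag
                        # variable or paired `break` statements needed.
                        raise StopIteration("found")
                    index += 1
        except StopIteration:
            return index                # found: stop before touching later pages
        return None                     # target not present on any page

    print(find_index([['a', 'b'], ['c', 'd']], 'c'))  # -> 3

Handler order matters here: except StopIteration must come before except Exception, or the generic handler would swallow the sentinel and log it as a parsing failure.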