Adding eFiction series parsing attempt to base_adapter.

2026-05-08 21:11:59 +02:00 · 2020-09-27 19:50:32 -05:00 · 2020-09-27 19:50:32 -05:00 · ad59e2cf45
commit ad59e2cf45
parent d9101f315a
4 changed files with 47 additions and 6 deletions
--- a/fanficfare/adapters/adapter_archiveofourownorg.py
+++ b/fanficfare/adapters/adapter_archiveofourownorg.py
@ -592,7 +592,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter):
            self.performLogin(url,data)
            # get the list page with logged in session.

-    def get_series_from_page(self,url,data):
+    def get_series_from_page(self,url,data,normalize=False):
        '''
        This method is to make it easier for adapters to detect a
        series URL, pick out the series metadata and list of storyUrls
--- a/fanficfare/adapters/adapter_tthfanficorg.py
+++ b/fanficfare/adapters/adapter_tthfanficorg.py
@ -356,7 +356,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter):
        if self.getConfig("is_adult"):
            self.setSiteMaxRating(url)

-    def get_series_from_page(self,url,data):
+    def get_series_from_page(self,url,data,normalize=False):
        '''
        This method is to make it easier for adapters to detect a
        series URL, pick out the series metadata and list of storyUrls
--- a/fanficfare/adapters/base_adapter.py
+++ b/fanficfare/adapters/base_adapter.py
@ -411,8 +411,8 @@ class BaseSiteAdapter(Configurable):
        self.before_get_urls_from_page(url,normalize)

        # this way it uses User-Agent or other special settings.
-        data = self._fetchUrl(url,usecache=False)
-        series = self.get_series_from_page(url,data)
+        data = self._fetchUrl(url,usecache=True)
+        series = self.get_series_from_page(url,data,normalize)
        if series:
            # just to make it easier for adapters.
            if isinstance(series.get('desc',None),(BeautifulSoup,Tag)):
@ -424,16 +424,56 @@ class BaseSiteAdapter(Configurable):
                                                 configuration=self.configuration,
                                                 normalize=normalize)}

-    def get_series_from_page(self,url,data):
+    def get_series_from_page(self,url,data,normalize=False):
+        from ..geturls import get_urls_from_html
        '''
        This method is to make it easier for adapters to detect a
        series URL, pick out the series metadata and list of storyUrls
        to return without needing to override get_urls_from_page
        entirely.
        '''
+        # return {}
+        retval = {}
        ## return dict with at least {'urllist':['storyUrl','storyUrl',...]}
        ## 'name' and 'desc' are also used if given.
-        return {}
+
+        ## for eFiction sites:
+        ## http://www.dracoandginny.com/viewseries.php?seriesid=45
+        logger.debug("base get_series_from_page:%s"%url)
+        try:
+            if re.match(r".*viewseries\.php\?s(erie)?sid=\d+.*",url): # seriesid or ssid
+                logger.debug("Attempting eFiction get_series_from_page")
+                soup = self.make_soup(data)
+                retval = {}
+                nametag = soup.select_one('div#pagetitle')
+                if nametag:
+                    nametag.find('a').decompose()
+                    retval['name'] = stripHTML(nametag)
+                    if retval['name'].endswith(' by'):
+                        # remove trailing ' by'
+                        retval['name'] = retval['name'][:-3]
+                summaryspan = soup.select_one("div#titleblock span.label")
+                if stripHTML(summaryspan) == "Summary:":
+                    desc = ""
+                    c = summaryspan.nextSibling
+                    while not isinstance(c,Tag):
+                        desc += unicode(c)
+                        c = c.nextSibling
+                    if desc:
+                        retval['desc']=desc
+
+                # trying to get story urls for series from different
+                # eFictions is a nightmare that the pre-existing
+                # get_urls_from_html() handles well enough.  I don't
+                # think eFiction allows HTML in story desc anyway...
+                retval['urllist']=get_urls_from_html(soup,
+                                                    url,
+                                                    configuration=self.configuration,
+                                                    normalize=normalize)
+        except Exception as e:
+            logger.debug("get_series_from_page for eFiction failed:%s"%e)
+            retval = {}
+        return retval

    # Just for series, in case we choose to change how it's stored or represented later.
    def setSeries(self,name,num):
--- a/fanficfare/geturls.py
+++ b/fanficfare/geturls.py
@ -81,6 +81,7 @@ def get_urls_from_html(data,url=None,configuration=None,normalize=False,email=Fa
                    urls[adapter.story.getMetadata('storyUrl')] = [href]
                else:
                    urls[adapter.story.getMetadata('storyUrl')].append(href)
+                #logger.debug("adapter storyUrl:%s"%adapter.story.getMetadata('storyUrl'))
            except Exception as e:
                #logger.debug e
                pass