From ad59e2cf4579ec8b141360584b715b666844b571 Mon Sep 17 00:00:00 2001 From: Jim Miller Date: Sun, 27 Sep 2020 19:50:32 -0500 Subject: [PATCH] Adding eFiction series parsing attempt to base_adapter. --- .../adapters/adapter_archiveofourownorg.py | 2 +- fanficfare/adapters/adapter_tthfanficorg.py | 2 +- fanficfare/adapters/base_adapter.py | 48 +++++++++++++++++-- fanficfare/geturls.py | 1 + 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/fanficfare/adapters/adapter_archiveofourownorg.py b/fanficfare/adapters/adapter_archiveofourownorg.py index be645e24..46e59c87 100644 --- a/fanficfare/adapters/adapter_archiveofourownorg.py +++ b/fanficfare/adapters/adapter_archiveofourownorg.py @@ -592,7 +592,7 @@ class ArchiveOfOurOwnOrgAdapter(BaseSiteAdapter): self.performLogin(url,data) # get the list page with logged in session. - def get_series_from_page(self,url,data): + def get_series_from_page(self,url,data,normalize=False): ''' This method is to make it easier for adapters to detect a series URL, pick out the series metadata and list of storyUrls diff --git a/fanficfare/adapters/adapter_tthfanficorg.py b/fanficfare/adapters/adapter_tthfanficorg.py index 266e4589..c96a26fe 100644 --- a/fanficfare/adapters/adapter_tthfanficorg.py +++ b/fanficfare/adapters/adapter_tthfanficorg.py @@ -356,7 +356,7 @@ class TwistingTheHellmouthSiteAdapter(BaseSiteAdapter): if self.getConfig("is_adult"): self.setSiteMaxRating(url) - def get_series_from_page(self,url,data): + def get_series_from_page(self,url,data,normalize=False): ''' This method is to make it easier for adapters to detect a series URL, pick out the series metadata and list of storyUrls diff --git a/fanficfare/adapters/base_adapter.py b/fanficfare/adapters/base_adapter.py index ab8bf018..140c979a 100644 --- a/fanficfare/adapters/base_adapter.py +++ b/fanficfare/adapters/base_adapter.py @@ -411,8 +411,8 @@ class BaseSiteAdapter(Configurable): self.before_get_urls_from_page(url,normalize) # this way it uses User-Agent or other special settings. - data = self._fetchUrl(url,usecache=False) - series = self.get_series_from_page(url,data) + data = self._fetchUrl(url,usecache=True) + series = self.get_series_from_page(url,data,normalize) if series: # just to make it easier for adapters. if isinstance(series.get('desc',None),(BeautifulSoup,Tag)): @@ -424,16 +424,56 @@ class BaseSiteAdapter(Configurable): configuration=self.configuration, normalize=normalize)} - def get_series_from_page(self,url,data): + def get_series_from_page(self,url,data,normalize=False): + from ..geturls import get_urls_from_html ''' This method is to make it easier for adapters to detect a series URL, pick out the series metadata and list of storyUrls to return without needing to override get_urls_from_page entirely. ''' + # return {} + retval = {} ## return dict with at least {'urllist':['storyUrl','storyUrl',...]} ## 'name' and 'desc' are also used if given. - return {} + + ## for eFiction sites: + ## http://www.dracoandginny.com/viewseries.php?seriesid=45 + logger.debug("base get_series_from_page:%s"%url) + try: + if re.match(r".*viewseries\.php\?s(erie)?sid=\d+.*",url): # seriesid or ssid + logger.debug("Attempting eFiction get_series_from_page") + soup = self.make_soup(data) + retval = {} + nametag = soup.select_one('div#pagetitle') + if nametag: + nametag.find('a').decompose() + retval['name'] = stripHTML(nametag) + if retval['name'].endswith(' by'): + # remove trailing ' by' + retval['name'] = retval['name'][:-3] + summaryspan = soup.select_one("div#titleblock span.label") + if stripHTML(summaryspan) == "Summary:": + desc = "" + c = summaryspan.nextSibling + while not isinstance(c,Tag): + desc += unicode(c) + c = c.nextSibling + if desc: + retval['desc']=desc + + # trying to get story urls for series from different + # eFictions is a nightmare that the pre-existing + # get_urls_from_html() handles well enough. I don't + # think eFiction allows HTML in story desc anyway... + retval['urllist']=get_urls_from_html(soup, + url, + configuration=self.configuration, + normalize=normalize) + except Exception as e: + logger.debug("get_series_from_page for eFiction failed:%s"%e) + retval = {} + return retval # Just for series, in case we choose to change how it's stored or represented later. def setSeries(self,name,num): diff --git a/fanficfare/geturls.py b/fanficfare/geturls.py index 006cb81f..e9d32130 100644 --- a/fanficfare/geturls.py +++ b/fanficfare/geturls.py @@ -81,6 +81,7 @@ def get_urls_from_html(data,url=None,configuration=None,normalize=False,email=Fa urls[adapter.story.getMetadata('storyUrl')] = [href] else: urls[adapter.story.getMetadata('storyUrl')].append(href) + #logger.debug("adapter storyUrl:%s"%adapter.story.getMetadata('storyUrl')) except Exception as e: #logger.debug e pass