diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index fb2fe98b..33070893 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -3525,9 +3525,6 @@ add_to_titlepage_entries:,views, averageWords, fandoms
 ## parameter to your personal.ini and list the ones you don't want.
 #exclude_notes:authornotes,newsboxes,spoilers,footnotes
 
-[www.silmarillionwritersguild.org]
-use_basic_cache:true
-
 [www.siye.co.uk]
 use_basic_cache:true
 ## Site dedicated to these categories/characters/ships
diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py
index a126124c..14d91521 100644
--- a/fanficfare/adapters/__init__.py
+++ b/fanficfare/adapters/__init__.py
@@ -130,7 +130,6 @@ from . import adapter_swiorgru
 from . import adapter_fanficsme
 from . import adapter_fanfictalkcom
 from . import adapter_scifistoriescom
-from . import adapter_silmarillionwritersguildorg
 from . import adapter_chireadscom
 from . import adapter_scribblehubcom
 from . import adapter_fictionlive
diff --git a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py b/fanficfare/adapters/adapter_silmarillionwritersguildorg.py
deleted file mode 100644
index b00e5bbe..00000000
--- a/fanficfare/adapters/adapter_silmarillionwritersguildorg.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2020 FanFicFare team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import absolute_import
-import logging
-logger = logging.getLogger(__name__)
-import re
-from bs4.element import Tag
-from .. import exceptions as exceptions
-
-# py2 vs py3 transition
-from ..six import text_type as unicode
-
-from .base_adapter import BaseSiteAdapter, makeDate
-
-def getClass():
-    return SilmarillionWritersGuildOrgAdapter
-
-# Class name has to be unique.  Our convention is camel case the
-# sitename with Adapter at the end.  www is skipped.
-class SilmarillionWritersGuildOrgAdapter(BaseSiteAdapter):
-
-    def __init__(self, config, url):
-        BaseSiteAdapter.__init__(self, config, url)
-
-        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
-        self.password = ""
-        self.is_adult=False
-
-        # get storyId from url--url validation guarantees query is only sid=1234
-        self.story.setMetadata('storyId',self.parsedUrl.query.split('=',)[1])
-
-        # normalized story URL.
-        self._setURL('http://' + self.getSiteDomain() + '/archive/home/viewstory.php?sid='+self.story.getMetadata('storyId'))
-
-        # Each adapter needs to have a unique site abbreviation.
-        self.story.setMetadata('siteabbrev','swg')
-
-        # The date format will vary from site to site.
-        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
-        self.dateformat = "%B %d, %Y"
-
-    @staticmethod # must be @staticmethod, don't remove it.
-    def getSiteDomain():
-        # The site domain.  Does have www here, if it uses it.
-        return 'www.silmarillionwritersguild.org'
-
-    @classmethod
-    def getSiteExampleURLs(cls):
-        return "https://"+cls.getSiteDomain()+"/archive/home/viewstory.php?sid=123"
-
-    def getSiteURLPattern(self):
-        return r"https?://"+re.escape(self.getSiteDomain()+"/archive/home/viewstory.php?sid=")+r"\d+$"
-
-    ## Getting the chapter list and the meta data
-    def extractChapterUrlsAndMetadata(self):
-
-        url = self.url
-        logger.debug("URL: "+url)
-
-        data = self.get_request(url)
-
-        soup = self.make_soup(data)
-
-
-        ## Title and author
-
-        # find story header
-        a = soup.find('h6')
-
-        titleLinks = a.find_all('a')
-        authorLink= titleLinks[1]
-
-        self.story.setMetadata('authorId',authorLink['href'].split('=')[1])
-        self.story.setMetadata('authorUrl','https://'+self.host+'/archive/home/'+authorLink['href'])
-        self.story.setMetadata('author',authorLink.string)
-
-        self.story.setMetadata('title',a.find('strong').find('a').get_text())
-
-        # Site does some weird stuff with pagination on series view and will only display 25 stories per page of results
-        # Therefor to get accurate index for series, we fetch all sub-pages of series and parse for valid story urls and add to a list,
-        # Then find first instance of current story url and use the number of loop itteration for index
-
-        # This is pretty slow but ehh it works
-
-        try:
-            # Find Series name from series URL.
-            a = soup.find('a', href=re.compile(r"viewseries.php\?seriesid=\d+"))
-            if a:
-                seriesName = a.string
-                seriesUrl = 'https://'+self.host+'/archive/home/'+a['href']
-
-                self.story.setMetadata('seriesUrl',seriesUrl)
-
-                #logger.debug("Series Url: "+seriesUrl)
-
-                # Get Series page and convert to soup
-                seriesPageSoup = self.make_soup(self.get_request(seriesUrl+"&offset=0"))
-                ## &offset=0 is the same as the first page, by adding
-                ## that, the page cache will save us from fetching it
-                ## twice in the loop below.
-
-                # Find Series page sub-pages
-                seriesPageUrlList = []
-                seriesStoryList = []
-                for i in seriesPageSoup.findAll('a', href=re.compile(r"viewseries.php\?seriesid=\d+&offset=\d+$")):
-                    # Don't include url from next button, is another http request and parse + could cause more bugs!
-                    if i.string != '[Next]':
-                        seriesPageUrlList.append(i)
-
-                #get urls from all subpages and append to list
-                i=1
-                for seriesPagePageUrl in seriesPageUrlList:
-                    seriesPagePageSoup = self.make_soup(self.get_request('https://'+self.host+'/archive/home/'+seriesPagePageUrl['href']))
-                    storyHeaders = seriesPagePageSoup.findAll('h5')
-                    ## can't just search for story URLs, some story
-                    ## descs also contain story URLs.  Looks like only
-                    ## story titles are <h5>.
-                    for storyHeader in storyHeaders:
-                        seriesPagePageStoryUrl = storyHeader.find('a',href=re.compile(r'^viewstory.php\?sid=\d+$'))
-                        if seriesPagePageStoryUrl['href'] == ('viewstory.php?sid='+self.story.getMetadata('storyId')):
-                            #logger.debug("Series Name: "+ seriesName)
-                            #logger.debug("Series Index: "+i)
-                            self.setSeries(seriesName, i)
-                            raise StopIteration("Break out of series parsing loops")
-                        i+=1
-
-        except StopIteration:
-            # break out of both loops, don't need to fetch further
-            # pages after story found.
-            pass
-        except Exception as e:
-            logger.warning("series parsing failed(%s)"%e)
-
-        # Find the chapters by regexing urls
-        chapters=soup.findAll('a', href=re.compile(r'viewstory.php\?sid='+self.story.getMetadata('storyId')+r"&chapter=\d+$"))
-
-        if len(chapters)==1:
-            self.add_chapter(self.story.getMetadata('title'),'https://'+self.host+'/archive/home/'+chapters[0]['href'])
-        else:
-            for chapter in chapters:
-                # logger.debug("Added Chapter: "+chapter.string)
-                self.add_chapter(chapter,'https://'+self.host+'/archive/home/'+chapter['href'])
-
-        # find the details section for the work, will hopefully make parsing metadata a bit easier
-
-        workDetails = soup.find('div', {'id' : 'general'}).find('div', {'id' : 'general'})
-
-        # some metadata can be retrieved through regexes so will do that to try and avoid a janky mess.
-
-        #get characters
-        try:
-            charList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=characters'+r"&charid=\d+$"))
-            for char in charList:
-                self.story.addToList('characters',char.string)
-
-        except Exception as e:
-            logger.warning("character parsing failed(%s)"%e)
-
-        #get warnings
-        try:
-            warnList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=2'+r"&classid=\d+$"))
-            for warn in warnList:
-                self.story.addToList('warnings', warn.string)
-
-        except Exception as e:
-            logger.warning("warning parsing failed(%s)"%e)
-
-        #get genres
-        try:
-            genresList = workDetails.findAll('a', href=re.compile(r'browse.php\?type=class&type_id=1'+r"&classid=\d+$"))
-            for genre in genresList:
-                self.story.addToList('genre', genre.string)
-
-        except Exception as e:
-            logger.warning("genre parsing failed(%s)"%e)
-
-        # no convenient way to extract remaining metadata so bodge it by finding relevant identifier string and using next element as the data source
-
-        #get summary by finding identifier, then itterating until next identifier is found and using data between the two as the summary
-        try:
-            summaryStart = workDetails.find('strong',text='Summary: ')
-            currentElement = summaryStart.parent.next_sibling
-            summaryValue = ""
-            while not isinstance(currentElement,Tag) or currentElement.name != 'strong':
-                summaryValue += unicode(currentElement)
-                currentElement = currentElement.next_sibling
-            #logger.debug(summaryValue)
-            self.setDescription(url,summaryValue)
-        except Exception as e:
-            logger.warning("summary parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-
-        #get rating
-        try:
-            rating = workDetails.find('strong',text='Rated:').next_sibling.string
-            self.story.setMetadata('rating', rating)
-        except Exception as e:
-            logger.warning("rating parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-        #get completion status and correct for consistency with other adapters
-        try:
-            if (workDetails.find('strong',text='Completed:').next_sibling.string).lower() == "yes":
-                status="Completed"
-
-            else:
-                status="In-Progress"
-
-            self.story.setMetadata('status', status)
-        except Exception as e:
-            logger.warning("status parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-        #get wordcount
-        try:
-            wordCount = workDetails.find('strong',text='Word count:').next_sibling.string
-            self.story.setMetadata('numWords', wordCount)
-        except Exception as e:
-            logger.warning("wordcount parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-        #get published date, this works for some reason yet doesn't without the spaces in it
-        try:
-            datePublished = workDetails.find('strong',text=' Published: ').next_sibling.string
-            self.story.setMetadata('datePublished', makeDate(datePublished, self.dateformat))
-
-        except Exception as e:
-            logger.warning("datePublished parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-        #get updated date
-        try:
-            dateUpdated = workDetails.find('strong',text='Updated:').next_sibling.string
-            self.story.setMetadata('dateUpdated', makeDate(dateUpdated, self.dateformat))
-
-        except Exception as e:
-            logger.warning("dateUpdated parsing failed(%s) -- This can be caused by bad HTML in story description."%e)
-
-    # grab the text for an individual chapter.
-    def getChapterText(self, url):
-
-        logger.debug('Getting chapter text from: %s' % url)
-
-        data = self.get_request(url)
-        soup = self.make_soup(data)
-
-        # No convenient way to get story without the rest of the page, so get whole page and strip unneeded sections
-
-        contentParent = soup.find('div', {'id' : 'maincontent'}).find('div', {'id' : 'general'})
-
-        contentParent.find('p').decompose() # remove page header
-        contentParent.find_all('div',id='general')[2].decompose() #remove page footer
-        contentParent.find_all('div',id='general')[0].decompose() #remove chapter select etc.
-
-        contentParent.name='div'
-
-        #error on failure
-        if None == contentParent:
-            raise exceptions.FailedToDownload("Error downloading Chapter: %s!  Missing required element!" % url)
-
-        return self.utf8FromSoup(url,contentParent)
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index 14728c5a..ff1dc2f2 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -3495,9 +3495,6 @@ add_to_titlepage_entries:,views, averageWords, fandoms
 ## parameter to your personal.ini and list the ones you don't want.
 #exclude_notes:authornotes,newsboxes,spoilers,footnotes
 
-[www.silmarillionwritersguild.org]
-use_basic_cache:true
-
 [www.siye.co.uk]
 use_basic_cache:true
 ## Site dedicated to these categories/characters/ships