From 60ff1986fa98b5185b74cbd5ffe5bbc44cb134e5 Mon Sep 17 00:00:00 2001
From: user39189
Date: Tue, 15 Dec 2015 23:28:51 -0500
Subject: [PATCH] Add mcstories.com site adapter

---
 calibre-plugin/plugin-defaults.ini          |   4 +
 fanficfare/adapters/__init__.py             |   1 +
 fanficfare/adapters/adapter_mcstoriescom.py | 221 ++++++++++++++++++++
 fanficfare/defaults.ini                     |   4 +
 4 files changed, 230 insertions(+)
 create mode 100644 fanficfare/adapters/adapter_mcstoriescom.py

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index d4606e9e..c4c19c8e 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1302,6 +1302,10 @@ extracategories:Lord of the Rings
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter
 
+[mcstories.com]
+## Clear FanFiction from defaults, site is original fiction.
+extratags:Erotica
+
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config. In
diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py
index bbf81a64..981e4444 100644
--- a/fanficfare/adapters/__init__.py
+++ b/fanficfare/adapters/__init__.py
@@ -139,6 +139,7 @@ import adapter_forumquestionablequestingcom
 import adapter_ninelivesarchivecom
 import adapter_masseffect2in
 import adapter_quotevcom
+import adapter_mcstoriescom
 
 ## This bit of complexity allows adapters to be added by just adding
 ## importing. It eliminates the long if/else clauses we used to need
diff --git a/fanficfare/adapters/adapter_mcstoriescom.py b/fanficfare/adapters/adapter_mcstoriescom.py
new file mode 100644
index 00000000..ba8b60d8
--- /dev/null
+++ b/fanficfare/adapters/adapter_mcstoriescom.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import time
+import logging
+logger = logging.getLogger(__name__)
+import re
+import urllib2
+import urlparse
+import time
+import os
+
+from bs4.element import Comment
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+
+from base_adapter import BaseSiteAdapter, makeDate
+
+class MCStoriesComSiteAdapter(BaseSiteAdapter):
+    """Site adapter for stories hosted on mcstories.com."""
+
+    def __init__(self, config, url):
+        """
+        Record the story ID and point the adapter at the story's chapter
+        index page (/STORY_ID/index.html).
+        """
+        BaseSiteAdapter.__init__(self, config, url)
+
+        self.decode = ["utf8",
+                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
+                                       # Most sites that claim to be
+                                       # iso-8859-1 (and some that claim to be
+                                       # utf8) are really windows-1252.
+
+        self.story.setMetadata('siteabbrev','mcstories')
+
+        # Extract story ID from base URL, http://mcstories.com/STORY_ID/index.html
+        storyId = self.parsedUrl.path.split('/',)[1]
+        self.story.setMetadata('storyId', storyId)
+
+        # Normalize story URL to the chapter index page (.../index.html).
+        # A bare story directory without a trailing slash
+        # (http://mcstories.com/STORY_ID) gets the slash added first so it
+        # normalizes like the other accepted forms.
+        if url.endswith('/' + storyId):
+            url += '/'
+        url = re.sub(r"/([a-zA-Z0-9_-]+\.html)?$", "/index.html", url)
+
+        ## set url
+        self._setURL(url)
+
+        # The date format will vary from site to site.
+        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+        self.dateformat = '%d %B %Y'
+
+    @staticmethod
+    def getSiteDomain():
+        return 'mcstories.com'
+
+    @classmethod
+    def getAcceptDomains(cls):
+        return ['mcstories.com',
+                'www.mcstories.com']
+
+    @classmethod
+    def getSiteExampleURLs(cls):
+        return "http://mcstories.com/StoryTitle http://mcstories.com/StoryTitle/index.html http://mcstories.com/StoryTitle/StoryTitle1.html"
+
+    def getSiteURLPattern(self):
+        # The story directory may be given with or without a trailing slash
+        # (see getSiteExampleURLs), so accept either "/" or end of string
+        # after the story ID.
+        return r"https?://(www\.)?mcstories\.com/([a-zA-Z0-9_-]+)(/|$)"
+
+    def extractChapterUrlsAndMetadata(self):
+        """
+        Chapters are located at /StoryName/StoryName.html (for single-chapter
+        stories), or /StoryName/StoryName#.html for multiple chapters (# is a
+        non-padded incrementing number, like StoryName1, StoryName2.html, ...,
+        StoryName10.html)
+
+        The story metadata page is at /StoryName/index.html , including a list
+        of chapters.
+
+        Raises AdultCheckRequired unless is_adult is set, StoryDoesNotExist
+        when the index page returns 404, and FailedToDownload when the page
+        lacks the title, author link or single-chapter link.
+        """
+
+        if not (self.is_adult or self.getConfig("is_adult")):
+            raise exceptions.AdultCheckRequired(self.url)
+
+        try:
+            data1 = self._fetchUrl(self.url)
+        except urllib2.HTTPError as e:
+            if e.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            # Bare raise keeps the original traceback.
+            raise
+
+        soup1 = self.make_soup(data1)
+        # strip comments from soup
+        for comment in soup1.findAll(text=lambda text:isinstance(text, Comment)):
+            comment.extract()
+
+        # Extract metadata
+        title = soup1.find('h3', class_='title')
+        byline = soup1.find('h3', class_='byline')
+        if title is None or byline is None or byline.a is None:
+            raise exceptions.FailedToDownload(
+                "Error downloading story: %s!  Missing required element!" % self.url)
+        self.story.setMetadata('title', title.text)
+
+        # Author
+        author = byline.a
+        authorurl = urlparse.urljoin(self.url, author['href'])
+        self.story.setMetadata('author', author.text)
+        self.story.setMetadata('authorUrl', authorurl)
+        authorid = os.path.splitext(os.path.basename(authorurl))[0]
+        self.story.setMetadata('authorId', authorid)
+
+        # Description (skipped when the page has no synopsis section)
+        synopsis = soup1.find('section', class_='synopsis')
+        if synopsis is not None:
+            description = "\n\n".join([p.text for p in synopsis.findAll('p')])
+            self.story.setMetadata('description', description)
+
+        # Tags (skipped when the page has no story codes block)
+        codesDiv = soup1.find('div', class_="storyCodes")
+        if codesDiv is not None:
+            for a in codesDiv.findAll('a'):
+                self.story.addToList('category', a.text)
+
+        # Publish and update dates
+        publishdate = None
+        updatedate = None
+        datelines = soup1.findAll('h3', class_='dateline')
+        for dateline in datelines:
+            if dateline.text.startswith('Added '):
+                publishdate = makeDate(dateline.text, "Added " + self.dateformat)
+            elif dateline.text.startswith('Updated '):
+                updatedate = makeDate(dateline.text, "Updated " + self.dateformat)
+
+        if publishdate is not None: self.story.setMetadata('datePublished', publishdate)
+        if updatedate is not None: self.story.setMetadata('dateUpdated', updatedate)
+
+        # Get chapter URLs
+        self.chapterUrls = []
+        chapterTable = soup1.find('table', class_='index')
+
+        if chapterTable is not None:
+            # Multi-chapter story; rows without a linked cell are skipped.
+            for row in chapterTable.findAll('tr'):
+                chapterCell = row.td
+                link = chapterCell.a if chapterCell is not None else None
+                if link is not None:
+                    chapterUrl = urlparse.urljoin(self.url, link['href'])
+                    self.chapterUrls.append((link.text, chapterUrl))
+        else:
+            # Single chapter
+            chapterDiv = soup1.find('div', class_='chapter')
+            if chapterDiv is None or chapterDiv.a is None:
+                raise exceptions.FailedToDownload(
+                    "Error downloading story: %s!  Missing required element!" % self.url)
+            chapterTitle = chapterDiv.a.text
+            chapterUrl = urlparse.urljoin(self.url, chapterDiv.a['href'])
+            self.chapterUrls = [(chapterTitle, chapterUrl)]
+
+        self.story.setMetadata('numChapters', len(self.chapterUrls))
+
+        logger.debug("Story: <%s>", self.story)
+
+        return
+
+    def getChapterText(self, url):
+        """
+        Clean up a mcstories chapter page.
+
+        All content is in article#mcstories, with chapter headers in h3
+
+        Raises FailedToDownload when the page has no article#mcstories.
+        """
+        logger.debug('Getting chapter text from <%s>', url)
+        data1 = self._fetchUrl(url)
+        soup1 = self.make_soup(data1)
+
+        # strip comments from soup
+        for comment in soup1.findAll(text=lambda text:isinstance(text, Comment)):
+            comment.extract()
+
+        # get story text
+        story1 = soup1.find('article', id='mcstories')
+        if story1 is None:
+            raise exceptions.FailedToDownload(
+                "Error downloading Chapter: %s!  Missing required element!" % url)
+
+        # Remove duplicate name and author headers
+        for h3 in story1.findAll('h3'):
+            h3.extract()
+
+        storytext = self.utf8FromSoup(url, story1)
+
+        return storytext
+
+
+def getClass():
+    return MCStoriesComSiteAdapter
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index 166462e4..1a1cd154 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1290,6 +1290,10 @@ extracategories:Lord of the Rings
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter
 
+[mcstories.com]
+## Clear FanFiction from defaults, site is original fiction.
+extratags:Erotica
+
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config. In