Add mcstories.com site adapter

2025-12-25 18:24:49 +01:00 · 2015-12-15 23:28:51 -05:00 · 2015-12-15 23:28:51 -05:00 · 60ff1986fa
commit 60ff1986fa
parent b314c915d4
4 changed files with 198 additions and 0 deletions
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@ -1302,6 +1302,10 @@ extracategories:Lord of the Rings
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter

+[mcstories.com]
+## Override the default extratags to drop FanFiction; this site hosts original fiction.
+extratags:Erotica
+
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In
--- a/fanficfare/adapters/init.py
+++ b/fanficfare/adapters/init.py
@ -139,6 +139,7 @@ import adapter_forumquestionablequestingcom
 import adapter_ninelivesarchivecom
 import adapter_masseffect2in
 import adapter_quotevcom
+import adapter_mcstoriescom

 ## This bit of complexity allows adapters to be added by just adding
 ## importing.  It eliminates the long if/else clauses we used to need
--- a/fanficfare/adapters/adapter_mcstoriescom.py
+++ b/fanficfare/adapters/adapter_mcstoriescom.py
@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2013 Fanficdownloader team, 2015 FanFicFare team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import time
+import logging
+logger = logging.getLogger(__name__)
+import re
+import urllib2
+import urlparse
+import time
+import os
+
+from bs4.element import Comment
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+
+from base_adapter import BaseSiteAdapter, makeDate
+
+class MCStoriesComSiteAdapter(BaseSiteAdapter):
+
+    def __init__(self, config, url):
+        BaseSiteAdapter.__init__(self, config, url)
+
+        self.decode = ["utf8",
+                       "Windows-1252"] # 1252 is a superset of iso-8859-1.
+                               # Most sites that claim to be
+                               # iso-8859-1 (and some that claim to be
+                               # utf8) are really windows-1252.
+
+        self.story.setMetadata('siteabbrev','mcstories')
+
+        # Normalize story URL to the chapter index page (.../index.html)
+        url = re.sub("/([a-zA-Z0-9_-]+.html)?$", "/index.html", url)
+
+        # Extract story ID from base URL, http://mcstories.com/STORY_ID/index.html
+        storyId = self.parsedUrl.path.split('/',)[1]
+        self.story.setMetadata('storyId', storyId)
+
+        ## set url
+        self._setURL(url)
+
+        # The date format will vary from site to site.
+        # http://docs.python.org/library/datetime.html#strftime-strptime-behavior
+        self.dateformat = '%d %B %Y'
+
+    @staticmethod
+    def getSiteDomain():
+        return 'mcstories.com'
+
+    @classmethod
+    def getAcceptDomains(cls):
+        return ['mcstories.com',
+                'www.mcstories.com']
+
+    @classmethod
+    def getSiteExampleURLs(cls):
+        return "http://mcstories.com/StoryTitle http://mcstories.com/StoryTitle/index.html http://mcstories.com/StoryTitle/StoryTitle1.html"
+
+    def getSiteURLPattern(self):
+        return r"https?://(www\.)?mcstories\.com/([a-zA-Z0-9_-]+)/"
+
+    def extractChapterUrlsAndMetadata(self):
+        """
+        Chapters are located at /StoryName/StoryName.html (for single-chapter
+        stories), or /StoryName/StoryName#.html for multiple chapters (# is a
+        non-padded incrementing number, like StoryName1, StoryName2.html, ...,
+        StoryName10.html)
+
+        The story metadata page is at /StoryName/index.html , including a list
+        of chapters.
+        """
+
+        if not (self.is_adult or self.getConfig("is_adult")):
+            raise exceptions.AdultCheckRequired(self.url)
+
+        try:
+            data1 = self._fetchUrl(self.url)
+            soup1 = self.make_soup(data1)
+            #strip comments from soup
+            [comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, Comment))]
+        except urllib2.HTTPError, e:
+            if e.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            else:
+                raise e
+
+        # Extract metadata
+        title = soup1.find('h3', class_='title')
+        self.story.setMetadata('title', title.text)
+
+        # Author
+        author = soup1.find('h3', class_='byline').a
+        authorurl = urlparse.urljoin(self.url, author['href'])
+        self.story.setMetadata('author', author.text)
+        self.story.setMetadata('authorUrl', authorurl)
+        authorid = os.path.splitext(os.path.basename(authorurl))[0]
+        self.story.setMetadata('authorId', authorid)
+
+        # Description
+        synopsis = soup1.find('section', class_='synopsis')
+        description = "\n\n".join([p.text for p in synopsis.findAll('p')])
+        self.story.setMetadata('description', description)
+
+        # Tags
+        codesDiv = soup1.find('div', class_="storyCodes")
+        for a in codesDiv.findAll('a'):
+            self.story.addToList('category', a.text)
+
+        # Publish and update dates
+        publishdate = None
+        updatedate = None
+        datelines = soup1.findAll('h3', class_='dateline')
+        for dateline in datelines:
+            if dateline.text.startswith('Added '):
+                publishdate = makeDate(dateline.text, "Added " + self.dateformat)
+            elif dateline.text.startswith('Updated '):
+                updatedate = makeDate(dateline.text, "Updated " + self.dateformat)
+
+        if publishdate is not None: self.story.setMetadata('datePublished', publishdate)
+        if updatedate is not None: self.story.setMetadata('dateUpdated', updatedate)
+
+        # Get chapter URLs
+        self.chapterUrls = []
+        chapterTable = soup1.find('table', class_='index')
+
+        if chapterTable is not None:
+            # Multi-chapter story
+            chapterRows = chapterTable.findAll('tr')
+
+            for row in chapterRows:
+                chapterCell = row.td
+                if chapterCell is not None:
+                    link = chapterCell.a
+                    chapterTitle = link.text
+                    chapterUrl = urlparse.urljoin(self.url, link['href'])
+                    self.chapterUrls.append((chapterTitle, chapterUrl))
+        else:
+            # Single chapter
+            chapterDiv = soup1.find('div', class_='chapter')
+            chapterTitle = chapterDiv.a.text
+            chapterUrl = urlparse.urljoin(self.url, chapterDiv.a['href'])
+            self.chapterUrls = [(chapterTitle, chapterUrl)]
+
+        self.story.setMetadata('numChapters', len(self.chapterUrls))
+
+        logger.debug("Story: <%s>", self.story)
+
+        return
+
+    def getChapterText(self, url):
+        """
+        Clean up a mcstories chapter page.
+
+        All content is in article#mcstories, with chapter headers in h3
+        """
+        logger.debug('Getting chapter text from <%s>' % url)
+        data1 = self._fetchUrl(url)
+        soup1 = self.make_soup(data1)
+
+        #strip comments from soup
+        [comment.extract() for comment in soup1.findAll(text=lambda text:isinstance(text, Comment))]
+
+        # get story text
+        story1 = soup1.find('article', id='mcstories')
+
+        # Remove duplicate name and author headers
+        [h3.extract() for h3 in story1.findAll('h3')]
+
+        storytext = self.utf8FromSoup(url, story1)
+
+        return storytext
+
+
+def getClass():
+    """Return the site adapter class defined in this module."""
+    return MCStoriesComSiteAdapter
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -1290,6 +1290,10 @@ extracategories:Lord of the Rings
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter

+[mcstories.com]
+## Override the default extratags to drop FanFiction; this site hosts original fiction.
+extratags:Erotica
+
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In