Add rudimentary support for `www.masseffect2.in'.

Status: usable, but needs various enhancements and refactoring. Implemented: * Downloading of whole stories given a chapter URL. * Automatic chapter numbering extraction and title generation. * Author identification. * Word and chapter (not chapter parts) counting. * Genre, character, and rating detection (in basic cases).
2026-05-07 12:02:25 +02:00 · 2015-07-21 19:59:02 +03:00 · 2015-07-21 19:59:02 +03:00 · 707f7a347b
commit 707f7a347b
parent cae4a74578
3 changed files with 570 additions and 0 deletions
--- a/fanficfare/adapters/init.py
+++ b/fanficfare/adapters/init.py
@ -135,6 +135,7 @@ import adapter_fanfictionjunkiesde
 import adapter_devianthearts
 import adapter_tgstorytimecom
 import adapter_itcouldhappennet
+import adapter_masseffect2in

 ## This bit of complexity allows adapters to be added by just adding
 ## importing.  It eliminates the long if/else clauses we used to need
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@ -0,0 +1,559 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2011 Fanficdownloader team,
+#           2015 FanFicFare team,
+#           2015 Dmitry Kozliuk
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import logging
+import re
+import urllib2
+import codecs
+
+from .. import BeautifulSoup as bs
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+from base_adapter import BaseSiteAdapter, makeDate
+
+
+_logger = logging.getLogger(__name__)
+
+
+def getClass():
+    """Returns adapter class defined in this module."""
+    return MassEffect2InAdapter
+
+
+class ParsingError(Exception):
+    """Indicates an error while parsing web page content."""
+    def __init__(self, message):
+        Exception.__init__(self)
+        self.message = message
+
+
+class MassEffect2InAdapter(BaseSiteAdapter):
+    """Provides support for masseffect2.in site as story source.
+    Can be used as a template for sites build upon Ucoz.com engine.
+    Specializations:
+        1) Russian content (date format, genre names, etc.);
+        2) original `R.A.T.I.N.G.' rating scale, used by masseffect2.in
+           and some affiliated sites."""
+
+    WORD_PATTERN = re.compile(u'\w+', re.UNICODE)
+
+    DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+')
+
+    # Various `et cetera' and `et al' forms in Russian texts.
+    # Intended to be used with whole strings!
+    ETC_PATTERN = re.compile(
+        u'''[и&]\s(?:
+              (?:т\.?\s?[пд]\.?)|
+              (?:др(?:угие|\.)?)|
+              (?:пр(?:очие|\.)?)|
+              # Note: identically looking letters `K' and `o'
+              # below are from Latin and Cyrillic alphabets.
+              (?:ко(?:мпания)?|[KК][oо°])
+            )$
+        ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    CHAPTER_NUMBER_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?:глава)?  # `Chapter' in Russian.
+            \s
+            (?P<chapterIndex>\d+)
+            (?:
+              (?:
+                # For `X.Y' and `X-Y' numbering styles:
+                [\-\.]|
+                # For `Chapter X (part Y)' and similar numbering styles:
+                [\.,]?\s
+                (?P<brace>\()?
+                (?:часть)?      # `Part' in Russian.
+                \s
+              )
+              (?P<partIndex>\d+)
+              (?(brace)\))
+            )?
+            [\.:\s]*
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    PROLOGUE_EPILOGUE_PATTERN = re.compile(
+        u'''[\.:\s]*         # Optional separators.
+            (пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
+            [\.:\s]*         # Optional separators.
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    def __init__(self, config, url):
+        BaseSiteAdapter.__init__(self, config, url)
+
+        self.decode = ["utf8"]
+        self.dateformat = "%d.%m.%Y"
+
+        self.story.setMetadata('siteabbrev', 'me2in')
+
+        self.story.setMetadata('storyId', self._extractDocumentId(self.url))
+
+        self._setURL(self._makeUrl(self.story.getMetadata('storyId')))
+
+        self._transient_metadata = {}
+
+        # Memory cache of document HTML parsing results.  Increases performance
+        # drastically, because all downloaded pages are parsed at least twice.
+        # FIXME: Can be simplified when BS is updated to 4.4 with cloning.
+        self._parsing_cache = {}
+
+    @classmethod
+    def _makeUrl(cls, chapterId):
+        """Makes a chapter URL given a chapter ID."""
+        return 'http://%s/publ/%s' % (cls.getSiteDomain(), chapterId)
+
+    # Must be @staticmethod, not @classmethod!
+    @staticmethod
+    def getSiteDomain():
+        return 'www.masseffect2.in'
+
+    @classmethod
+    def getSiteExampleURLs(cls):
+        return u' '.join([cls._makeUrl('19-1-0-1234'),
+                          cls._makeUrl('24-1-0-4321')])
+
+    def getSiteURLPattern(self):
+        return re.escape(self._makeUrl('')) + self.DOCUMENT_ID_PATTERN.pattern
+
+    def use_pagecache(self):
+        """Allows use of downloaded page cache.  It is essential for this
+        adapter, because the site does not offers chapter URL list, and many
+        pages have to be fetched and parsed repeatedly."""
+        return True
+
+    def extractChapterUrlsAndMetadata(self):
+        """Extracts chapter URLs and story metadata.  Actually downloads all
+        chapters, which is not exactly right, but necessary due to technical
+        limitations of the site."""
+
+        def followLinks(document, selector):
+            """Downloads chapters one by one by locating and following links
+            specified by a selector.  Returns chapters' URLs in order they
+            were found."""
+            block = document\
+                .find('td', {'class': 'eDetails1'})\
+                .find('div', selector)
+            if not block:
+                return
+            link = block.find('a')
+            if not link:
+                return
+            chapterId = self._extractDocumentId(link['href'])
+            url = self._makeUrl(chapterId)
+            try:
+                chapter = self._loadDocument(url)
+            except urllib2.HTTPError, error:
+                if error.code == 404:
+                    raise exceptions.FailedToDownload(
+                        u'Error downloading chapter: %s!' % url)
+                raise
+            yield url
+            for url in followLinks(chapter, selector):
+                yield url
+
+        def followPreviousLinks(document):
+            """Downloads chapters following `Previous chapter' links.
+            Returns a list of chapters' URLs."""
+            urls = list(followLinks(document, {'class': 'fl tal'}))
+            return list(reversed(urls))
+
+        def followNextLinks(document):
+            """Downloads chapters following `Next chapter' links.
+            Returns a list of chapters' URLs."""
+            return list(followLinks(document, {'class': 'tar fr'}))
+
+        try:
+            document = self._loadDocument(self.url)
+        except urllib2.HTTPError, error:
+            if error.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            raise
+        # There is no convenient mechanism to obtain URLs of all chapters
+        # other than navigating to previous and next chapters using links
+        # located on each chapter page.
+        chapters = \
+            followPreviousLinks(document) + \
+            [self.url] + \
+            followNextLinks(document)
+
+        # Transient metadata is updated when parsing each chapter,
+        # then converted and saved to story metadata.
+        self._transient_metadata = {
+            # We only have one date for each chapter and assume the oldest one
+            # to be publication date and the most recent one to be update date.
+            'datePublished': datetime.datetime.max,
+            'dateUpdated': datetime.datetime.min,
+
+            'numWords': 0,
+
+            # We aim at counting chapters, not chapter parts.
+            'numChapters': 0
+        }
+
+        for url in chapters:
+            chapter = self._loadDocument(url)
+            _logger.debug(u"Parsing chapter `%s'", url)
+            self._parseChapterMetadata(url, chapter)
+
+        # Attributes are handled separately due to format conversions.
+        self.story.setMetadata(
+            'datePublished', self._transient_metadata['datePublished'])
+        self.story.setMetadata(
+            'dateUpdated', self._transient_metadata['dateUpdated'])
+        self.story.setMetadata(
+            'numWords', str(self._transient_metadata['numWords']))
+        self.story.setMetadata(
+            'numChapters', self._transient_metadata['numChapters'])
+
+    def getChapterText(self, url):
+        """Grabs the text for an individual chapter."""
+        element = self._getChapterTextElement(url)
+        return self.utf8FromSoup(url, element)
+
+    def _parseChapterMetadata(self, url, document):
+        try:
+            self._parseTitle(url, document)
+            infoBar = document.find('td', {'class': 'eDetails2'})
+            if not infoBar:
+                raise ParsingError(u'No informational bar found.')
+            if not self.story.getMetadata('authorId'):
+                self._parseAuthor(infoBar)
+            self._parseDates(infoBar)
+            self._parseTextForWordCount(url)
+            self._parseAttributes(document)
+        except ParsingError, error:
+            raise exceptions.FailedToDownload(
+                u"Error parsing `%s'.  %s" % (url, error.message))
+
+    def _parseAttributes(self, document):
+        try:
+            elements = document \
+                .find('div', {'class': 'comm-div'}) \
+                .findNextSibling('div', {'class': 'cb'}) \
+                .nextGenerator()
+            attributesText = u''
+            for element in elements:
+                if not element:
+                    _logger.warning(u'Attribute block not terminated!')
+                    break
+                if isinstance(element, bs.Tag):
+                    # Although deprecated, `has_key()' is required here.
+                    if element.name == 'div' and \
+                            element.has_key('class') and \
+                            element['class'] == 'cb':
+                        break
+                    elif element.name == 'img':
+                        self._parseRatingFromImage(element)
+                else:
+                    attributesText += stripHTML(element)
+        except AttributeError or TypeError:
+            raise ParsingError(u'Failed to locate and collect attributes.')
+
+        for record in re.split(u';|\.', attributesText):
+            parts = record.split(u':', 1)
+            if len(parts) < 2:
+                continue
+            key = parts[0].strip().lower()
+            value = parts[1].strip().strip(u'.')
+            self._parseAttribute(key, value)
+
+    def _parseRatingFromImage(self, element):
+        """Given an image element, tries to parse story rating from it."""
+        # FIXME: This should probably be made adjustable via settings.
+        ratings = {
+            'E': u'Exempt (18+)',
+            'R': u'Restricted (16+)',
+            'A': u'Иная история',
+            'T': u'To every',
+            'I': u'Art house',
+            'Nn': u'Новый мир',
+            'G': u'О, господи!',
+        }
+        ratings['IN'] = ratings['A']
+
+        # Although deprecated, `has_key()' is required here.
+        if not element.has_key('src'):
+            return
+        source = element['src']
+        if 'REITiNG' not in source:
+            return
+        match = re.search(u'/(?P<rating>[ERATINnG]+)\.png$', source)
+        if not match:
+            return
+        symbol = match.group('rating')
+        if symbol == 'IN':
+            symbol = 'A'
+        if symbol in ratings:
+            rating = ratings[symbol]
+            self.story.setMetadata('rating', rating)
+            if symbol in ('R', 'E'):
+                self.is_adult = True
+
+    def _parseAttribute(self, key, value):
+        """Parses a single known attribute value for chapter metadata."""
+
+        def refineCharacter(name):
+            """Refines character name from stop-words and distortions."""
+            strippedName = name.strip()
+            nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName)
+            # TODO: extract canonical name (even ME-specific?).
+            canonicalName = nameOnly
+            return canonicalName
+
+        if key == u'жанр':
+            definitions = value.split(u',')
+            if len(definitions) > 4:
+                _logger.warning(u'Possibly incorrect genre detection!')
+            for definition in definitions:
+                genres = definition.split(u'/')
+                self.story.extendList('genre', genres)
+        elif key == u'статус':
+            status = 'In-Progress' if value == u'в процессе' else 'Completed'
+            self.story.setMetadata('status', status)
+        elif key == u'персонажи':
+            characters = [refineCharacter(name) for name in value.split(u',')]
+            self.story.extendList('characters', characters)
+        else:
+            _logger.debug(u"Unrecognized attribute `%s'.", key)
+
+    def _parseTextForWordCount(self, url):
+        element = self._getChapterTextElement(url)
+        text = stripHTML(element)
+        count = len(re.findall(self.WORD_PATTERN, text))
+        self._transient_metadata['numWords'] += count
+        pass
+
+    def _parseDates(self, infoBar):
+        try:
+            dateText = infoBar \
+                .find('i', {'class': 'icon-eye'}) \
+                .findPreviousSibling(text=True) \
+                .strip(u'| \n')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate date.')
+        date = makeDate(dateText, self.dateformat)
+        if date > self._transient_metadata['dateUpdated']:
+            self._transient_metadata['dateUpdated'] = date
+        if date < self._transient_metadata['datePublished']:
+            self._transient_metadata['datePublished'] = date
+
+    def _parseAuthor(self, strip):
+        try:
+            authorLink = strip \
+                .find('i', {'class': 'icon-user'}) \
+                .findNextSibling('a')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate author link.')
+        match = re.search(u'(8-\d+)', authorLink['onclick'])
+        if not match:
+            raise ParsingError(u'Failed to extract author ID.')
+        authorId = match.group(0)
+        authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
+        authorName = stripHTML(authorLink.text)
+        self.story.setMetadata('authorId', authorId)
+        self.story.setMetadata('authorUrl', authorUrl)
+        self.story.setMetadata('author', authorName)
+
+    def _parseTitle(self, url, document):
+        try:
+            fullTitle = stripHTML(
+                document.find('div', {'class': 'eTitle'}).string)
+        except AttributeError:
+            raise ParsingError(u'Failed to locate title.')
+        parsedHeading = self._parseHeading(fullTitle)
+        if not self.story.getMetadata('title'):
+            self.story.setMetadata('title', parsedHeading['storyTitle'])
+        if 'chapterIndex' in parsedHeading:
+            self._transient_metadata['numChapters'] = max(
+                self._transient_metadata['numChapters'],
+                parsedHeading['chapterIndex'])
+        else:
+            self._transient_metadata['numChapters'] += 1
+        self.chapterUrls.append((parsedHeading['chapterTitle'], url))
+
+    def _parseHeading(self, fullTitle):
+        """Extracts meaningful parts from full chapter heading with.
+        Returns a dictionary containing `storyTitle', `chapterTitle'
+        (including numbering if allowed by settings, may be the same as
+        `storyTitle' for short stories), `chapterIndex' (optional, may be
+        zero), and `partIndex' (optional, chapter part, may be zero).
+        When no dedicated chapter title is present, generates one based on
+        chapter and part indices.  Correctly handles `prologue' and `epilogue'
+        cases."""
+        match = re.search(self.CHAPTER_NUMBER_PATTERN, fullTitle)
+        if match:
+            chapterIndex = int(match.group('chapterIndex'))
+            # There are cases with zero chapter or part number (e. g.:
+            # numbered prologue, not to be confused with just `Prologue').
+            if match.group('partIndex'):
+                partIndex = int(match.group('partIndex'))
+            else:
+                partIndex = None
+            chapterTitle = fullTitle[match.end():].strip()
+            if chapterTitle:
+                if self.getConfig('strip_chapter_numbers', False) \
+                        and not self.getConfig('add_chapter_numbers', False):
+                    if partIndex is not None:
+                        title = u'%d.%d %s' % \
+                                (chapterIndex, partIndex, chapterTitle)
+                    else:
+                        title = u'%d. %s' % (chapterIndex, chapterTitle)
+                else:
+                    title = chapterTitle
+            else:
+                title = u'Глава %d' % chapterIndex
+                if partIndex:
+                    title += u' (часть %d)' % partIndex
+
+            # For seldom found cases like `Story: prologue and chapter 1'.
+            storyTitle = fullTitle[:match.start()]
+            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle)
+            if match:
+                matches = list(
+                    re.finditer(u'[:\.]', storyTitle))
+                if matches:
+                    realStoryTitleEnd = matches[-1].start()
+                    if realStoryTitleEnd >= 0:
+                        storyTitle = storyTitle[:realStoryTitleEnd]
+                    else:
+                        _logger.warning(
+                            u"Title contains `%s', suspected to be part of "
+                            u"numbering, but no period (`.') before it.  "
+                            u"Full title is preserved." % storyTitle)
+
+            result = {
+                'storyTitle': storyTitle,
+                'chapterTitle': title,
+                'chapterIndex': chapterIndex
+            }
+            if partIndex is not None:
+                result['partIndex'] = partIndex
+            return result
+
+        match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, fullTitle)
+        if match:
+            storyTitle = fullTitle[:match.start()]
+            chapterTitle = fullTitle[match.end():].strip()
+            matchedText = fullTitle[match.start():match.end()]
+            if chapterTitle:
+                title = u'%s. %s' % (matchedText, chapterTitle)
+            else:
+                title = matchedText
+            return {
+                'storyTitle': storyTitle,
+                'chapterTitle': title
+            }
+
+        return {
+            'storyTitle': fullTitle,
+            'chapterTitle': fullTitle
+        }
+
+    def _loadDocument(self, url):
+        """Fetches URL content and returns its element tree
+        with parsing settings tuned for MassEffect2.in."""
+        documentId = self._extractDocumentId(url)
+        if documentId in self._parsing_cache:
+            _logger.debug(u"Memory cache HIT for parsed `%s'", url)
+            return self._parsing_cache[documentId]['document']
+        else:
+            _logger.debug(u"Memory cache MISS for parsed `%s'", url)
+            document = bs.BeautifulStoneSoup(
+                self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
+            self._parsing_cache[documentId] = {'document': document}
+            return document
+
+    def _fetchUrl(self, url,
+                  parameters=None,
+                  usecache=True,
+                  extrasleep=None):
+        """Fetches URL contents, see BaseSiteAdapter for details.
+        Overridden to support on-disk cache when debugging Calibre."""
+        from calibre.constants import DEBUG
+        if DEBUG:
+            import os
+            documentId = self._extractDocumentId(url)
+            path = u'./cache/%s' % documentId
+            if os.path.isfile(path) and os.access(path, os.R_OK):
+                _logger.debug(u"On-disk cache HIT for `%s'.", url)
+                with codecs.open(path, encoding='utf-8') as input:
+                    return input.read()
+            else:
+                _logger.debug(u"On-disk cache MISS for `%s'.", url)
+
+        content = BaseSiteAdapter._fetchUrl(
+            self, url, parameters, usecache, extrasleep)
+
+        if DEBUG:
+            import os
+            if os.path.isdir(os.path.dirname(path)):
+                _logger.debug(u"Caching `%s' content on disk.", url)
+                with codecs.open(path, mode='w', encoding='utf-8') as output:
+                    output.write(content)
+
+        return content
+
+    def _extractDocumentId(self, url):
+        """Extracts document ID from MassEffect2.in URL."""
+        match = re.search(self.DOCUMENT_ID_PATTERN, url)
+        if not match:
+            raise ValueError(u"Failed to extract document ID from `'" % url)
+        documentId = url[match.start():match.end()]
+        return documentId
+
+    def _getChapterTextElement(self, url):
+        """Fetches URL content and extracts an element containing text body.
+        Shall be used instead of `__collectTextElements'."""
+        documentId = self._extractDocumentId(url)
+        document = self._loadDocument(url)
+        cache = self._parsing_cache[documentId]
+        if 'body' in cache:
+            return cache['body']
+        else:
+            body = self.__collectTextElements(document)
+            cache['body'] = body
+            return body
+
+    def __collectTextElements(self, document):
+        """Returns all elements containing parts of chapter text (which may be
+        <p>aragraphs, <div>isions or plain text nodes) under a single root."""
+        starter = document.find('div', {'id': u'article'})
+        if starter is None:
+            # FIXME: This will occur if the method is called more than once.
+            # The reason is elements appended to `root' are removed from
+            # the document. BS 4.4 implements cloning via `copy.copy()',
+            # but supporting it for earlier versions is error-prone
+            # (due to relying on BS internals).
+            raise ParsingError(u'Failed to locate text.')
+        collection = [starter]
+        for element in starter.nextSiblingGenerator():
+            if element is None:
+                break
+            if isinstance(element, bs.Tag) and element.name == 'tr':
+                break
+            collection.append(element)
+        root = bs.Tag(document, 'td')
+        for element in collection:
+            root.append(element)
+        return root
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@ -1727,6 +1727,16 @@ extraships:InuYasha/Kagome
 ## Site dedicated to these categories/characters/ships
 extracategories:Lord of the Rings

+[www.masseffect2.in]
+## Site with stories in this language.
+language:Russian
+## Site dedicated to this fandom.
+extracategories:Mass Effect
+
+## Stories on the site almost never have cover image.
+## May be adjusted in `personal.ini' on per-story basis.
+cover_exclusion_regexp:.*
+
 [www.mediaminer.org]

 [www.midnightwhispers.ca]