From 707f7a347bfffd80a12e806c255c4b4e24f29dfa Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Tue, 21 Jul 2015 19:59:02 +0300
Subject: [PATCH 01/18] Add rudimentary support for `www.masseffect2.in'.

Status: usable, but needs various enhancements and refactoring.

Implemented:
    * Downloading of whole stories given a chapter URL.
    * Automatic chapter numbering extraction and title generation.
    * Author identification.
    * Word and chapter (not chapter parts) counting.
    * Genre, character, and rating detection (in basic cases).
---
 fanficfare/adapters/__init__.py              |   1 +
 fanficfare/adapters/adapter_masseffect2in.py | 559 +++++++++++++++++++
 fanficfare/defaults.ini                      |  10 +
 3 files changed, 570 insertions(+)
 create mode 100644 fanficfare/adapters/adapter_masseffect2in.py

diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py
index 606e0c50..d0fd6512 100644
--- a/fanficfare/adapters/__init__.py
+++ b/fanficfare/adapters/__init__.py
@@ -135,6 +135,7 @@ import adapter_fanfictionjunkiesde
 import adapter_devianthearts
 import adapter_tgstorytimecom
 import adapter_itcouldhappennet
+import adapter_masseffect2in
 
 ## This bit of complexity allows adapters to be added by just adding
 ## importing.  It eliminates the long if/else clauses we used to need
diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
new file mode 100644
index 00000000..23476951
--- /dev/null
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -0,0 +1,559 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2011 Fanficdownloader team,
+#           2015 FanFicFare team,
+#           2015 Dmitry Kozliuk
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import logging
+import re
+import urllib2
+import codecs
+
+from .. import BeautifulSoup as bs
+from ..htmlcleanup import stripHTML
+from .. import exceptions as exceptions
+from base_adapter import BaseSiteAdapter, makeDate
+
+
+_logger = logging.getLogger(__name__)
+
+
+def getClass():
+    """Returns adapter class defined in this module."""
+    return MassEffect2InAdapter
+
+
+class ParsingError(Exception):
+    """Indicates an error while parsing web page content."""
+    def __init__(self, message):
+        Exception.__init__(self)
+        self.message = message
+
+
+class MassEffect2InAdapter(BaseSiteAdapter):
+    """Provides support for masseffect2.in site as story source.
+    Can be used as a template for sites build upon Ucoz.com engine.
+    Specializations:
+        1) Russian content (date format, genre names, etc.);
+        2) original `R.A.T.I.N.G.' rating scale, used by masseffect2.in
+           and some affiliated sites."""
+
+    WORD_PATTERN = re.compile(u'\w+', re.UNICODE)
+
+    DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+')
+
+    # Various `et cetera' and `et al' forms in Russian texts.
+    # Intended to be used with whole strings!
+    ETC_PATTERN = re.compile(
+        u'''[и&]\s(?:
+              (?:т\.?\s?[пд]\.?)|
+              (?:др(?:угие|\.)?)|
+              (?:пр(?:очие|\.)?)|
+              # Note: identically looking letters `K' and `o'
+              # below are from Latin and Cyrillic alphabets.
+              (?:ко(?:мпания)?|[KК][oо°])
+            )$
+        ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    CHAPTER_NUMBER_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?:глава)?  # `Chapter' in Russian.
+            \s
+            (?P<chapterIndex>\d+)
+            (?:
+              (?:
+                # For `X.Y' and `X-Y' numbering styles:
+                [\-\.]|
+                # For `Chapter X (part Y)' and similar numbering styles:
+                [\.,]?\s
+                (?P<brace>\()?
+                (?:часть)?      # `Part' in Russian.
+                \s
+              )
+              (?P<partIndex>\d+)
+              (?(brace)\))
+            )?
+            [\.:\s]*
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    PROLOGUE_EPILOGUE_PATTERN = re.compile(
+        u'''[\.:\s]*         # Optional separators.
+            (пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
+            [\.:\s]*         # Optional separators.
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    def __init__(self, config, url):
+        BaseSiteAdapter.__init__(self, config, url)
+
+        self.decode = ["utf8"]
+        self.dateformat = "%d.%m.%Y"
+
+        self.story.setMetadata('siteabbrev', 'me2in')
+
+        self.story.setMetadata('storyId', self._extractDocumentId(self.url))
+
+        self._setURL(self._makeUrl(self.story.getMetadata('storyId')))
+
+        self._transient_metadata = {}
+
+        # Memory cache of document HTML parsing results.  Increases performance
+        # drastically, because all downloaded pages are parsed at least twice.
+        # FIXME: Can be simplified when BS is updated to 4.4 with cloning.
+        self._parsing_cache = {}
+
+    @classmethod
+    def _makeUrl(cls, chapterId):
+        """Makes a chapter URL given a chapter ID."""
+        return 'http://%s/publ/%s' % (cls.getSiteDomain(), chapterId)
+
+    # Must be @staticmethod, not @classmethod!
+    @staticmethod
+    def getSiteDomain():
+        return 'www.masseffect2.in'
+
+    @classmethod
+    def getSiteExampleURLs(cls):
+        return u' '.join([cls._makeUrl('19-1-0-1234'),
+                          cls._makeUrl('24-1-0-4321')])
+
+    def getSiteURLPattern(self):
+        return re.escape(self._makeUrl('')) + self.DOCUMENT_ID_PATTERN.pattern
+
+    def use_pagecache(self):
+        """Allows use of downloaded page cache.  It is essential for this
+        adapter, because the site does not offers chapter URL list, and many
+        pages have to be fetched and parsed repeatedly."""
+        return True
+
+    def extractChapterUrlsAndMetadata(self):
+        """Extracts chapter URLs and story metadata.  Actually downloads all
+        chapters, which is not exactly right, but necessary due to technical
+        limitations of the site."""
+
+        def followLinks(document, selector):
+            """Downloads chapters one by one by locating and following links
+            specified by a selector.  Returns chapters' URLs in order they
+            were found."""
+            block = document\
+                .find('td', {'class': 'eDetails1'})\
+                .find('div', selector)
+            if not block:
+                return
+            link = block.find('a')
+            if not link:
+                return
+            chapterId = self._extractDocumentId(link['href'])
+            url = self._makeUrl(chapterId)
+            try:
+                chapter = self._loadDocument(url)
+            except urllib2.HTTPError, error:
+                if error.code == 404:
+                    raise exceptions.FailedToDownload(
+                        u'Error downloading chapter: %s!' % url)
+                raise
+            yield url
+            for url in followLinks(chapter, selector):
+                yield url
+
+        def followPreviousLinks(document):
+            """Downloads chapters following `Previous chapter' links.
+            Returns a list of chapters' URLs."""
+            urls = list(followLinks(document, {'class': 'fl tal'}))
+            return list(reversed(urls))
+
+        def followNextLinks(document):
+            """Downloads chapters following `Next chapter' links.
+            Returns a list of chapters' URLs."""
+            return list(followLinks(document, {'class': 'tar fr'}))
+
+        try:
+            document = self._loadDocument(self.url)
+        except urllib2.HTTPError, error:
+            if error.code == 404:
+                raise exceptions.StoryDoesNotExist(self.url)
+            raise
+        # There is no convenient mechanism to obtain URLs of all chapters
+        # other than navigating to previous and next chapters using links
+        # located on each chapter page.
+        chapters = \
+            followPreviousLinks(document) + \
+            [self.url] + \
+            followNextLinks(document)
+
+        # Transient metadata is updated when parsing each chapter,
+        # then converted and saved to story metadata.
+        self._transient_metadata = {
+            # We only have one date for each chapter and assume the oldest one
+            # to be publication date and the most recent one to be update date.
+            'datePublished': datetime.datetime.max,
+            'dateUpdated': datetime.datetime.min,
+
+            'numWords': 0,
+
+            # We aim at counting chapters, not chapter parts.
+            'numChapters': 0
+        }
+
+        for url in chapters:
+            chapter = self._loadDocument(url)
+            _logger.debug(u"Parsing chapter `%s'", url)
+            self._parseChapterMetadata(url, chapter)
+
+        # Attributes are handled separately due to format conversions.
+        self.story.setMetadata(
+            'datePublished', self._transient_metadata['datePublished'])
+        self.story.setMetadata(
+            'dateUpdated', self._transient_metadata['dateUpdated'])
+        self.story.setMetadata(
+            'numWords', str(self._transient_metadata['numWords']))
+        self.story.setMetadata(
+            'numChapters', self._transient_metadata['numChapters'])
+
+    def getChapterText(self, url):
+        """Grabs the text for an individual chapter."""
+        element = self._getChapterTextElement(url)
+        return self.utf8FromSoup(url, element)
+
+    def _parseChapterMetadata(self, url, document):
+        try:
+            self._parseTitle(url, document)
+            infoBar = document.find('td', {'class': 'eDetails2'})
+            if not infoBar:
+                raise ParsingError(u'No informational bar found.')
+            if not self.story.getMetadata('authorId'):
+                self._parseAuthor(infoBar)
+            self._parseDates(infoBar)
+            self._parseTextForWordCount(url)
+            self._parseAttributes(document)
+        except ParsingError, error:
+            raise exceptions.FailedToDownload(
+                u"Error parsing `%s'.  %s" % (url, error.message))
+
+    def _parseAttributes(self, document):
+        try:
+            elements = document \
+                .find('div', {'class': 'comm-div'}) \
+                .findNextSibling('div', {'class': 'cb'}) \
+                .nextGenerator()
+            attributesText = u''
+            for element in elements:
+                if not element:
+                    _logger.warning(u'Attribute block not terminated!')
+                    break
+                if isinstance(element, bs.Tag):
+                    # Although deprecated, `has_key()' is required here.
+                    if element.name == 'div' and \
+                            element.has_key('class') and \
+                            element['class'] == 'cb':
+                        break
+                    elif element.name == 'img':
+                        self._parseRatingFromImage(element)
+                else:
+                    attributesText += stripHTML(element)
+        except AttributeError or TypeError:
+            raise ParsingError(u'Failed to locate and collect attributes.')
+
+        for record in re.split(u';|\.', attributesText):
+            parts = record.split(u':', 1)
+            if len(parts) < 2:
+                continue
+            key = parts[0].strip().lower()
+            value = parts[1].strip().strip(u'.')
+            self._parseAttribute(key, value)
+
+    def _parseRatingFromImage(self, element):
+        """Given an image element, tries to parse story rating from it."""
+        # FIXME: This should probably be made adjustable via settings.
+        ratings = {
+            'E': u'Exempt (18+)',
+            'R': u'Restricted (16+)',
+            'A': u'Иная история',
+            'T': u'To every',
+            'I': u'Art house',
+            'Nn': u'Новый мир',
+            'G': u'О, господи!',
+        }
+        ratings['IN'] = ratings['A']
+
+        # Although deprecated, `has_key()' is required here.
+        if not element.has_key('src'):
+            return
+        source = element['src']
+        if 'REITiNG' not in source:
+            return
+        match = re.search(u'/(?P<rating>[ERATINnG]+)\.png$', source)
+        if not match:
+            return
+        symbol = match.group('rating')
+        if symbol == 'IN':
+            symbol = 'A'
+        if symbol in ratings:
+            rating = ratings[symbol]
+            self.story.setMetadata('rating', rating)
+            if symbol in ('R', 'E'):
+                self.is_adult = True
+
+    def _parseAttribute(self, key, value):
+        """Parses a single known attribute value for chapter metadata."""
+
+        def refineCharacter(name):
+            """Refines character name from stop-words and distortions."""
+            strippedName = name.strip()
+            nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName)
+            # TODO: extract canonical name (even ME-specific?).
+            canonicalName = nameOnly
+            return canonicalName
+
+        if key == u'жанр':
+            definitions = value.split(u',')
+            if len(definitions) > 4:
+                _logger.warning(u'Possibly incorrect genre detection!')
+            for definition in definitions:
+                genres = definition.split(u'/')
+                self.story.extendList('genre', genres)
+        elif key == u'статус':
+            status = 'In-Progress' if value == u'в процессе' else 'Completed'
+            self.story.setMetadata('status', status)
+        elif key == u'персонажи':
+            characters = [refineCharacter(name) for name in value.split(u',')]
+            self.story.extendList('characters', characters)
+        else:
+            _logger.debug(u"Unrecognized attribute `%s'.", key)
+
+    def _parseTextForWordCount(self, url):
+        element = self._getChapterTextElement(url)
+        text = stripHTML(element)
+        count = len(re.findall(self.WORD_PATTERN, text))
+        self._transient_metadata['numWords'] += count
+        pass
+
+    def _parseDates(self, infoBar):
+        try:
+            dateText = infoBar \
+                .find('i', {'class': 'icon-eye'}) \
+                .findPreviousSibling(text=True) \
+                .strip(u'| \n')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate date.')
+        date = makeDate(dateText, self.dateformat)
+        if date > self._transient_metadata['dateUpdated']:
+            self._transient_metadata['dateUpdated'] = date
+        if date < self._transient_metadata['datePublished']:
+            self._transient_metadata['datePublished'] = date
+
+    def _parseAuthor(self, strip):
+        try:
+            authorLink = strip \
+                .find('i', {'class': 'icon-user'}) \
+                .findNextSibling('a')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate author link.')
+        match = re.search(u'(8-\d+)', authorLink['onclick'])
+        if not match:
+            raise ParsingError(u'Failed to extract author ID.')
+        authorId = match.group(0)
+        authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
+        authorName = stripHTML(authorLink.text)
+        self.story.setMetadata('authorId', authorId)
+        self.story.setMetadata('authorUrl', authorUrl)
+        self.story.setMetadata('author', authorName)
+
+    def _parseTitle(self, url, document):
+        try:
+            fullTitle = stripHTML(
+                document.find('div', {'class': 'eTitle'}).string)
+        except AttributeError:
+            raise ParsingError(u'Failed to locate title.')
+        parsedHeading = self._parseHeading(fullTitle)
+        if not self.story.getMetadata('title'):
+            self.story.setMetadata('title', parsedHeading['storyTitle'])
+        if 'chapterIndex' in parsedHeading:
+            self._transient_metadata['numChapters'] = max(
+                self._transient_metadata['numChapters'],
+                parsedHeading['chapterIndex'])
+        else:
+            self._transient_metadata['numChapters'] += 1
+        self.chapterUrls.append((parsedHeading['chapterTitle'], url))
+
+    def _parseHeading(self, fullTitle):
+        """Extracts meaningful parts from full chapter heading with.
+        Returns a dictionary containing `storyTitle', `chapterTitle'
+        (including numbering if allowed by settings, may be the same as
+        `storyTitle' for short stories), `chapterIndex' (optional, may be
+        zero), and `partIndex' (optional, chapter part, may be zero).
+        When no dedicated chapter title is present, generates one based on
+        chapter and part indices.  Correctly handles `prologue' and `epilogue'
+        cases."""
+        match = re.search(self.CHAPTER_NUMBER_PATTERN, fullTitle)
+        if match:
+            chapterIndex = int(match.group('chapterIndex'))
+            # There are cases with zero chapter or part number (e. g.:
+            # numbered prologue, not to be confused with just `Prologue').
+            if match.group('partIndex'):
+                partIndex = int(match.group('partIndex'))
+            else:
+                partIndex = None
+            chapterTitle = fullTitle[match.end():].strip()
+            if chapterTitle:
+                if self.getConfig('strip_chapter_numbers', False) \
+                        and not self.getConfig('add_chapter_numbers', False):
+                    if partIndex is not None:
+                        title = u'%d.%d %s' % \
+                                (chapterIndex, partIndex, chapterTitle)
+                    else:
+                        title = u'%d. %s' % (chapterIndex, chapterTitle)
+                else:
+                    title = chapterTitle
+            else:
+                title = u'Глава %d' % chapterIndex
+                if partIndex:
+                    title += u' (часть %d)' % partIndex
+
+            # For seldom found cases like `Story: prologue and chapter 1'.
+            storyTitle = fullTitle[:match.start()]
+            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle)
+            if match:
+                matches = list(
+                    re.finditer(u'[:\.]', storyTitle))
+                if matches:
+                    realStoryTitleEnd = matches[-1].start()
+                    if realStoryTitleEnd >= 0:
+                        storyTitle = storyTitle[:realStoryTitleEnd]
+                    else:
+                        _logger.warning(
+                            u"Title contains `%s', suspected to be part of "
+                            u"numbering, but no period (`.') before it.  "
+                            u"Full title is preserved." % storyTitle)
+
+            result = {
+                'storyTitle': storyTitle,
+                'chapterTitle': title,
+                'chapterIndex': chapterIndex
+            }
+            if partIndex is not None:
+                result['partIndex'] = partIndex
+            return result
+
+        match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, fullTitle)
+        if match:
+            storyTitle = fullTitle[:match.start()]
+            chapterTitle = fullTitle[match.end():].strip()
+            matchedText = fullTitle[match.start():match.end()]
+            if chapterTitle:
+                title = u'%s. %s' % (matchedText, chapterTitle)
+            else:
+                title = matchedText
+            return {
+                'storyTitle': storyTitle,
+                'chapterTitle': title
+            }
+
+        return {
+            'storyTitle': fullTitle,
+            'chapterTitle': fullTitle
+        }
+
+    def _loadDocument(self, url):
+        """Fetches URL content and returns its element tree
+        with parsing settings tuned for MassEffect2.in."""
+        documentId = self._extractDocumentId(url)
+        if documentId in self._parsing_cache:
+            _logger.debug(u"Memory cache HIT for parsed `%s'", url)
+            return self._parsing_cache[documentId]['document']
+        else:
+            _logger.debug(u"Memory cache MISS for parsed `%s'", url)
+            document = bs.BeautifulStoneSoup(
+                self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
+            self._parsing_cache[documentId] = {'document': document}
+            return document
+
+    def _fetchUrl(self, url,
+                  parameters=None,
+                  usecache=True,
+                  extrasleep=None):
+        """Fetches URL contents, see BaseSiteAdapter for details.
+        Overridden to support on-disk cache when debugging Calibre."""
+        from calibre.constants import DEBUG
+        if DEBUG:
+            import os
+            documentId = self._extractDocumentId(url)
+            path = u'./cache/%s' % documentId
+            if os.path.isfile(path) and os.access(path, os.R_OK):
+                _logger.debug(u"On-disk cache HIT for `%s'.", url)
+                with codecs.open(path, encoding='utf-8') as input:
+                    return input.read()
+            else:
+                _logger.debug(u"On-disk cache MISS for `%s'.", url)
+
+        content = BaseSiteAdapter._fetchUrl(
+            self, url, parameters, usecache, extrasleep)
+
+        if DEBUG:
+            import os
+            if os.path.isdir(os.path.dirname(path)):
+                _logger.debug(u"Caching `%s' content on disk.", url)
+                with codecs.open(path, mode='w', encoding='utf-8') as output:
+                    output.write(content)
+
+        return content
+
+    def _extractDocumentId(self, url):
+        """Extracts document ID from MassEffect2.in URL."""
+        match = re.search(self.DOCUMENT_ID_PATTERN, url)
+        if not match:
+            raise ValueError(u"Failed to extract document ID from `'" % url)
+        documentId = url[match.start():match.end()]
+        return documentId
+
+    def _getChapterTextElement(self, url):
+        """Fetches URL content and extracts an element containing text body.
+        Shall be used instead of `__collectTextElements'."""
+        documentId = self._extractDocumentId(url)
+        document = self._loadDocument(url)
+        cache = self._parsing_cache[documentId]
+        if 'body' in cache:
+            return cache['body']
+        else:
+            body = self.__collectTextElements(document)
+            cache['body'] = body
+            return body
+
+    def __collectTextElements(self, document):
+        """Returns all elements containing parts of chapter text (which may be
+        <p>aragraphs, <div>isions or plain text nodes) under a single root."""
+        starter = document.find('div', {'id': u'article'})
+        if starter is None:
+            # FIXME: This will occur if the method is called more than once.
+            # The reason is elements appended to `root' are removed from
+            # the document. BS 4.4 implements cloning via `copy.copy()',
+            # but supporting it for earlier versions is error-prone
+            # (due to relying on BS internals).
+            raise ParsingError(u'Failed to locate text.')
+        collection = [starter]
+        for element in starter.nextSiblingGenerator():
+            if element is None:
+                break
+            if isinstance(element, bs.Tag) and element.name == 'tr':
+                break
+            collection.append(element)
+        root = bs.Tag(document, 'td')
+        for element in collection:
+            root.append(element)
+        return root
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index 0663c12e..fe753705 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1727,6 +1727,16 @@ extraships:InuYasha/Kagome
 ## Site dedicated to these categories/characters/ships
 extracategories:Lord of the Rings
 
+[www.masseffect2.in]
+## Site with stories in this language.
+language:Russian
+## Site dedicated to this fandom.
+extracategories:Mass Effect
+
+## Stories on the site almost never have cover image.
+## May be adjusted in `personal.ini' on per-story basis.
+cover_exclusion_regexp:.*
+
 [www.mediaminer.org]
 
 [www.midnightwhispers.ca]

From aa93d4bb2df983b4268147a3d3f63b4cf0cc1f34 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Wed, 22 Jul 2015 14:31:06 +0300
Subject: [PATCH 02/18] Set site-specific language and category correctly.

---
 calibre-plugin/plugin-defaults.ini           | 11 +++++++++++
 fanficfare/adapters/adapter_masseffect2in.py |  5 +++++
 fanficfare/defaults.ini                      |  5 +++--
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index da2f0782..94cd796e 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1132,6 +1132,17 @@ extracategories:Lord of the Rings
 ## Site dedicated to these categories/characters/ships
 extracategories:Harry Potter
 
+[www.masseffect2.in]
+## Site dedicated to this fandom.
+extracategories: Mass Effect
+
+## Stories on the site almost never have cover image.
+## May be adjusted in `personal.ini' on per-story basis.
+cover_exclusion_regexp:.*
+
+my_custom_label: Some text
+my_custom_setting: true
+
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
 ## program can prompt you, or you can save it in config.  In
diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 23476951..80d5446a 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -99,6 +99,8 @@ class MassEffect2InAdapter(BaseSiteAdapter):
          ''',
         re.IGNORECASE + re.UNICODE + re.VERBOSE)
 
+    SITE_LANGUAGE = u'Russian'
+
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
 
@@ -225,6 +227,9 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             'numWords', str(self._transient_metadata['numWords']))
         self.story.setMetadata(
             'numChapters', self._transient_metadata['numChapters'])
+        # Site-specific metadata.
+        self.story.setMetadata(
+            'language', self.SITE_LANGUAGE)
 
     def getChapterText(self, url):
         """Grabs the text for an individual chapter."""
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index fe753705..231b154d 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1728,8 +1728,6 @@ extraships:InuYasha/Kagome
 extracategories:Lord of the Rings
 
 [www.masseffect2.in]
-## Site with stories in this language.
-language:Russian
 ## Site dedicated to this fandom.
 extracategories:Mass Effect
 
@@ -1737,6 +1735,9 @@ extracategories:Mass Effect
 ## May be adjusted in `personal.ini' on per-story basis.
 cover_exclusion_regexp:.*
 
+my_custom_label:Some text
+my_custom_setting:true
+
 [www.mediaminer.org]
 
 [www.midnightwhispers.ca]

From 6a13323c9294d0b99da9529f725fd29b5341c610 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Wed, 22 Jul 2015 19:33:28 +0300
Subject: [PATCH 03/18] Improved configuration per JimmXinu's suggestion.

https://github.com/PlushBeaver/FanFicFare/commit/707f7a347bfffd80a12e806c255c4b4e24f29dfa#commitcomment-12298782
---
 calibre-plugin/plugin-defaults.ini | 2 +-
 fanficfare/defaults.ini            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index a545e982..1ba2900a 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1228,7 +1228,7 @@ extracategories: Mass Effect
 
 ## Stories on the site almost never have cover image.
 ## May be adjusted in `personal.ini' on per-story basis.
-cover_exclusion_regexp:.*
+never_make_cover: true
 
 my_custom_label: Some text
 my_custom_setting: true
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index 765d0e3d..e35ad4fd 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1829,7 +1829,7 @@ extracategories:Mass Effect
 
 ## Stories on the site almost never have cover image.
 ## May be adjusted in `personal.ini' on per-story basis.
-cover_exclusion_regexp:.*
+never_make_cover: true
 
 my_custom_label:Some text
 my_custom_setting:true

From 79b56c872f1ab21430cd9e5a46dcf6c263e43c8a Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Thu, 23 Jul 2015 02:25:43 +0300
Subject: [PATCH 04/18] Refactor MassEffect2.in adapter and improve it.

Refactoring:
    * New `Chapter' class extracted to separate HTML parsing from
      adapter output preparation and story-wide metadata collection.
    * Lazy parsing and fragment caching are done at the Chapter level.

Improvements:
    * Rating "adultness" and label-to-title mapping made configurable.
    * Fix chapter number detection when title contains large numbers
      (ex.: http://www.masseffect2.in/publ/19-1-0-2934).
    * Add mechanism for detecting series-like stories with no chapter
      numbering and extracting correct titles.
    * Fix number format for generated chapter titles.
---
 calibre-plugin/plugin-defaults.ini           |   7 +-
 fanficfare/adapters/adapter_masseffect2in.py | 957 +++++++++++--------
 fanficfare/configurable.py                   |   2 +
 fanficfare/defaults.ini                      |   9 +-
 4 files changed, 577 insertions(+), 398 deletions(-)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index 1ba2900a..ac27065e 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1230,8 +1230,11 @@ extracategories: Mass Effect
 ## May be adjusted in `personal.ini' on per-story basis.
 never_make_cover: true
 
-my_custom_label: Some text
-my_custom_setting: true
+## Titles for ratings identified by 1- or 2-letter codes from `ERATING system'
+## (`система Р.Е.Й.Т.И.Н.Г.').  MassEffect2.in and some other sites adopted it,
+## but changed titles and update them occasionally.
+rating_titles: R=RESTRICTED (16+), E=EXEMPT (18+), I=ART HOUSE, T=To every, A=IN=Иной мир, Nn=Новый мир, G=О\, Господи!
+adult_ratings: E,R
 
 [merlinfic.dtwins.co.uk]
 ## Some sites require login (or login for some rated stories) The
diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 80d5446a..815ae191 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -43,6 +43,9 @@ class ParsingError(Exception):
         Exception.__init__(self)
         self.message = message
 
+    def __str__(self):
+        return self.message
+
 
 class MassEffect2InAdapter(BaseSiteAdapter):
     """Provides support for masseffect2.in site as story source.
@@ -53,77 +56,21 @@ class MassEffect2InAdapter(BaseSiteAdapter):
            and some affiliated sites."""
 
     WORD_PATTERN = re.compile(u'\w+', re.UNICODE)
-
     DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+')
-
-    # Various `et cetera' and `et al' forms in Russian texts.
-    # Intended to be used with whole strings!
-    ETC_PATTERN = re.compile(
-        u'''[и&]\s(?:
-              (?:т\.?\s?[пд]\.?)|
-              (?:др(?:угие|\.)?)|
-              (?:пр(?:очие|\.)?)|
-              # Note: identically looking letters `K' and `o'
-              # below are from Latin and Cyrillic alphabets.
-              (?:ко(?:мпания)?|[KК][oо°])
-            )$
-        ''',
-        re.IGNORECASE + re.UNICODE + re.VERBOSE)
-
-    CHAPTER_NUMBER_PATTERN = re.compile(
-        u'''[\.:\s]*
-            (?:глава)?  # `Chapter' in Russian.
-            \s
-            (?P<chapterIndex>\d+)
-            (?:
-              (?:
-                # For `X.Y' and `X-Y' numbering styles:
-                [\-\.]|
-                # For `Chapter X (part Y)' and similar numbering styles:
-                [\.,]?\s
-                (?P<brace>\()?
-                (?:часть)?      # `Part' in Russian.
-                \s
-              )
-              (?P<partIndex>\d+)
-              (?(brace)\))
-            )?
-            [\.:\s]*
-         ''',
-        re.IGNORECASE + re.UNICODE + re.VERBOSE)
-
-    PROLOGUE_EPILOGUE_PATTERN = re.compile(
-        u'''[\.:\s]*         # Optional separators.
-            (пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
-            [\.:\s]*         # Optional separators.
-         ''',
-        re.IGNORECASE + re.UNICODE + re.VERBOSE)
-
     SITE_LANGUAGE = u'Russian'
 
     def __init__(self, config, url):
         BaseSiteAdapter.__init__(self, config, url)
 
         self.decode = ["utf8"]
-        self.dateformat = "%d.%m.%Y"
 
         self.story.setMetadata('siteabbrev', 'me2in')
+        self.story.setMetadata('storyId', self._getDocumentId(self.url))
 
-        self.story.setMetadata('storyId', self._extractDocumentId(self.url))
+        self._setURL(self._makeDocumentUrl(self.story.getMetadata('storyId')))
 
-        self._setURL(self._makeUrl(self.story.getMetadata('storyId')))
-
-        self._transient_metadata = {}
-
-        # Memory cache of document HTML parsing results.  Increases performance
-        # drastically, because all downloaded pages are parsed at least twice.
-        # FIXME: Can be simplified when BS is updated to 4.4 with cloning.
-        self._parsing_cache = {}
-
-    @classmethod
-    def _makeUrl(cls, chapterId):
-        """Makes a chapter URL given a chapter ID."""
-        return 'http://%s/publ/%s' % (cls.getSiteDomain(), chapterId)
+        self._chapters = {}
+        self._parsingConfiguration = None
 
     # Must be @staticmethod, not @classmethod!
     @staticmethod
@@ -132,11 +79,11 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
     @classmethod
     def getSiteExampleURLs(cls):
-        return u' '.join([cls._makeUrl('19-1-0-1234'),
-                          cls._makeUrl('24-1-0-4321')])
+        return u' '.join([cls._makeDocumentUrl('19-1-0-1234'),
+                          cls._makeDocumentUrl('24-1-0-4321')])
 
     def getSiteURLPattern(self):
-        return re.escape(self._makeUrl('')) + self.DOCUMENT_ID_PATTERN.pattern
+        return re.escape(self._makeDocumentUrl('')) + self.DOCUMENT_ID_PATTERN.pattern
 
     def use_pagecache(self):
         """Allows use of downloaded page cache.  It is essential for this
@@ -149,345 +96,197 @@ class MassEffect2InAdapter(BaseSiteAdapter):
         chapters, which is not exactly right, but necessary due to technical
         limitations of the site."""
 
-        def followLinks(document, selector):
-            """Downloads chapters one by one by locating and following links
-            specified by a selector.  Returns chapters' URLs in order they
-            were found."""
-            block = document\
-                .find('td', {'class': 'eDetails1'})\
-                .find('div', selector)
-            if not block:
-                return
-            link = block.find('a')
-            if not link:
-                return
-            chapterId = self._extractDocumentId(link['href'])
-            url = self._makeUrl(chapterId)
-            try:
-                chapter = self._loadDocument(url)
-            except urllib2.HTTPError, error:
-                if error.code == 404:
-                    raise exceptions.FailedToDownload(
-                        u'Error downloading chapter: %s!' % url)
-                raise
-            yield url
-            for url in followLinks(chapter, selector):
-                yield url
-
-        def followPreviousLinks(document):
-            """Downloads chapters following `Previous chapter' links.
-            Returns a list of chapters' URLs."""
-            urls = list(followLinks(document, {'class': 'fl tal'}))
-            return list(reversed(urls))
-
-        def followNextLinks(document):
-            """Downloads chapters following `Next chapter' links.
-            Returns a list of chapters' URLs."""
-            return list(followLinks(document, {'class': 'tar fr'}))
+        def followChapters(starting, forward=True):
+            if forward:
+                url = starting.getNextChapterUrl()
+            else:
+                url = starting.getPreviousChapterUrl()
+            if url:
+                url = self._makeDocumentUrl(self._getDocumentId(url))
+                following = self._makeChapter(url)
+                if forward:
+                    yield following
+                for chapter in followChapters(following, forward):
+                    yield chapter
+                if not forward:
+                    yield following
 
         try:
-            document = self._loadDocument(self.url)
+            startingChapter = self._makeChapter(self.url)
         except urllib2.HTTPError, error:
             if error.code == 404:
                 raise exceptions.StoryDoesNotExist(self.url)
             raise
-        # There is no convenient mechanism to obtain URLs of all chapters
-        # other than navigating to previous and next chapters using links
-        # located on each chapter page.
+
+        try:
+            self.story.setMetadata('title', startingChapter.getStoryTitle())
+            self.story.setMetadata('author', startingChapter.getAuthorName())
+            authorId = startingChapter.getAuthorId()
+            authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
+            self.story.setMetadata('authorId', authorId)
+            self.story.setMetadata('authorUrl', authorUrl)
+            self.story.setMetadata('rating', startingChapter.getRatingTitle())
+        except ParsingError, error:
+            raise exceptions.FailedToDownload(
+                u"Failed to parse story metadata for `%s': %s" % (self.url, error))
+
+        # We only have one date for each chapter and assume the oldest one
+        # to be publication date and the most recent one to be update date.
+        datePublished = datetime.datetime.max
+        dateUpdated = datetime.datetime.min
+        wordCount = 0
+        # We aim at counting chapters, not chapter parts.
+        chapterCount = 0
+        storyInProgress = False
+
         chapters = \
-            followPreviousLinks(document) + \
-            [self.url] + \
-            followNextLinks(document)
+            list(followChapters(startingChapter, forward=False)) + \
+            [startingChapter] + \
+            list(followChapters(startingChapter, forward=True))
 
-        # Transient metadata is updated when parsing each chapter,
-        # then converted and saved to story metadata.
-        self._transient_metadata = {
-            # We only have one date for each chapter and assume the oldest one
-            # to be publication date and the most recent one to be update date.
-            'datePublished': datetime.datetime.max,
-            'dateUpdated': datetime.datetime.min,
+        try:
+            for chapter in chapters:
+                url = chapter.getUrl()
+                self._chapters[url] = chapter
+                _logger.debug(u"Processing chapter `%s'.", url)
 
-            'numWords': 0,
+                datePublished = min(datePublished, chapter.getDate())
+                dateUpdated = max(dateUpdated, chapter.getDate())
 
-            # We aim at counting chapters, not chapter parts.
-            'numChapters': 0
-        }
+                self.story.extendList('genre', chapter.getGenres())
+                self.story.extendList('characters', chapter.getCharacters())
 
-        for url in chapters:
-            chapter = self._loadDocument(url)
-            _logger.debug(u"Parsing chapter `%s'", url)
-            self._parseChapterMetadata(url, chapter)
+                wordCount += self._getWordCount(chapter.getTextElement())
 
-        # Attributes are handled separately due to format conversions.
+                index = chapter.getIndex()
+                if index:
+                    chapterCount = max(chapterCount, index)
+                else:
+                    chapterCount += 1
+
+                # Story is in progress if any chapter is in progress.
+                # Some chapters may have no status attribute.
+                chapterInProgress = chapter.isInProgress()
+                if chapterInProgress is not None:
+                    storyInProgress |= chapterInProgress
+
+                # If any chapter is adult, consider the whole story adult.
+                if chapter.isRatingAdult():
+                    self.story.setMetadata('is_adult', True)
+
+            titles = [chapter.getTitle() for chapter in chapters]
+            hasNumbering = any([chapter.getIndex() is not None for chapter in chapters])
+            if not hasNumbering:
+                # Some stories have no chapter numbering, yet every heading starts
+                # with the same story title (such stories are not series).  Use the
+                # common prefix as the story title and cut it from chapter titles.
+                # NB: flags must be passed by keyword; 4th positional arg is `count'.
+                largestCommonPrefix = _getLargestCommonPrefix(*titles)
+                prefixLength = len(largestCommonPrefix)
+                storyTitle = re.sub(u'[:\.\s]*$', u'', largestCommonPrefix, flags=re.UNICODE)
+                self.story.setMetadata('title', storyTitle)
+                for chapter in chapters:
+                    self.chapterUrls.append(
+                        (chapter.getTitle()[prefixLength:], chapter.getUrl()))
+            else:
+                # Simple processing for common cases.
+                for chapter in chapters:
+                    self.chapterUrls.append(
+                        (chapter.getTitle(), chapter.getUrl()))
+
+        except ParsingError, error:
+                raise exceptions.FailedToDownload(
+                    u"Failed to download chapter `%s': %s" % (url, error))
+
+        # Some metadata are handled separately due to format conversions.
         self.story.setMetadata(
-            'datePublished', self._transient_metadata['datePublished'])
-        self.story.setMetadata(
-            'dateUpdated', self._transient_metadata['dateUpdated'])
-        self.story.setMetadata(
-            'numWords', str(self._transient_metadata['numWords']))
-        self.story.setMetadata(
-            'numChapters', self._transient_metadata['numChapters'])
+            'status', 'In Progress' if storyInProgress else 'Completed')
+        self.story.setMetadata('datePublished', datePublished)
+        self.story.setMetadata('dateUpdated', dateUpdated)
+        self.story.setMetadata('numWords', str(wordCount))
+        self.story.setMetadata('numChapters', chapterCount)
+
         # Site-specific metadata.
-        self.story.setMetadata(
-            'language', self.SITE_LANGUAGE)
+        self.story.setMetadata('language', self.SITE_LANGUAGE)
 
     def getChapterText(self, url):
         """Grabs the text for an individual chapter."""
-        element = self._getChapterTextElement(url)
-        return self.utf8FromSoup(url, element)
+        if url not in self._chapters:
+            raise exceptions.FailedToDownload(u"No chapter `%s' present!" % url)
+        chapter = self._chapters[url]
+        return self.utf8FromSoup(url, chapter.getTextElement())
 
-    def _parseChapterMetadata(self, url, document):
-        try:
-            self._parseTitle(url, document)
-            infoBar = document.find('td', {'class': 'eDetails2'})
-            if not infoBar:
-                raise ParsingError(u'No informational bar found.')
-            if not self.story.getMetadata('authorId'):
-                self._parseAuthor(infoBar)
-            self._parseDates(infoBar)
-            self._parseTextForWordCount(url)
-            self._parseAttributes(document)
-        except ParsingError, error:
-            raise exceptions.FailedToDownload(
-                u"Error parsing `%s'.  %s" % (url, error.message))
+    def _makeChapter(self, url):
+        """Creates a chapter object given a URL."""
+        document = self._loadDocument(url)
+        chapter = Chapter(self._getParsingConfiguration(), url, document)
+        return chapter
 
-    def _parseAttributes(self, document):
-        try:
-            elements = document \
-                .find('div', {'class': 'comm-div'}) \
-                .findNextSibling('div', {'class': 'cb'}) \
-                .nextGenerator()
-            attributesText = u''
-            for element in elements:
-                if not element:
-                    _logger.warning(u'Attribute block not terminated!')
-                    break
-                if isinstance(element, bs.Tag):
-                    # Although deprecated, `has_key()' is required here.
-                    if element.name == 'div' and \
-                            element.has_key('class') and \
-                            element['class'] == 'cb':
-                        break
-                    elif element.name == 'img':
-                        self._parseRatingFromImage(element)
-                else:
-                    attributesText += stripHTML(element)
-        except AttributeError or TypeError:
-            raise ParsingError(u'Failed to locate and collect attributes.')
-
-        for record in re.split(u';|\.', attributesText):
-            parts = record.split(u':', 1)
-            if len(parts) < 2:
-                continue
-            key = parts[0].strip().lower()
-            value = parts[1].strip().strip(u'.')
-            self._parseAttribute(key, value)
-
-    def _parseRatingFromImage(self, element):
-        """Given an image element, tries to parse story rating from it."""
-        # FIXME: This should probably be made adjustable via settings.
-        ratings = {
-            'E': u'Exempt (18+)',
-            'R': u'Restricted (16+)',
-            'A': u'Иная история',
-            'T': u'To every',
-            'I': u'Art house',
-            'Nn': u'Новый мир',
-            'G': u'О, господи!',
-        }
-        ratings['IN'] = ratings['A']
-
-        # Although deprecated, `has_key()' is required here.
-        if not element.has_key('src'):
-            return
-        source = element['src']
-        if 'REITiNG' not in source:
-            return
-        match = re.search(u'/(?P<rating>[ERATINnG]+)\.png$', source)
-        if not match:
-            return
-        symbol = match.group('rating')
-        if symbol == 'IN':
-            symbol = 'A'
-        if symbol in ratings:
-            rating = ratings[symbol]
-            self.story.setMetadata('rating', rating)
-            if symbol in ('R', 'E'):
-                self.is_adult = True
-
-    def _parseAttribute(self, key, value):
-        """Parses a single known attribute value for chapter metadata."""
-
-        def refineCharacter(name):
-            """Refines character name from stop-words and distortions."""
-            strippedName = name.strip()
-            nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName)
-            # TODO: extract canonical name (even ME-specific?).
-            canonicalName = nameOnly
-            return canonicalName
-
-        if key == u'жанр':
-            definitions = value.split(u',')
-            if len(definitions) > 4:
-                _logger.warning(u'Possibly incorrect genre detection!')
-            for definition in definitions:
-                genres = definition.split(u'/')
-                self.story.extendList('genre', genres)
-        elif key == u'статус':
-            status = 'In-Progress' if value == u'в процессе' else 'Completed'
-            self.story.setMetadata('status', status)
-        elif key == u'персонажи':
-            characters = [refineCharacter(name) for name in value.split(u',')]
-            self.story.extendList('characters', characters)
-        else:
-            _logger.debug(u"Unrecognized attribute `%s'.", key)
-
-    def _parseTextForWordCount(self, url):
-        element = self._getChapterTextElement(url)
+    def _getWordCount(self, element):
+        """Returns word count in plain text extracted from chapter body."""
         text = stripHTML(element)
         count = len(re.findall(self.WORD_PATTERN, text))
-        self._transient_metadata['numWords'] += count
-        pass
+        return count
 
-    def _parseDates(self, infoBar):
-        try:
-            dateText = infoBar \
-                .find('i', {'class': 'icon-eye'}) \
-                .findPreviousSibling(text=True) \
-                .strip(u'| \n')
-        except AttributeError:
-            raise ParsingError(u'Failed to locate date.')
-        date = makeDate(dateText, self.dateformat)
-        if date > self._transient_metadata['dateUpdated']:
-            self._transient_metadata['dateUpdated'] = date
-        if date < self._transient_metadata['datePublished']:
-            self._transient_metadata['datePublished'] = date
+    def _getParsingConfiguration(self):
+        if not self._parsingConfiguration:
+            self._parsingConfiguration = {}
 
-    def _parseAuthor(self, strip):
-        try:
-            authorLink = strip \
-                .find('i', {'class': 'icon-user'}) \
-                .findNextSibling('a')
-        except AttributeError:
-            raise ParsingError(u'Failed to locate author link.')
-        match = re.search(u'(8-\d+)', authorLink['onclick'])
-        if not match:
-            raise ParsingError(u'Failed to extract author ID.')
-        authorId = match.group(0)
-        authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
-        authorName = stripHTML(authorLink.text)
-        self.story.setMetadata('authorId', authorId)
-        self.story.setMetadata('authorUrl', authorUrl)
-        self.story.setMetadata('author', authorName)
+            adultRatings = self.getConfigList('adult_ratings')
+            if not adultRatings:
+                raise exceptions.PersonalIniFailed(
+                    u"Missing `adult_ratings' setting", u"MassEffect2.in", u"?")
+            adultRatings = set(adultRatings)
+            self._parsingConfiguration['adultRatings'] = adultRatings
 
-    def _parseTitle(self, url, document):
-        try:
-            fullTitle = stripHTML(
-                document.find('div', {'class': 'eTitle'}).string)
-        except AttributeError:
-            raise ParsingError(u'Failed to locate title.')
-        parsedHeading = self._parseHeading(fullTitle)
-        if not self.story.getMetadata('title'):
-            self.story.setMetadata('title', parsedHeading['storyTitle'])
-        if 'chapterIndex' in parsedHeading:
-            self._transient_metadata['numChapters'] = max(
-                self._transient_metadata['numChapters'],
-                parsedHeading['chapterIndex'])
-        else:
-            self._transient_metadata['numChapters'] += 1
-        self.chapterUrls.append((parsedHeading['chapterTitle'], url))
-
-    def _parseHeading(self, fullTitle):
-        """Extracts meaningful parts from full chapter heading with.
-        Returns a dictionary containing `storyTitle', `chapterTitle'
-        (including numbering if allowed by settings, may be the same as
-        `storyTitle' for short stories), `chapterIndex' (optional, may be
-        zero), and `partIndex' (optional, chapter part, may be zero).
-        When no dedicated chapter title is present, generates one based on
-        chapter and part indices.  Correctly handles `prologue' and `epilogue'
-        cases."""
-        match = re.search(self.CHAPTER_NUMBER_PATTERN, fullTitle)
-        if match:
-            chapterIndex = int(match.group('chapterIndex'))
-            # There are cases with zero chapter or part number (e. g.:
-            # numbered prologue, not to be confused with just `Prologue').
-            if match.group('partIndex'):
-                partIndex = int(match.group('partIndex'))
-            else:
-                partIndex = None
-            chapterTitle = fullTitle[match.end():].strip()
-            if chapterTitle:
-                if self.getConfig('strip_chapter_numbers', False) \
-                        and not self.getConfig('add_chapter_numbers', False):
-                    if partIndex is not None:
-                        title = u'%d.%d %s' % \
-                                (chapterIndex, partIndex, chapterTitle)
-                    else:
-                        title = u'%d. %s' % (chapterIndex, chapterTitle)
-                else:
-                    title = chapterTitle
-            else:
-                title = u'Глава %d' % chapterIndex
-                if partIndex:
-                    title += u' (часть %d)' % partIndex
-
-            # For seldom found cases like `Story: prologue and chapter 1'.
-            storyTitle = fullTitle[:match.start()]
-            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle)
-            if match:
-                matches = list(
-                    re.finditer(u'[:\.]', storyTitle))
-                if matches:
-                    realStoryTitleEnd = matches[-1].start()
-                    if realStoryTitleEnd >= 0:
-                        storyTitle = storyTitle[:realStoryTitleEnd]
-                    else:
+            ratingTitleDescriptions = self.getConfigList('rating_titles')
+            if ratingTitleDescriptions:
+                ratingTitles = {}
+                for ratingDescription in ratingTitleDescriptions:
+                    parts = ratingDescription.split(u'=')
+                    if len(parts) < 2:
                         _logger.warning(
-                            u"Title contains `%s', suspected to be part of "
-                            u"numbering, but no period (`.') before it.  "
-                            u"Full title is preserved." % storyTitle)
-
-            result = {
-                'storyTitle': storyTitle,
-                'chapterTitle': title,
-                'chapterIndex': chapterIndex
-            }
-            if partIndex is not None:
-                result['partIndex'] = partIndex
-            return result
-
-        match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, fullTitle)
-        if match:
-            storyTitle = fullTitle[:match.start()]
-            chapterTitle = fullTitle[match.end():].strip()
-            matchedText = fullTitle[match.start():match.end()]
-            if chapterTitle:
-                title = u'%s. %s' % (matchedText, chapterTitle)
+                            u"Invalid `rating_titles' setting, missing `=' in `%s'."
+                            % ratingDescription)
+                        continue
+                    labels = parts[:-1]
+                    title = parts[-1]
+                    for label in labels:
+                        ratingTitles[label] = title
+                        # Aliases of an adult label are adult too (set.add() takes one item).
+                        if label in adultRatings:
+                            adultRatings.update(labels)
+                self._parsingConfiguration['adultRatings'] = list(adultRatings)
+                self._parsingConfiguration['ratingTitles'] = ratingTitles
             else:
-                title = matchedText
-            return {
-                'storyTitle': storyTitle,
-                'chapterTitle': title
-            }
+                raise exceptions.PersonalIniFailed(
+                    u"Missing `rating_titles' setting", u"MassEffect2.in", u"?")
 
-        return {
-            'storyTitle': fullTitle,
-            'chapterTitle': fullTitle
-        }
+            self._parsingConfiguration['needsChapterNumbering'] = \
+                self.getConfig('strip_chapter_numbers', False) \
+                and not self.getConfig('add_chapter_numbers', False)
+
+
+        return self._parsingConfiguration
+
+    def _getDocumentId(self, url):
+        """Extracts document ID from MassEffect2.in URL (ValueError if none)."""
+        match = re.search(self.DOCUMENT_ID_PATTERN, url)
+        if not match:
+            raise ValueError(u"Failed to extract document ID from `%s'" % url)
+        documentId = url[match.start():match.end()]
+        return documentId
+
+    @classmethod
+    def _makeDocumentUrl(cls, documentId):
+        """Makes a document (chapter) URL given a document ID."""
+        return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId)
 
     def _loadDocument(self, url):
         """Fetches URL content and returns its element tree
         with parsing settings tuned for MassEffect2.in."""
-        documentId = self._extractDocumentId(url)
-        if documentId in self._parsing_cache:
-            _logger.debug(u"Memory cache HIT for parsed `%s'", url)
-            return self._parsing_cache[documentId]['document']
-        else:
-            _logger.debug(u"Memory cache MISS for parsed `%s'", url)
-            document = bs.BeautifulStoneSoup(
-                self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
-            self._parsing_cache[documentId] = {'document': document}
-            return document
+        return bs.BeautifulStoneSoup(
+            self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
 
     def _fetchUrl(self, url,
                   parameters=None,
@@ -498,7 +297,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
         from calibre.constants import DEBUG
         if DEBUG:
             import os
-            documentId = self._extractDocumentId(url)
+            documentId = self._getDocumentId(url)
             path = u'./cache/%s' % documentId
             if os.path.isfile(path) and os.access(path, os.R_OK):
                 _logger.debug(u"On-disk cache HIT for `%s'.", url)
@@ -519,31 +318,380 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
         return content
 
-    def _extractDocumentId(self, url):
-        """Extracts document ID from MassEffect2.in URL."""
-        match = re.search(self.DOCUMENT_ID_PATTERN, url)
-        if not match:
-            raise ValueError(u"Failed to extract document ID from `'" % url)
-        documentId = url[match.start():match.end()]
-        return documentId
 
-    def _getChapterTextElement(self, url):
-        """Fetches URL content and extracts an element containing text body.
-        Shall be used instead of `__collectTextElements'."""
-        documentId = self._extractDocumentId(url)
-        document = self._loadDocument(url)
-        cache = self._parsing_cache[documentId]
-        if 'body' in cache:
-            return cache['body']
+class Chapter(object):
+    """Represents a lazily-parsed chapter of a story."""
+    def __init__(self, configuration, url, document):
+        self._configuration = configuration
+        self._url = url
+        self._document = document
+        # Lazy-loaded:
+        self._parsedHeading = None
+        self._date = None
+        self._author = None
+        self._attributes = None
+        self._textElement = None
+        self._infoBar = None
+
+    def getIndex(self):
+        parsedHeading = self._getHeading()
+        if 'chapterIndex' in parsedHeading:
+            return parsedHeading['chapterIndex']
+
+    def getPartIndex(self):
+        parsedHeading = self._getHeading()
+        if 'partIndex' in parsedHeading:
+            return parsedHeading['partIndex']
+
+    def getStoryTitle(self):
+        return self._getHeading()['storyTitle']
+
+    def getTitle(self):
+        return self._getHeading()['chapterTitle']
+
+    def getAuthorId(self):
+        return self._getAuthor()['id']
+
+    def getAuthorName(self):
+        return self._getAuthor()['name']
+
+    def getDate(self):
+        return self._getDate()
+
+    def getRatingTitle(self):
+        return self._getAttributes()['rating']['title']
+
+    def isRatingAdult(self):
+        return self._getAttributes()['rating']['isAdult']
+
+    def getCharacters(self):
+        attributes = self._getAttributes()
+        if 'characters' in attributes:
+            return attributes['characters']
+        return []
+
+    def getGenres(self):
+        attributes = self._getAttributes()
+        if 'genres' in attributes:
+            return attributes['genres']
+        return []
+
+    def isInProgress(self):
+        attributes = self._getAttributes()
+        if 'isInProgress' in attributes:
+            return attributes['isInProgress']
+
+    def getUrl(self):
+        return self._url
+
+    def getTextElement(self):
+        return self._getTextElement()
+
+    def getPreviousChapterUrl(self):
+        """Downloads chapters following `Previous chapter' links.
+        Returns a list of chapters' URLs."""
+        return self._getSiblingChapterUrl({'class': 'fl tal'})
+
+    def getNextChapterUrl(self):
+        """Downloads chapters following `Next chapter' links.
+        Returns a list of chapters' URLs."""
+        return self._getSiblingChapterUrl({'class': 'tar fr'})
+
         else:
-            body = self.__collectTextElements(document)
-            cache['body'] = body
-            return body
+            return storyTitle != thisStoryTitle
 
-    def __collectTextElements(self, document):
+    CHAPTER_NUMBER_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?:глава)?  # `Chapter' in Russian.
+            \s
+            (?:(?P<chapterIndex>\d{1,3})(?=\D|$))
+            (?:
+              (?:
+                # For `X.Y' and `X-Y' numbering styles:
+                [\-\.]|
+                # For `Chapter X (part Y)' and similar numbering styles:
+                [\.,]?\s
+                (?P<brace>\()?
+                (?:часть)?      # `Part' in Russian.
+                \s
+              )
+              (?P<partIndex>\d{1,3})
+              (?(brace)\))
+            )?
+            [\.:\s]*
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    PROLOGUE_EPILOGUE_PATTERN = re.compile(
+        u'''[\.:\s]*         # Optional separators.
+            (пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
+            [\.:\s]*         # Optional separators.
+         ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    def _getHeading(self):
+        if not self._parsedHeading:
+            self._parsedHeading = self._parseHeading()
+        return self._parsedHeading
+
+    def _parseHeading(self):
+        """Extracts meaningful parts from full chapter heading with.
+        Returns a dictionary containing `storyTitle', `chapterTitle'
+        (including numbering if allowed by settings, may be the same as
+        `storyTitle' for short stories), `chapterIndex' (optional, may be
+        zero), and `partIndex' (optional, chapter part, may be zero).
+        When no dedicated chapter title is present, generates one based on
+        chapter and part indices.  Correctly handles `prologue' and `epilogue'
+        cases."""
+        try:
+            heading = stripHTML(
+                self._document.find('div', {'class': 'eTitle'}).string)
+        except AttributeError:
+            raise ParsingError(u'Failed to locate title.')
+
+        match = re.search(self.CHAPTER_NUMBER_PATTERN, heading)
+        if match:
+            chapterIndex = int(match.group('chapterIndex'))
+            # There are cases with zero chapter or part number (e. g.:
+            # numbered prologue, not to be confused with just `Prologue').
+            if match.group('partIndex'):
+                partIndex = int(match.group('partIndex'))
+            else:
+                partIndex = None
+            chapterTitle = heading[match.end():].strip()
+            if chapterTitle:
+                if self._configuration['needsChapterNumbering']:
+                    if partIndex is not None:
+                        title = u'%d.%d. %s' % \
+                                (chapterIndex, partIndex, chapterTitle)
+                    else:
+                        title = u'%d. %s' % (chapterIndex, chapterTitle)
+                else:
+                    title = chapterTitle
+            else:
+                title = u'Глава %d' % chapterIndex
+                if partIndex:
+                    title += u' (часть %d)' % partIndex
+
+            # For seldom found cases like `Story: prologue and chapter 1'.
+            storyTitle = heading[:match.start()]
+            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle)
+            if match:
+                matches = list(
+                    re.finditer(u'[:\.]', storyTitle))
+                if matches:
+                    realStoryTitleEnd = matches[-1].start()
+                    if realStoryTitleEnd >= 0:
+                        storyTitle = storyTitle[:realStoryTitleEnd]
+                    else:
+                        _logger.warning(
+                            u"Title contains `%s', suspected to be part of "
+                            u"numbering, but no period (`.') before it.  "
+                            u"Full title is preserved." % storyTitle)
+
+            self._parsedHeading = {
+                'storyTitle': unicode(storyTitle),
+                'chapterTitle': unicode(title),
+                'chapterIndex': chapterIndex
+            }
+            if partIndex is not None:
+                self._parsedHeading['partIndex'] = partIndex
+            return self._parsedHeading
+
+        match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading)
+        if match:
+            storyTitle = heading[:match.start()]
+            chapterTitle = heading[match.end():].strip()
+            matchedText = heading[match.start():match.end()]
+            if chapterTitle:
+                title = u'%s. %s' % (matchedText, chapterTitle)
+            else:
+                title = matchedText
+            self._parsedHeading = {
+                'storyTitle': unicode(storyTitle),
+                'chapterTitle': unicode(title)
+            }
+            return self._parsedHeading
+
+        self._parsedHeading = {
+            'storyTitle': unicode(heading),
+            'chapterTitle': unicode(heading)
+        }
+        return self._parsedHeading
+
+    def _getAuthor(self):
+        if not self._author:
+            self._author = self._parseAuthor()
+        return self._author
+
+    def _parseAuthor(self):
+        try:
+            authorLink = self._getInfoBarElement() \
+                .find('i', {'class': 'icon-user'}) \
+                .findNextSibling('a')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate author link.')
+        match = re.search(u'(8-\d+)', authorLink['onclick'])
+        if not match:
+            raise ParsingError(u'Failed to extract author ID.')
+        authorId = match.group(0)
+        authorName = stripHTML(authorLink.text)
+        return {
+            'id': authorId,
+            'name': authorName
+        }
+
+    def _getDate(self):
+        if not self._date:
+            self._date = self._parseDate()
+        return self._date
+
+    def _parseDate(self):
+        try:
+            dateText = self._getInfoBarElement() \
+                .find('i', {'class': 'icon-eye'}) \
+                .findPreviousSibling(text=True) \
+                .strip(u'| \n')
+        except AttributeError:
+            raise ParsingError(u'Failed to locate date.')
+        date = makeDate(dateText, '%d.%m.%Y')
+        return date
+
+    def _getInfoBarElement(self):
+        if not self._infoBar:
+            self._infoBar = self._document.find('td', {'class': 'eDetails2'})
+            if not self._infoBar:
+                raise ParsingError(u'No informational bar found.')
+        return self._infoBar
+
+    def _getAttributes(self):
+        if not self._attributes:
+            self._attributes = self._parseAttributes()
+        return self._attributes
+
+    def _parseAttributes(self):
+        """Returns a dictionary of chapter attributes; raises ParsingError if they (or rating) are not found."""
+        attributes = {}
+        try:
+            elements = self._document \
+                .find('div', {'class': 'comm-div'}) \
+                .findNextSibling('div', {'class': 'cb'}).nextGenerator()
+            attributesText = u''
+            for element in elements:
+                if not element:
+                    _logger.warning(u'Attribute block not terminated!')
+                    break
+                if isinstance(element, bs.Tag):
+                    # Although deprecated, `has_key()' is required here.
+                    if element.name == 'div' and \
+                            element.has_key('class') and \
+                            element['class'] == 'cb':
+                        break
+                    elif element.name == 'img':
+                        rating = self._parseRatingFromImage(element)
+                        if rating:
+                            attributes['rating'] = rating
+                else:
+                    attributesText += stripHTML(element)
+        except (AttributeError, TypeError):
+            raise ParsingError(u'Failed to locate and collect attributes.')
+
+        for record in re.split(u';|\.', attributesText):
+            parts = record.split(u':', 1)
+            if len(parts) < 2:
+                continue
+            key = parts[0].strip().lower()
+            value = parts[1].strip().strip(u'.')
+            parsed = self._parseAttribute(key, value)
+            if parsed:
+                attributes[parsed[0]] = parsed[1]
+
+        if 'rating' not in attributes:
+            raise ParsingError(u'Failed to locate or recognize rating!')
+
+        return attributes
+
+    RATING_LABEL_PATTERN = re.compile(u'/(?P<rating>[ERATINnG]+)\.png$')
+
+    def _parseRatingFromImage(self, element):
+        """Given an image element, tries to parse story rating from it."""
+        # Although deprecated, `has_key()' is required here.
+        if not element.has_key('src'):
+            return
+        source = element['src']
+        if 'REITiNG' in source:
+            match = re.search(self.RATING_LABEL_PATTERN, source)
+            if not match:
+                return
+            label = match.group('rating')
+            if label in self._configuration['ratingTitles']:
+                return {
+                    'label': label,
+                    'title': self._configuration['ratingTitles'][label],
+                    'isAdult': label in self._configuration['adultRatings']
+                }
+            else:
+                _logger.warning(u"No title found for rating label `%s'!" % label)
+        # FIXME: It seems, rating has to be optional due to such URLs.
+        elif source == 'http://www.masseffect2.in/_fr/10/1360399.png':
+            label = 'Nn'
+            return {
+                'label': 'Nn',
+                'title': self._configuration['ratingTitles'][label],
+                'isAdult': label in self._configuration['adultRatings']
+            }
+
+    # Various `et cetera' and `et al' forms in Russian texts.
+    # Intended to be used with whole strings!
+    ETC_PATTERN = re.compile(
+        u'''[и&]\s(?:
+              (?:т\.?\s?[пд]\.?)|
+              (?:др(?:угие|\.)?)|
+              (?:пр(?:очие|\.)?)|
+              # Note: identically looking letters `K' and `o'
+              # below are from Latin and Cyrillic alphabets.
+              (?:ко(?:мпания)?|[KК][oо°])
+            )$
+        ''',
+        re.IGNORECASE + re.UNICODE + re.VERBOSE)
+
+    def _parseAttribute(self, key, value):
+        """Parses a single known attribute value for chapter metadata."""
+
+        def refineCharacter(name):
+            """Refines character name from stop-words and distortions."""
+            strippedName = name.strip()
+            nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName)
+            # TODO: extract canonical name (even ME-specific?).
+            canonicalName = nameOnly
+            return canonicalName
+
+        if re.match(u'жанры?', key, re.UNICODE):
+            definitions = value.split(u',')
+            if len(definitions) > 4:
+                _logger.warning(u'Possibly incorrect genre detection!')
+            genres = []
+            for definition in definitions:
+                genres += definition.split(u'/')
+            return 'genres', genres
+        elif key == u'статус':
+            isInProgress = value == u'в процессе'
+            return 'isInProgress', isInProgress
+        elif key == u'персонажи':
+            characters = [refineCharacter(name) for name in value.split(u',')]
+            return 'characters', characters
+        else:
+            _logger.debug(u"Unrecognized attribute `%s' ignored.", key)
+
+    def _getTextElement(self):
+        if not self._textElement:
+            self._textElement = self.__collectTextElements()
+        return self._textElement
+
+    def __collectTextElements(self):
         """Returns all elements containing parts of chapter text (which may be
         <p>aragraphs, <div>isions or plain text nodes) under a single root."""
-        starter = document.find('div', {'id': u'article'})
+        starter = self._document.find('div', {'id': u'article'})
         if starter is None:
             # FIXME: This will occur if the method is called more than once.
             # The reason is elements appended to `root' are removed from
@@ -558,7 +706,30 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             if isinstance(element, bs.Tag) and element.name == 'tr':
                 break
             collection.append(element)
-        root = bs.Tag(document, 'td')
+        root = bs.Tag(self._document, 'td')
         for element in collection:
             root.append(element)
         return root
+
+    def _getSiblingChapterUrl(self, selector):
+        """Locates a sibling chapter link: looks up a `div' matching the
+        selector inside `td.eDetails1', then the first `a' within it.
+        Returns the link's `href', or None if the `div' or link is absent."""
+        block = self._document\
+            .find('td', {'class': 'eDetails1'})\
+            .find('div', selector)
+        if not block:
+            return
+        link = block.find('a')
+        if not link:
+            return
+        return link['href']
+
+
+def _getLargestCommonPrefix(*args):
+    """Returns largest common prefix of all unicode(!) arguments.
+    :rtype : unicode
+    """
+    from itertools import takewhile, izip
+    allSame = lambda xs: len(set(xs)) == 1
+    return u''.join([i[0] for i in takewhile(allSame, izip(*args))])
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 97fc399d..5dd2a561 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -205,6 +205,7 @@ def get_valid_keywords():
     return list(['(in|ex)clude_metadata_(pre|post)',
                  'add_chapter_numbers',
                  'add_genre_when_multi_category',
+                 'adult_ratings',
                  'allow_unsafe_filename',
                  'always_overwrite',
                  'anthology_tags',
@@ -285,6 +286,7 @@ def get_valid_keywords():
                  'output_filename_safepattern',
                  'password',
                  'post_process_cmd',
+                 'rating_titles',
                  'remove_transparency',
                  'replace_br_with_p',
                  'replace_hr',
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index e35ad4fd..6dbb44ba 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1825,14 +1825,17 @@ extracategories:Lord of the Rings
 
 [www.masseffect2.in]
 ## Site dedicated to this fandom.
-extracategories:Mass Effect
+extracategories: Mass Effect
 
 ## Stories on the site almost never have cover image.
 ## May be adjusted in `personal.ini' on per-story basis.
 never_make_cover: true
 
-my_custom_label:Some text
-my_custom_setting:true
+## Titles for ratings identified by 1- or 2-letter codes from `ERATING system'
+## (`система Р.Е.Й.Т.И.Н.Г.').  MassEffect2.in and some other sites adopted it,
+## but changed titles and update them occasionally.
+rating_titles: R=RESTRICTED (16+), E=EXEMPT (18+), I=ART HOUSE, T=To every, A=IN=Иной мир, Nn=Новый мир, G=О\, Господи!
+adult_ratings: E,R
 
 [www.mediaminer.org]
 

From a8ce9d5711378a538c1f50af75ce0f85b64f0ba2 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Thu, 23 Jul 2015 02:44:50 +0300
Subject: [PATCH 05/18] Add editor signature removal capability.

All chapters have an editor signature at the end.  Users wishing to remove
it can enable the `exclude_editor_signature' option in `personal.ini'.
---
 calibre-plugin/plugin-defaults.ini           |  3 +++
 fanficfare/adapters/adapter_masseffect2in.py | 24 ++++++++++++++++++++
 fanficfare/configurable.py                   |  1 +
 fanficfare/defaults.ini                      |  3 +++
 4 files changed, 31 insertions(+)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index ac27065e..a68a5d99 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1226,6 +1226,9 @@ extracategories:Harry Potter
 ## Site dedicated to this fandom.
 extracategories: Mass Effect
 
+## Whether to exclude editor signature from the bottom of chapter text.
+exclude_editor_signature: false
+
 ## Stories on the site almost never have cover image.
 ## May be adjusted in `personal.ini' on per-story basis.
 never_make_cover: true
diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 815ae191..dd8444bd 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -266,6 +266,8 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 self.getConfig('strip_chapter_numbers', False) \
                 and not self.getConfig('add_chapter_numbers', False)
 
+            self._parsingConfiguration['excludeEditorSignature'] = \
+                self.getConfig('exclude_editor_signature', False)
 
         return self._parsingConfiguration
 
@@ -709,6 +711,10 @@ class Chapter(object):
         root = bs.Tag(self._document, 'td')
         for element in collection:
             root.append(element)
+
+        if self._configuration['excludeEditorSignature']:
+            root = self._excludeEditorSignature(root)
+
         return root
 
     def _getSiblingChapterUrl(self, selector):
@@ -725,6 +731,24 @@ class Chapter(object):
             return
         return link['href']
 
+    SIGNED_PATTERN = re.compile(u'отредактирова(?:но|ла?)[:.\s]', re.IGNORECASE + re.UNICODE)
+
+    def _excludeEditorSignature(self, root):
+        for textNode in root.findAll(text=True):
+            if re.match(self.SIGNED_PATTERN, textNode.string):
+                editorLink = textNode.findNext('a')
+                if editorLink:
+                    editorLink.extract()
+                # Seldom editor link has inner formatting, which is sibling DOM-wise.
+                editorName = textNode.findNext('i')
+                if editorName:
+                    editorName.extract()
+                textNode.extract()
+                # We could try removing container element, but there is a risk
+                # of removing text ending with it.  Better play safe here.
+                break
+        return root
+
 
 def _getLargestCommonPrefix(*args):
     """Returns largest common prefix of all unicode(!) arguments.
diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py
index 5dd2a561..61b306df 100644
--- a/fanficfare/configurable.py
+++ b/fanficfare/configurable.py
@@ -235,6 +235,7 @@ def get_valid_keywords():
                  'description_limit',
                  'do_update_hook',
                  'exclude_notes',
+                 'exclude_editor_signature',
                  'extra_logpage_entries',
                  'extra_subject_tags',
                  'extra_titlepage_entries',
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index 6dbb44ba..d9069cf8 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1827,6 +1827,9 @@ extracategories:Lord of the Rings
 ## Site dedicated to this fandom.
 extracategories: Mass Effect
 
+## Whether to exclude editor signature from the bottom of chapter text.
+exclude_editor_signature: false
+
 ## Stories on the site almost never have cover image.
 ## May be adjusted in `personal.ini' on per-story basis.
 never_make_cover: true

From d809ac9f6c9a64e6d4effc43fbe23265ca6e848b Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Thu, 23 Jul 2015 02:47:49 +0300
Subject: [PATCH 06/18] Prevent capturing chapters from related stories.

When a prequel or sequel is posted, editors sometimes make `Next/Previous
chapter' links between them.  Since these links are the only mechanism for
chapter detection, an additional title check was added.  It is heuristic and
may need improvement one day.
---
 fanficfare/adapters/adapter_masseffect2in.py | 24 ++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index dd8444bd..73047abd 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -104,6 +104,10 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             if url:
                 url = self._makeDocumentUrl(self._getDocumentId(url))
                 following = self._makeChapter(url)
+                # Do not follow links to related, but different stories (prequels or sequels).
+                startingStoryTitle = self.story.getMetadata('title')
+                if not following.isFromStory(startingStoryTitle):
+                    return
                 if forward:
                     yield following
                 for chapter in followChapters(following, forward):
@@ -399,6 +403,26 @@ class Chapter(object):
         Returns a list of chapters' URLs."""
         return self._getSiblingChapterUrl({'class': 'tar fr'})
 
+    def isFromStory(self, storyTitle, prefixThreshold=-1):
+        """Checks if this chapter belongs to the story with the given title.
+        Titles match if their common prefix is at least `prefixThreshold' long:
+        negative value uses the shorter first word's length, positive value is capped
+        by title lengths.  NOTE(review): zero yields `storyTitle != thisStoryTitle'."""
+
+        def getFirstWord(string):
+            match = re.search(u'^\s*\w+', string, re.UNICODE)
+            return match.group(0) if match else string  # No leading word: use whole title.
+
+        thisStoryTitle = self.getStoryTitle()
+        if prefixThreshold != 0:
+            if prefixThreshold < 0:
+                prefixThreshold = min(
+                    len(getFirstWord(storyTitle)), len(getFirstWord(thisStoryTitle)))
+            else:
+                prefixThreshold = min(
+                    prefixThreshold, len(storyTitle), len(thisStoryTitle))
+            result = len(_getLargestCommonPrefix(storyTitle, thisStoryTitle)) >= prefixThreshold
+            return result
         else:
             return storyTitle != thisStoryTitle
 

From d91d4b8c3c430c1235a07abcf9837f47fa68b926 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Fri, 24 Jul 2015 03:33:39 +0300
Subject: [PATCH 07/18] Refactor and improve heading parsing.

Now several (relatively) simple REs are used instead of a complex one.
New heading variants are supported:
    * Story title. Chapter X. Chapter title (part Y)
    * Story title. Chapter X (continued)
    * Story title. First chapter
Potentially overridable method `Chapter._extractHeading()' extracted.
---
 fanficfare/adapters/adapter_masseffect2in.py | 253 +++++++++++--------
 1 file changed, 141 insertions(+), 112 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 73047abd..70e9ce0b 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -340,20 +340,20 @@ class Chapter(object):
         self._infoBar = None
 
     def getIndex(self):
-        parsedHeading = self._getHeading()
+        parsedHeading = self.__getHeading()
         if 'chapterIndex' in parsedHeading:
             return parsedHeading['chapterIndex']
 
     def getPartIndex(self):
-        parsedHeading = self._getHeading()
+        parsedHeading = self.__getHeading()
         if 'partIndex' in parsedHeading:
             return parsedHeading['partIndex']
 
     def getStoryTitle(self):
-        return self._getHeading()['storyTitle']
+        return self.__getHeading()['storyTitle']
 
     def getTitle(self):
-        return self._getHeading()['chapterTitle']
+        return self.__getHeading()['chapterTitle']
 
     def getAuthorId(self):
         return self._getAuthor()['id']
@@ -426,124 +426,153 @@ class Chapter(object):
         else:
             return storyTitle != thisStoryTitle
 
-    CHAPTER_NUMBER_PATTERN = re.compile(
+    def _extractHeading(self):
+        """Extracts header text from the document."""
+        return stripHTML(
+            self._document.find('div', {'class': 'eTitle'}).string)
+
+    def __getHeading(self):
+        if not self._parsedHeading:
+            self._parsedHeading = self.__parseHeading()
+        return self._parsedHeading
+
+    NUMBERING_TITLE_PATTERN = re.compile(
+        u'''(?P<brace>\()?
+            (?P<essence>начало|продолжение|окончание|
+            часть\s(?:первая|вторая|третья|четвертая|пятая|шестая|седьмая|восьмая|девятая|десятая))
+            (?(brace)\)|\.)?
+        ''',
+        re.IGNORECASE | re.UNICODE | re.VERBOSE)
+
+    def __parseHeading(self):
+        """Locates chapter heading and extracts meaningful parts from it.
+        Returns a dictionary containing `storyTitle', `chapterTitle' (including numbering if allowed by settings,
+        may be the same as `storyTitle' for short stories, or generated from indices), `chapterIndex' (optional,
+        may be zero), and `partIndex' (optional, chapter part, may be zero)."""
+        try:
+            heading = self._extractHeading()
+        except Exception, error:
+            raise ParsingError(u'Failed to locate title: %s.' % error)
+
+        chapterIndex, partIndex, storyTitle, chapterTitle = self.__splitHeading(heading)
+        if chapterTitle:
+            match = re.search(self.NUMBERING_TITLE_PATTERN, chapterTitle)
+            if match:
+                chapterTitle = u'Глава %d. %s' % (chapterIndex, match.group('essence').capitalize())
+            elif self._configuration['needsChapterNumbering']:
+                if partIndex is not None:
+                    chapterTitle = u'%d.%d. %s' % (chapterIndex, partIndex, chapterTitle)
+                else:
+                    chapterTitle = u'%d. %s' % (chapterIndex, chapterTitle)
+        else:
+            chapterTitle = u'Глава %d' % chapterIndex
+            if partIndex is not None:
+                chapterTitle += u' (часть %d)' % partIndex
+
+        self._parsedHeading = {
+            'storyTitle': storyTitle,
+            'chapterTitle': chapterTitle
+        }
+        if chapterIndex is not None:
+            self._parsedHeading['chapterIndex'] = chapterIndex
+        if partIndex is not None:
+            self._parsedHeading['partIndex'] = partIndex
+            return self._parsedHeading
+        return self._parsedHeading
+
+    # Patterns below start end end with the same optional separator characters (to filter them)
+    # and allow only freestanding groups of 1--3 digits (ti filter long numbers in titles).
+
+    OUTLINE_PATTERN = re.compile(
         u'''[\.:\s]*
-            (?:глава)?  # `Chapter' in Russian.
-            \s
-            (?:(?P<chapterIndex>\d{1,3})(?=\D|$))
-            (?:
-              (?:
-                # For `X.Y' and `X-Y' numbering styles:
-                [\-\.]|
-                # For `Chapter X (part Y)' and similar numbering styles:
-                [\.,]?\s
-                (?P<brace>\()?
-                (?:часть)?      # `Part' in Russian.
-                \s
-              )
-              (?P<partIndex>\d{1,3})
-              (?(brace)\))
-            )?
+            (?:глава\s)?
+            (?:(?<!\d)(?P<chapterIndex>\d{1,3})(?=\D))
+            [\.-]
+            (?:(?P<partIndex>\d{1,3})(?=\D|$))
+            [\.:\s]*
+        ''',
+        re.IGNORECASE | re.UNICODE | re.VERBOSE)
+
+    CHAPTER_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?:глава\s)?(?:(?<!\d)(?P<chapterIndex>\d{1,3})(?=\D|$))
+            [\.:\s]*
+        ''',
+        re.IGNORECASE | re.UNICODE | re.VERBOSE)
+
+    PART_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?:[\.,]?\s)?
+            (?P<brace>\()?
+            (?:часть\s)?
+            (?:(?<!\d)(?P<partIndex>\d{1,3})(?=\D|$))
+            (?(brace)\))
+            [\.:\s]*
+        ''',
+        re.IGNORECASE | re.UNICODE | re.VERBOSE)
+
+    PROLOGUE_EPILOGUE_PATTERN = re.compile(
+        u'''[\.:\s]*
+            (?P<keyword>пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
             [\.:\s]*
          ''',
         re.IGNORECASE + re.UNICODE + re.VERBOSE)
 
-    PROLOGUE_EPILOGUE_PATTERN = re.compile(
-        u'''[\.:\s]*         # Optional separators.
-            (пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
-            [\.:\s]*         # Optional separators.
-         ''',
-        re.IGNORECASE + re.UNICODE + re.VERBOSE)
-
-    def _getHeading(self):
-        if not self._parsedHeading:
-            self._parsedHeading = self._parseHeading()
-        return self._parsedHeading
-
-    def _parseHeading(self):
-        """Extracts meaningful parts from full chapter heading with.
-        Returns a dictionary containing `storyTitle', `chapterTitle'
-        (including numbering if allowed by settings, may be the same as
-        `storyTitle' for short stories), `chapterIndex' (optional, may be
-        zero), and `partIndex' (optional, chapter part, may be zero).
-        When no dedicated chapter title is present, generates one based on
-        chapter and part indices.  Correctly handles `prologue' and `epilogue'
-        cases."""
-        try:
-            heading = stripHTML(
-                self._document.find('div', {'class': 'eTitle'}).string)
-        except AttributeError:
-            raise ParsingError(u'Failed to locate title.')
-
-        match = re.search(self.CHAPTER_NUMBER_PATTERN, heading)
-        if match:
-            chapterIndex = int(match.group('chapterIndex'))
-            # There are cases with zero chapter or part number (e. g.:
-            # numbered prologue, not to be confused with just `Prologue').
-            if match.group('partIndex'):
-                partIndex = int(match.group('partIndex'))
-            else:
-                partIndex = None
-            chapterTitle = heading[match.end():].strip()
-            if chapterTitle:
-                if self._configuration['needsChapterNumbering']:
-                    if partIndex is not None:
-                        title = u'%d.%d. %s' % \
-                                (chapterIndex, partIndex, chapterTitle)
-                    else:
-                        title = u'%d. %s' % (chapterIndex, chapterTitle)
-                else:
-                    title = chapterTitle
-            else:
-                title = u'Глава %d' % chapterIndex
-                if partIndex:
-                    title += u' (часть %d)' % partIndex
-
-            # For seldom found cases like `Story: prologue and chapter 1'.
-            storyTitle = heading[:match.start()]
-            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle)
+    def __splitHeading(self, heading):
+        """Parses chapter heading text into meaningful parts.
+        Returns a tuple(chapter index, part index, story title, chapter title).
+        Any or both of the indices may be None if absent, chapter title may be empty (only if chapter index is None)."""
+        def filterPrologueOrEpilogue(title):
+            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, title)
             if match:
-                matches = list(
-                    re.finditer(u'[:\.]', storyTitle))
+                matches = list(re.finditer(u'[:\.]', title))
                 if matches:
                     realStoryTitleEnd = matches[-1].start()
-                    if realStoryTitleEnd >= 0:
-                        storyTitle = storyTitle[:realStoryTitleEnd]
+                    return title[:realStoryTitleEnd]
+                else:
+                    _logger.warning(
+                        u"Title contains `%s', suspected to be part of numbering, but no period (`.') before it.  "
+                        u"Full title is preserved." % title)
+            return title
+
+        outline_match = re.search(self.OUTLINE_PATTERN, heading)
+        if outline_match:
+            chapter_index = int(outline_match.group('chapterIndex'))
+            part_index = int(outline_match.group('partIndex'))
+            story = heading[:outline_match.start()]
+            story = filterPrologueOrEpilogue(story)
+            chapter = heading[outline_match.end():]
+            return chapter_index, part_index, story, chapter
+        else:
+            chapter_match = re.search(self.CHAPTER_PATTERN, heading)
+            if chapter_match:
+                chapter_index = int(chapter_match.group('chapterIndex'))
+                story = heading[:chapter_match.start()]
+                story = filterPrologueOrEpilogue(story)
+                suffix = heading[chapter_match.end():]
+                part_match = re.search(self.PART_PATTERN, suffix)
+                if part_match:
+                    part_index = int(part_match.group('partIndex'))
+                    if part_match.start() == 0:
+                        chapter = suffix[part_match.end():]
                     else:
-                        _logger.warning(
-                            u"Title contains `%s', suspected to be part of "
-                            u"numbering, but no period (`.') before it.  "
-                            u"Full title is preserved." % storyTitle)
-
-            self._parsedHeading = {
-                'storyTitle': unicode(storyTitle),
-                'chapterTitle': unicode(title),
-                'chapterIndex': chapterIndex
-            }
-            if partIndex is not None:
-                self._parsedHeading['partIndex'] = partIndex
-            return self._parsedHeading
-
-        match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading)
-        if match:
-            storyTitle = heading[:match.start()]
-            chapterTitle = heading[match.end():].strip()
-            matchedText = heading[match.start():match.end()]
-            if chapterTitle:
-                title = u'%s. %s' % (matchedText, chapterTitle)
+                        chapter = suffix[:part_match.start()]
+                    return chapter_index, part_index, story, chapter
+                else:
+                    chapter = heading[chapter_match.end():]
+                    return chapter_index, None, story, chapter
             else:
-                title = matchedText
-            self._parsedHeading = {
-                'storyTitle': unicode(storyTitle),
-                'chapterTitle': unicode(title)
-            }
-            return self._parsedHeading
-
-        self._parsedHeading = {
-            'storyTitle': unicode(heading),
-            'chapterTitle': unicode(heading)
-        }
-        return self._parsedHeading
+                match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading)
+                if match:
+                    story = heading[:match.start()]
+                    chapter = heading[match.end():]
+                    keyword = match.group('keyword')
+                    if chapter:
+                        chapter = u"%s. %s" % (keyword.title(), chapter)
+                    else:
+                        chapter = keyword
+                    return None, None, story, chapter
+        return None, None, heading, heading
 
     def _getAuthor(self):
         if not self._author:

From 2516e617e44a193ee82d47829b9dc78a24703b32 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Fri, 24 Jul 2015 03:37:58 +0300
Subject: [PATCH 08/18] Remove more stop-word variants.

---
 fanficfare/adapters/adapter_masseffect2in.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 70e9ce0b..ede12ae2 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -700,7 +700,7 @@ class Chapter(object):
     # Intended to be used with whole strings!
     ETC_PATTERN = re.compile(
         u'''[и&]\s(?:
-              (?:т\.?\s?[пд]\.?)|
+              (?:т\.?\s?[пд]?\.?)|
               (?:др(?:угие|\.)?)|
               (?:пр(?:очие|\.)?)|
               # Note: identically looking letters `K' and `o'

From 9c84c7201ccaeb68ec9fa00a39766dc86ddb6126 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Fri, 24 Jul 2015 14:23:46 +0300
Subject: [PATCH 09/18] Replace intricate numbering parsing with simpler
 approach.

Instead of locating numbering elements in headings, extracting titles
and indices and then combining them into chapter titles, we employ a
much simpler approach:
    * The longest common prefix of all headings is the story title.
    * Everything after it in every heading is the chapter title.
    * If `chapter X' is found in a heading, the prefix length is corrected.
    * If a chapter title contains a numbering prefix, the chapter index is
      extracted (but not the part index and not the chapter title separately).
---
 fanficfare/adapters/adapter_masseffect2in.py | 246 ++++---------------
 1 file changed, 44 insertions(+), 202 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index ede12ae2..0ee6209e 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -105,8 +105,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 url = self._makeDocumentUrl(self._getDocumentId(url))
                 following = self._makeChapter(url)
                 # Do not follow links to related, but different stories (prequels or sequels).
-                startingStoryTitle = self.story.getMetadata('title')
-                if not following.isFromStory(startingStoryTitle):
+                if not following.isFromStory(starting.getHeading()):
                     return
                 if forward:
                     yield following
@@ -123,7 +122,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             raise
 
         try:
-            self.story.setMetadata('title', startingChapter.getStoryTitle())
             self.story.setMetadata('author', startingChapter.getAuthorName())
             authorId = startingChapter.getAuthorId()
             authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
@@ -148,12 +146,28 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             [startingChapter] + \
             list(followChapters(startingChapter, forward=True))
 
-        try:
-            for chapter in chapters:
-                url = chapter.getUrl()
-                self._chapters[url] = chapter
-                _logger.debug(u"Processing chapter `%s'.", url)
+        headings = [chapter.getHeading() for chapter in chapters]
+        largestCommonPrefix = _getLargestCommonPrefix(*headings)
+        prefixLength = len(largestCommonPrefix)
+        storyTitleEnd, chapterTitleStart = prefixLength, prefixLength
+        match = re.search(u'[:\.\s]*(?P<chapter>глава\s+)?$', largestCommonPrefix, re.IGNORECASE | re.UNICODE)
+        if match:
+            storyTitleEnd -= len(match.group())
+            label = match.group('chapter')
+            if label:
+                chapterTitleStart -= len(label)
+        storyTitle = largestCommonPrefix[:storyTitleEnd]
+        self.story.setMetadata('title', storyTitle)
 
+        garbagePattern = re.compile(u'(?P<start>^)?[:\.\s]*(?(start)|$)', re.UNICODE)
+        indexPattern = re.compile(u'(?:глава\s)?(?:(?<!\d)(?P<index>\d{1,3})(?=\D|$))', re.IGNORECASE | re.UNICODE)
+
+        for chapter in chapters:
+            url = chapter.getUrl()
+            self._chapters[url] = chapter
+            _logger.debug(u"Processing chapter `%s'.", url)
+
+            try:
                 datePublished = min(datePublished, chapter.getDate())
                 dateUpdated = max(dateUpdated, chapter.getDate())
 
@@ -162,14 +176,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
                 wordCount += self._getWordCount(chapter.getTextElement())
 
-                index = chapter.getIndex()
-                if index:
-                    chapterCount = max(chapterCount, index)
-                else:
-                    chapterCount += 1
-
-                # Story is in progress if any chapter is in progress.
-                # Some chapters may have no status attribute.
+                # Story is in progress if any chapter is in progress. Some chapters may have no status attribute.
                 chapterInProgress = chapter.isInProgress()
                 if chapterInProgress is not None:
                     storyInProgress |= chapterInProgress
@@ -178,29 +185,18 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 if chapter.isRatingAdult():
                     self.story.setMetadata('is_adult', True)
 
-            titles = [chapter.getTitle() for chapter in chapters]
-            hasNumbering = any([chapter.getIndex() is not None for chapter in chapters])
-            if not hasNumbering:
-                # There are stories without chapter numbering, but under single title,
-                # which is heading prefix (such stories are not series).  We identify
-                # common prefix for all chapters and use it as story title, trimming
-                # chapter titles the length of this prefix.
-                largestCommonPrefix = _getLargestCommonPrefix(*titles)
-                prefixLength = len(largestCommonPrefix)
-                storyTitle = re.sub(u'[:\.\s]*$', u'', largestCommonPrefix, re.UNICODE)
-                self.story.setMetadata('title', storyTitle)
-                for chapter in chapters:
-                    self.chapterUrls.append(
-                        (chapter.getTitle()[prefixLength:], chapter.getUrl()))
-            else:
-                # Simple processing for common cases.
-                for chapter in chapters:
-                    self.chapterUrls.append(
-                        (chapter.getTitle(), chapter.getUrl()))
+                chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:])
 
-        except ParsingError, error:
-                raise exceptions.FailedToDownload(
-                    u"Failed to download chapter `%s': %s" % (url, error))
+                match = re.search(indexPattern, chapterTitle)
+                if match:
+                    index = int(match.group('index'))
+                    chapterCount = max(chapterCount, index)
+                else:
+                    chapterCount += 1
+
+                self.chapterUrls.append((chapterTitle, url))
+            except ParsingError, error:
+                    raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error))
 
         # Some metadata are handled separately due to format conversions.
         self.story.setMetadata(
@@ -266,10 +262,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 raise exceptions.PersonalIniFailed(
                     u"Missing `rating_titles' setting", u"MassEffect2.in", u"?")
 
-            self._parsingConfiguration['needsChapterNumbering'] = \
-                self.getConfig('strip_chapter_numbers', False) \
-                and not self.getConfig('add_chapter_numbers', False)
-
             self._parsingConfiguration['excludeEditorSignature'] = \
                 self.getConfig('exclude_editor_signature', False)
 
@@ -332,28 +324,15 @@ class Chapter(object):
         self._url = url
         self._document = document
         # Lazy-loaded:
-        self._parsedHeading = None
+        self._heading = None
         self._date = None
         self._author = None
         self._attributes = None
         self._textElement = None
         self._infoBar = None
 
-    def getIndex(self):
-        parsedHeading = self.__getHeading()
-        if 'chapterIndex' in parsedHeading:
-            return parsedHeading['chapterIndex']
-
-    def getPartIndex(self):
-        parsedHeading = self.__getHeading()
-        if 'partIndex' in parsedHeading:
-            return parsedHeading['partIndex']
-
-    def getStoryTitle(self):
-        return self.__getHeading()['storyTitle']
-
-    def getTitle(self):
-        return self.__getHeading()['chapterTitle']
+    def getHeading(self):
+        return self._extractHeading()
 
     def getAuthorId(self):
         return self._getAuthor()['id']
@@ -413,7 +392,7 @@ class Chapter(object):
             match = re.search(u'^\s*\w+', string, re.UNICODE)
             return string[match.start():match.end()]
 
-        thisStoryTitle = self.getStoryTitle()
+        thisStoryTitle = self.getHeading()
         if prefixThreshold != 0:
             if prefixThreshold < 0:
                 prefixThreshold = min(
@@ -432,147 +411,9 @@ class Chapter(object):
             self._document.find('div', {'class': 'eTitle'}).string)
 
     def __getHeading(self):
-        if not self._parsedHeading:
-            self._parsedHeading = self.__parseHeading()
-        return self._parsedHeading
-
-    NUMBERING_TITLE_PATTERN = re.compile(
-        u'''(?P<brace>\()?
-            (?P<essence>начало|продолжение|окончание|
-            часть\s(?:первая|вторая|третья|четвертая|пятая|шестая|седьмая|восьмая|девятая|десятая))
-            (?(brace)\)|\.)?
-        ''',
-        re.IGNORECASE | re.UNICODE | re.VERBOSE)
-
-    def __parseHeading(self):
-        """Locates chapter heading and extracts meaningful parts from it.
-        Returns a dictionary containing `storyTitle', `chapterTitle' (including numbering if allowed by settings,
-        may be the same as `storyTitle' for short stories, or generated from indices), `chapterIndex' (optional,
-        may be zero), and `partIndex' (optional, chapter part, may be zero)."""
-        try:
-            heading = self._extractHeading()
-        except Exception, error:
-            raise ParsingError(u'Failed to locate title: %s.' % error)
-
-        chapterIndex, partIndex, storyTitle, chapterTitle = self.__splitHeading(heading)
-        if chapterTitle:
-            match = re.search(self.NUMBERING_TITLE_PATTERN, chapterTitle)
-            if match:
-                chapterTitle = u'Глава %d. %s' % (chapterIndex, match.group('essence').capitalize())
-            elif self._configuration['needsChapterNumbering']:
-                if partIndex is not None:
-                    chapterTitle = u'%d.%d. %s' % (chapterIndex, partIndex, chapterTitle)
-                else:
-                    chapterTitle = u'%d. %s' % (chapterIndex, chapterTitle)
-        else:
-            chapterTitle = u'Глава %d' % chapterIndex
-            if partIndex is not None:
-                chapterTitle += u' (часть %d)' % partIndex
-
-        self._parsedHeading = {
-            'storyTitle': storyTitle,
-            'chapterTitle': chapterTitle
-        }
-        if chapterIndex is not None:
-            self._parsedHeading['chapterIndex'] = chapterIndex
-        if partIndex is not None:
-            self._parsedHeading['partIndex'] = partIndex
-            return self._parsedHeading
-        return self._parsedHeading
-
-    # Patterns below start end end with the same optional separator characters (to filter them)
-    # and allow only freestanding groups of 1--3 digits (ti filter long numbers in titles).
-
-    OUTLINE_PATTERN = re.compile(
-        u'''[\.:\s]*
-            (?:глава\s)?
-            (?:(?<!\d)(?P<chapterIndex>\d{1,3})(?=\D))
-            [\.-]
-            (?:(?P<partIndex>\d{1,3})(?=\D|$))
-            [\.:\s]*
-        ''',
-        re.IGNORECASE | re.UNICODE | re.VERBOSE)
-
-    CHAPTER_PATTERN = re.compile(
-        u'''[\.:\s]*
-            (?:глава\s)?(?:(?<!\d)(?P<chapterIndex>\d{1,3})(?=\D|$))
-            [\.:\s]*
-        ''',
-        re.IGNORECASE | re.UNICODE | re.VERBOSE)
-
-    PART_PATTERN = re.compile(
-        u'''[\.:\s]*
-            (?:[\.,]?\s)?
-            (?P<brace>\()?
-            (?:часть\s)?
-            (?:(?<!\d)(?P<partIndex>\d{1,3})(?=\D|$))
-            (?(brace)\))
-            [\.:\s]*
-        ''',
-        re.IGNORECASE | re.UNICODE | re.VERBOSE)
-
-    PROLOGUE_EPILOGUE_PATTERN = re.compile(
-        u'''[\.:\s]*
-            (?P<keyword>пролог|эпилог)  # `Prologue' or `epilogue' in Russian.
-            [\.:\s]*
-         ''',
-        re.IGNORECASE + re.UNICODE + re.VERBOSE)
-
-    def __splitHeading(self, heading):
-        """Parses chapter heading text into meaningful parts.
-        Returns a tuple(chapter index, part index, story title, chapter title).
-        Any or both of the indices may be None if absent, chapter title may be empty (only if chapter index is None)."""
-        def filterPrologueOrEpilogue(title):
-            match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, title)
-            if match:
-                matches = list(re.finditer(u'[:\.]', title))
-                if matches:
-                    realStoryTitleEnd = matches[-1].start()
-                    return title[:realStoryTitleEnd]
-                else:
-                    _logger.warning(
-                        u"Title contains `%s', suspected to be part of numbering, but no period (`.') before it.  "
-                        u"Full title is preserved." % title)
-            return title
-
-        outline_match = re.search(self.OUTLINE_PATTERN, heading)
-        if outline_match:
-            chapter_index = int(outline_match.group('chapterIndex'))
-            part_index = int(outline_match.group('partIndex'))
-            story = heading[:outline_match.start()]
-            story = filterPrologueOrEpilogue(story)
-            chapter = heading[outline_match.end():]
-            return chapter_index, part_index, story, chapter
-        else:
-            chapter_match = re.search(self.CHAPTER_PATTERN, heading)
-            if chapter_match:
-                chapter_index = int(chapter_match.group('chapterIndex'))
-                story = heading[:chapter_match.start()]
-                story = filterPrologueOrEpilogue(story)
-                suffix = heading[chapter_match.end():]
-                part_match = re.search(self.PART_PATTERN, suffix)
-                if part_match:
-                    part_index = int(part_match.group('partIndex'))
-                    if part_match.start() == 0:
-                        chapter = suffix[part_match.end():]
-                    else:
-                        chapter = suffix[:part_match.start()]
-                    return chapter_index, part_index, story, chapter
-                else:
-                    chapter = heading[chapter_match.end():]
-                    return chapter_index, None, story, chapter
-            else:
-                match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading)
-                if match:
-                    story = heading[:match.start()]
-                    chapter = heading[match.end():]
-                    keyword = match.group('keyword')
-                    if chapter:
-                        chapter = u"%s. %s" % (keyword.title(), chapter)
-                    else:
-                        chapter = keyword
-                    return None, None, story, chapter
-        return None, None, heading, heading
+        if not self._heading:
+            self._heading = self._extractHeading()
+        return self._heading
 
     def _getAuthor(self):
         if not self._author:
@@ -804,9 +645,10 @@ class Chapter(object):
 
 
 def _getLargestCommonPrefix(*args):
-    """Returns largest common prefix of all unicode(!) arguments.
+    """Returns largest common prefix of all unicode arguments, ignoring case.
     :rtype : unicode
     """
     from itertools import takewhile, izip
-    allSame = lambda xs: len(set(xs)) == 1
+    toLower = lambda xs: map(lambda x: x.lower(), xs)
+    allSame = lambda xs: len(set(toLower(xs))) == 1
     return u''.join([i[0] for i in takewhile(allSame, izip(*args))])

From fb6a8fc9315a5fa6c00add215bd2f655522d2bdd Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 21:09:24 +0300
Subject: [PATCH 10/18] Support more variants of chapter and story attribute
 formats.

Additionally, parse characters and pairings into separate lists.
---
 fanficfare/adapters/adapter_masseffect2in.py | 121 ++++++++++++-------
 1 file changed, 80 insertions(+), 41 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 0ee6209e..844ebb1f 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -24,7 +24,7 @@ import urllib2
 import codecs
 
 from .. import BeautifulSoup as bs
-from ..htmlcleanup import stripHTML
+from ..htmlcleanup import removeEntities, stripHTML
 from .. import exceptions as exceptions
 from base_adapter import BaseSiteAdapter, makeDate
 
@@ -173,6 +173,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
                 self.story.extendList('genre', chapter.getGenres())
                 self.story.extendList('characters', chapter.getCharacters())
+                self.story.extendList('ships', chapter.getPairings())
 
                 wordCount += self._getWordCount(chapter.getTextElement())
 
@@ -350,16 +351,13 @@ class Chapter(object):
         return self._getAttributes()['rating']['isAdult']
 
     def getCharacters(self):
-        attributes = self._getAttributes()
-        if 'characters' in attributes:
-            return attributes['characters']
-        return []
+        return self._getListAttribute('characters')
+
+    def getPairings(self):
+        return self._getListAttribute('pairings')
 
     def getGenres(self):
-        attributes = self._getAttributes()
-        if 'genres' in attributes:
-            return attributes['genres']
-        return []
+        return self._getListAttribute('genres')
 
     def isInProgress(self):
         attributes = self._getAttributes()
@@ -405,6 +403,13 @@ class Chapter(object):
         else:
             return storyTitle != thisStoryTitle
 
+    def _getListAttribute(self, name):
+        """Return an attribute value as a list or an empty list if the attribute is absent."""
+        attributes = self._getAttributes()
+        if name in attributes:
+            return attributes[name]
+        return []
+
     def _extractHeading(self):
         """Extracts header text from the document."""
         return stripHTML(
@@ -466,41 +471,66 @@ class Chapter(object):
         return self._attributes
 
     def _parseAttributes(self):
+        """Parse chapter attribute block and return it as a dictionary with standard entries."""
+
         attributes = {}
+        attributesText = u''
         try:
-            elements = self._document \
+            starter = self._document \
                 .find('div', {'class': 'comm-div'}) \
-                .findNextSibling('div', {'class': 'cb'}) \
-                .nextGenerator()
-            attributesText = u''
-            for element in elements:
-                if not element:
-                    _logger.warning(u'Attribute block not terminated!')
-                    break
+                .findNextSibling('div', {'class': 'cb'})
+            bound = starter.findNextSibling('div', {'class': 'cb'})
+
+            def processElement(element):
+                """Return textual representation of an *inline* element of chapter attribute block."""
+                result = u''
                 if isinstance(element, bs.Tag):
-                    # Although deprecated, `has_key()' is required here.
-                    if element.name == 'div' and \
-                            element.has_key('class') and \
-                            element['class'] == 'cb':
+                    if element.name in ('b', 'strong', 'font', 'br'):
+                        result += u"\n"
+                    if element.name == 's':
+                        result += u"<s>%s</s>" % stripHTML(element)
+                    else:
+                        result += stripHTML(element)
+                else:
+                    result += removeEntities(element)
+                return result
+
+            elements = starter.nextSiblingGenerator()
+            for element in elements:
+                if isinstance(element, bs.Tag):
+                    if element == bound:
+                        break
+                    else:
+                        if element.name in ('div', 'p'):
+                            attributesText += u"\n"
+                            for child in element.childGenerator():
+                                attributesText += processElement(child)
+                            continue
+                attributesText += processElement(element)
+
+            elements = starter.nextGenerator()
+            for element in elements:
+                if isinstance(element, bs.Tag):
+                    if element == bound:
                         break
                     elif element.name == 'img':
                         rating = self._parseRatingFromImage(element)
                         if rating:
                             attributes['rating'] = rating
-                else:
-                    attributesText += stripHTML(element)
+                            break
         except AttributeError or TypeError:
             raise ParsingError(u'Failed to locate and collect attributes.')
 
-        for record in re.split(u';|\.', attributesText):
-            parts = record.split(u':', 1)
-            if len(parts) < 2:
+        separators = u"\r\n :;."
+        for line in attributesText.split(u'\n'):
+            if line.count(u':') != 1:
                 continue
-            key = parts[0].strip().lower()
-            value = parts[1].strip().strip(u'.')
+            key, value = line.split(u':', 1)
+            key = key.strip(separators).lower()
+            value = value.strip().strip(separators)
             parsed = self._parseAttribute(key, value)
-            if parsed:
-                attributes[parsed[0]] = parsed[1]
+            for parsedKey, parsedValue in parsed.iteritems():
+                attributes[parsedKey] = parsedValue
 
         if 'rating' not in attributes:
             raise ParsingError(u'Failed to locate or recognize rating!')
@@ -552,7 +582,10 @@ class Chapter(object):
         re.IGNORECASE + re.UNICODE + re.VERBOSE)
 
     def _parseAttribute(self, key, value):
-        """Parses a single known attribute value for chapter metadata."""
+        """
+        Parse a single a single record in chapter attributes for chapter metadata.
+        Return a dictionary of canonical attributes and values (i. e. multiple attributes may be discovered).
+        """
 
         def refineCharacter(name):
             """Refines character name from stop-words and distortions."""
@@ -563,21 +596,27 @@ class Chapter(object):
             return canonicalName
 
         if re.match(u'жанры?', key, re.UNICODE):
-            definitions = value.split(u',')
-            if len(definitions) > 4:
-                _logger.warning(u'Possibly incorrect genre detection!')
-            genres = []
-            for definition in definitions:
-                genres += definition.split(u'/')
-            return 'genres', genres
+            genres = filter(bool, map(unicode.strip, re.split(u'[,;/]', value)))
+            return {'genres': genres}
         elif key == u'статус':
             isInProgress = value == u'в процессе'
-            return 'isInProgress', isInProgress
+            return {'isInProgress': isInProgress}
         elif key == u'персонажи':
-            characters = [refineCharacter(name) for name in value.split(u',')]
-            return 'characters', characters
+            participants = map(refineCharacter, re.split(u'[,;]', value))
+            characters = []
+            pairings = []
+            for participant in participants:
+                if u'/' in participant:
+                    pairings.append(participant)
+                else:
+                    characters.append(participant)
+            return {
+                'characters': characters,
+                'pairings': pairings
+            }
         else:
             _logger.debug(u"Unrecognized attribute `%s' ignored.", key)
+            return {}
 
     def _getTextElement(self):
         if not self._textElement:

From e2e4590f1d20e943a86363ccb22b9d2764831ec6 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 21:13:47 +0300
Subject: [PATCH 11/18] Consider story in progress if the last, not any,
 chapter is in progress.

---
 fanficfare/adapters/adapter_masseffect2in.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 844ebb1f..076ca378 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -177,10 +177,11 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
                 wordCount += self._getWordCount(chapter.getTextElement())
 
-                # Story is in progress if any chapter is in progress. Some chapters may have no status attribute.
+                # Chapter status usually represents the story status, so we want the last chapter status.
+                # Some chapters may have no status attribute.
                 chapterInProgress = chapter.isInProgress()
                 if chapterInProgress is not None:
-                    storyInProgress |= chapterInProgress
+                    storyInProgress = chapterInProgress
 
                 # If any chapter is adult, consider the whole story adult.
                 if chapter.isRatingAdult():
@@ -200,8 +201,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                     raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error))
 
         # Some metadata are handled separately due to format conversions.
-        self.story.setMetadata(
-            'status', 'In Progress' if storyInProgress else 'Completed')
+        self.story.setMetadata('status', 'In Progress' if storyInProgress else 'Completed')
         self.story.setMetadata('datePublished', datePublished)
         self.story.setMetadata('dateUpdated', dateUpdated)
         self.story.setMetadata('numWords', str(wordCount))

From b8710eba970c680b258835dbb33317637e9ecb2d Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 21:21:28 +0300
Subject: [PATCH 12/18] Collect story-wide metadata across all chapters.

    * Support multiple authors for story and no author for chapter.
    * Make chapter rating optional.
    * Detect chapter (and thus, story) "adultness" by either rating or
      editor warning, whichever is present.
    * Add first chapter summary as story summary, parse as summary
      either a dedicated attribute or freestanding text.
---
 fanficfare/adapters/adapter_masseffect2in.py | 87 +++++++++++++++-----
 1 file changed, 66 insertions(+), 21 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 076ca378..11199b1c 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -121,17 +121,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 raise exceptions.StoryDoesNotExist(self.url)
             raise
 
-        try:
-            self.story.setMetadata('author', startingChapter.getAuthorName())
-            authorId = startingChapter.getAuthorId()
-            authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
-            self.story.setMetadata('authorId', authorId)
-            self.story.setMetadata('authorUrl', authorUrl)
-            self.story.setMetadata('rating', startingChapter.getRatingTitle())
-        except ParsingError, error:
-            raise exceptions.FailedToDownload(
-                u"Failed to parse story metadata for `%s': %s" % (self.url, error))
-
         # We only have one date for each chapter and assume the oldest one
         # to be publication date and the most recent one to be update date.
         datePublished = datetime.datetime.max
@@ -168,6 +157,28 @@ class MassEffect2InAdapter(BaseSiteAdapter):
             _logger.debug(u"Processing chapter `%s'.", url)
 
             try:
+                authorName = chapter.getAuthorName()
+                if authorName:
+                    self.story.extendList('author', [authorName])
+                    authorId = chapter.getAuthorId()
+                    if authorId:
+                        authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId)
+                    else:
+                        authorId = u''
+                        authorUrl = u''
+                    self.story.extendList('authorId', [authorId])
+                    self.story.extendList('authorUrl', [authorUrl])
+
+                if not self.story.getMetadata('rating'):
+                    ratingTitle = chapter.getRatingTitle()
+                    if ratingTitle:
+                        self.story.setMetadata('rating', ratingTitle)
+
+                if not self.story.getMetadata('description'):
+                    summary = chapter.getSummary()
+                    if summary:
+                        self.story.setMetadata('description', summary)
+
                 datePublished = min(datePublished, chapter.getDate())
                 dateUpdated = max(dateUpdated, chapter.getDate())
 
@@ -184,7 +195,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                     storyInProgress = chapterInProgress
 
                 # If any chapter is adult, consider the whole story adult.
-                if chapter.isRatingAdult():
+                if chapter.isAdult():
                     self.story.setMetadata('is_adult', True)
 
                 chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:])
@@ -198,7 +209,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
                 self.chapterUrls.append((chapterTitle, url))
             except ParsingError, error:
-                    raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error))
+                raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error))
 
         # Some metadata are handled separately due to format conversions.
         self.story.setMetadata('status', 'In Progress' if storyInProgress else 'Completed')
@@ -335,20 +346,36 @@ class Chapter(object):
     def getHeading(self):
         return self._extractHeading()
 
+    def getSummary(self):
+        attributes = self._getAttributes()
+        if 'summary' in attributes:
+            return attributes['summary']
+
     def getAuthorId(self):
-        return self._getAuthor()['id']
+        author = self._getAuthor()
+        if author:
+            return author['id']
 
     def getAuthorName(self):
-        return self._getAuthor()['name']
+        author = self._getAuthor()
+        if author:
+            return author['name']
 
     def getDate(self):
         return self._getDate()
 
     def getRatingTitle(self):
-        return self._getAttributes()['rating']['title']
+        attributes = self._getAttributes()
+        if 'rating' in attributes:
+            return attributes['rating']['title']
 
-    def isRatingAdult(self):
-        return self._getAttributes()['rating']['isAdult']
+    def isAdult(self):
+        attributes = self._getAttributes()
+        if 'rating' in attributes and attributes['rating']['isAdult']:
+            return True
+        if 'warning' in attributes:
+            return True
+        return False
 
     def getCharacters(self):
         return self._getListAttribute('characters')
@@ -522,8 +549,10 @@ class Chapter(object):
             raise ParsingError(u'Failed to locate and collect attributes.')
 
         separators = u"\r\n :;."
+        freestandingText = u''
         for line in attributesText.split(u'\n'):
             if line.count(u':') != 1:
+                freestandingText += line
                 continue
             key, value = line.split(u':', 1)
             key = key.strip(separators).lower()
@@ -532,15 +561,20 @@ class Chapter(object):
             for parsedKey, parsedValue in parsed.iteritems():
                 attributes[parsedKey] = parsedValue
 
+        freestandingText = freestandingText.strip()
+        if 'summary' not in attributes and freestandingText:
+            attributes['summary'] = freestandingText
+
         if 'rating' not in attributes:
-            raise ParsingError(u'Failed to locate or recognize rating!')
+            _logger.warning(u"Failed to locate or recognize rating for `%s'!", self.getUrl())
 
         return attributes
 
+    # Most, but not all, URLs of rating icons match this.
     RATING_LABEL_PATTERN = re.compile(u'/(?P<rating>[ERATINnG]+)\.png$')
 
     def _parseRatingFromImage(self, element):
-        """Given an image element, tries to parse story rating from it."""
+        """Given an image element, try to parse story rating from it."""
         # Although deprecated, `has_key()' is required here.
         if not element.has_key('src'):
             return
@@ -558,7 +592,7 @@ class Chapter(object):
                 }
             else:
                 _logger.warning(u"No title found for rating label `%s'!" % label)
-        # FIXME: It seems, rating has to be optional due to such URLs.
+        # TODO: conduct a research on such abnormal URLs.
         elif source == 'http://www.masseffect2.in/_fr/10/1360399.png':
             label = 'Nn'
             return {
@@ -581,6 +615,9 @@ class Chapter(object):
         ''',
         re.IGNORECASE + re.UNICODE + re.VERBOSE)
 
+    # `Author's Notes' and its variants in Russian.
+    ANNOTATION_PATTERN = re.compile(u'аннотация|описание|(?:(?:за|при)мечание\s)?(?:от\s)?автора', re.UNICODE)
+
     def _parseAttribute(self, key, value):
         """
         Parse a single a single record in chapter attributes for chapter metadata.
@@ -614,6 +651,14 @@ class Chapter(object):
                 'characters': characters,
                 'pairings': pairings
             }
+        elif key == u'предупреждение':
+            return {'warning': value}
+        elif re.match(self.ANNOTATION_PATTERN, key):
+            if not value.endswith(u'.'):
+                value += u'.'
+            # Capitalize would make value[1:] lowercase, which we don't want.
+            value = value[:1].upper() + value[1:]
+            return {'summary': value}
         else:
             _logger.debug(u"Unrecognized attribute `%s' ignored.", key)
             return {}

From 5b01eef4958798a659500ed8be7bce044f9e00b0 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 21:26:41 +0300
Subject: [PATCH 13/18] Add documentation strings, make some methods private
 (__*).

---
 fanficfare/adapters/adapter_masseffect2in.py | 53 +++++++++++---------
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 11199b1c..488dba87 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -280,7 +280,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
         return self._parsingConfiguration
 
     def _getDocumentId(self, url):
-        """Extracts document ID from MassEffect2.in URL."""
+        """Extract document ID from MassEffect2.in URL."""
         match = re.search(self.DOCUMENT_ID_PATTERN, url)
         if not match:
             raise ValueError(u"Failed to extract document ID from `'" % url)
@@ -289,12 +289,11 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
     @classmethod
     def _makeDocumentUrl(cls, documentId):
-        """Makes a chapter URL given a chapter ID."""
+        """Make a chapter URL given a document ID."""
         return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId)
 
     def _loadDocument(self, url):
-        """Fetches URL content and returns its element tree
-        with parsing settings tuned for MassEffect2.in."""
+        """Fetch URL content and return its element tree with parsing settings tuned for MassEffect2.in."""
         return bs.BeautifulStoneSoup(
             self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
 
@@ -302,7 +301,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                   parameters=None,
                   usecache=True,
                   extrasleep=None):
-        """Fetches URL contents, see BaseSiteAdapter for details.
+        """Fetch URL contents, see BaseSiteAdapter for details.
         Overridden to support on-disk cache when debugging Calibre."""
         from calibre.constants import DEBUG
         if DEBUG:
@@ -347,7 +346,7 @@ class Chapter(object):
         return self._extractHeading()
 
     def getSummary(self):
-        attributes = self._getAttributes()
+        attributes = self.__getAttributes()
         if 'summary' in attributes:
             return attributes['summary']
 
@@ -362,15 +361,15 @@ class Chapter(object):
             return author['name']
 
     def getDate(self):
-        return self._getDate()
+        return self.__getDate()
 
     def getRatingTitle(self):
-        attributes = self._getAttributes()
+        attributes = self.__getAttributes()
         if 'rating' in attributes:
             return attributes['rating']['title']
 
     def isAdult(self):
-        attributes = self._getAttributes()
+        attributes = self.__getAttributes()
         if 'rating' in attributes and attributes['rating']['isAdult']:
             return True
         if 'warning' in attributes:
@@ -387,7 +386,7 @@ class Chapter(object):
         return self._getListAttribute('genres')
 
     def isInProgress(self):
-        attributes = self._getAttributes()
+        attributes = self.__getAttributes()
         if 'isInProgress' in attributes:
             return attributes['isInProgress']
 
@@ -398,17 +397,17 @@ class Chapter(object):
         return self._getTextElement()
 
     def getPreviousChapterUrl(self):
-        """Downloads chapters following `Previous chapter' links.
+        """Download chapters following `Previous chapter' links.
         Returns a list of chapters' URLs."""
         return self._getSiblingChapterUrl({'class': 'fl tal'})
 
     def getNextChapterUrl(self):
-        """Downloads chapters following `Next chapter' links.
+        """Download chapters following `Next chapter' links.
         Returns a list of chapters' URLs."""
         return self._getSiblingChapterUrl({'class': 'tar fr'})
 
     def isFromStory(self, storyTitle, prefixThreshold=-1):
-        """Checks if this chapter is from a story different from the given one.
+        """Check if this chapter is from a story different from the given one.
         Prefix threshold specifies how long common story title prefix shall be
         for chapters from one story: negative value means implementation-defined
         optimum, zero inhibits the check, and positive value adjusts threshold."""
@@ -432,27 +431,30 @@ class Chapter(object):
 
     def _getListAttribute(self, name):
         """Return an attribute value as a list or an empty list if the attribute is absent."""
-        attributes = self._getAttributes()
+        attributes = self.__getAttributes()
         if name in attributes:
             return attributes[name]
         return []
 
     def _extractHeading(self):
-        """Extracts header text from the document."""
+        """Extract header text from the document."""
         return stripHTML(
             self._document.find('div', {'class': 'eTitle'}).string)
 
     def __getHeading(self):
+        """Lazily parse and return heading."""
         if not self._heading:
             self._heading = self._extractHeading()
         return self._heading
 
     def _getAuthor(self):
+        """Lazily parse and return author's information."""
         if not self._author:
             self._author = self._parseAuthor()
         return self._author
 
     def _parseAuthor(self):
+        """Locate and parse chapter author's information to a dictionary with author's `id' and `name'."""
         try:
             authorLink = self._getInfoBarElement() \
                 .find('i', {'class': 'icon-user'}) \
@@ -469,12 +471,14 @@ class Chapter(object):
             'name': authorName
         }
 
-    def _getDate(self):
+    def __getDate(self):
+        """Lazily parse chapter date."""
         if not self._date:
             self._date = self._parseDate()
         return self._date
 
     def _parseDate(self):
+        """Locate and parse chapter date."""
         try:
             dateText = self._getInfoBarElement() \
                 .find('i', {'class': 'icon-eye'}) \
@@ -486,13 +490,15 @@ class Chapter(object):
         return date
 
     def _getInfoBarElement(self):
+        """Locate informational bar element, containing chapter date and author, on the page."""
         if not self._infoBar:
             self._infoBar = self._document.find('td', {'class': 'eDetails2'})
             if not self._infoBar:
                 raise ParsingError(u'No informational bar found.')
         return self._infoBar
 
-    def _getAttributes(self):
+    def __getAttributes(self):
+        """Lazily parse attributes."""
         if not self._attributes:
             self._attributes = self._parseAttributes()
         return self._attributes
@@ -664,12 +670,13 @@ class Chapter(object):
             return {}
 
     def _getTextElement(self):
+        """Locate chapter body text element on the page."""
         if not self._textElement:
             self._textElement = self.__collectTextElements()
         return self._textElement
 
     def __collectTextElements(self):
-        """Returns all elements containing parts of chapter text (which may be
+        """Return all elements containing parts of chapter text (which may be
         <p>aragraphs, <div>isions or plain text nodes) under a single root."""
         starter = self._document.find('div', {'id': u'article'})
         if starter is None:
@@ -696,11 +703,9 @@ class Chapter(object):
         return root
 
     def _getSiblingChapterUrl(self, selector):
-        """Downloads chapters one by one by locating and following links
-        specified by a selector.  Returns chapters' URLs in order they
-        were found."""
-        block = self._document\
-            .find('td', {'class': 'eDetails1'})\
+        """Locate a link to a sibling chapter, either previous or next one, and return its URL."""
+        block = self._document \
+            .find('td', {'class': 'eDetails1'}) \
             .find('div', selector)
         if not block:
             return
@@ -709,9 +714,11 @@ class Chapter(object):
             return
         return link['href']
 
+    # Editor signature always starts with something like this.
     SIGNED_PATTERN = re.compile(u'отредактирова(?:но|ла?)[:.\s]', re.IGNORECASE + re.UNICODE)
 
     def _excludeEditorSignature(self, root):
+        """Exclude editor signature from within `root' element."""
         for textNode in root.findAll(text=True):
             if re.match(self.SIGNED_PATTERN, textNode.string):
                 editorLink = textNode.findNext('a')

From 9d5c64b5dbf47b3c0093a50f1d437fc5edd49e99 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 22:00:08 +0300
Subject: [PATCH 14/18] Remove development and debugging facilities.

---
 fanficfare/adapters/adapter_masseffect2in.py | 48 ++++----------------
 1 file changed, 9 insertions(+), 39 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 488dba87..8eb97bd2 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2011 Fanficdownloader team,
-#           2015 FanFicFare team,
-#           2015 Dmitry Kozliuk
+# Copyright 2015 FanFicFare team
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +19,6 @@ import datetime
 import logging
 import re
 import urllib2
-import codecs
 
 from .. import BeautifulSoup as bs
 from ..htmlcleanup import removeEntities, stripHTML
@@ -48,12 +45,15 @@ class ParsingError(Exception):
 
 
 class MassEffect2InAdapter(BaseSiteAdapter):
-    """Provides support for masseffect2.in site as story source.
-    Can be used as a template for sites build upon Ucoz.com engine.
+    """
+    Provides support for MassEffect2.in site as story source.
+    Can be used as a template for sites built upon Ucoz.com engine (until a base class is extracted).
     Specializations:
         1) Russian content (date format, genre names, etc.);
-        2) original `R.A.T.I.N.G.' rating scale, used by masseffect2.in
-           and some affiliated sites."""
+        2) original `E.R.A.T.I.N.G.' rating scale, used by masseffect2.in
+           and some affiliated sites, denoted with images;
+        3) editor signatures and an option to remove them.
+    """
 
     WORD_PATTERN = re.compile(u'\w+', re.UNICODE)
     DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+')
@@ -297,36 +297,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
         return bs.BeautifulStoneSoup(
             self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
 
-    def _fetchUrl(self, url,
-                  parameters=None,
-                  usecache=True,
-                  extrasleep=None):
-        """Fetch URL contents, see BaseSiteAdapter for details.
-        Overridden to support on-disk cache when debugging Calibre."""
-        from calibre.constants import DEBUG
-        if DEBUG:
-            import os
-            documentId = self._getDocumentId(url)
-            path = u'./cache/%s' % documentId
-            if os.path.isfile(path) and os.access(path, os.R_OK):
-                _logger.debug(u"On-disk cache HIT for `%s'.", url)
-                with codecs.open(path, encoding='utf-8') as input:
-                    return input.read()
-            else:
-                _logger.debug(u"On-disk cache MISS for `%s'.", url)
-
-        content = BaseSiteAdapter._fetchUrl(
-            self, url, parameters, usecache, extrasleep)
-
-        if DEBUG:
-            import os
-            if os.path.isdir(os.path.dirname(path)):
-                _logger.debug(u"Caching `%s' content on disk.", url)
-                with codecs.open(path, mode='w', encoding='utf-8') as output:
-                    output.write(content)
-
-        return content
-
 
 class Chapter(object):
     """Represents a lazily-parsed chapter of a story."""
@@ -666,7 +636,7 @@ class Chapter(object):
             value = value[:1].upper() + value[1:]
             return {'summary': value}
         else:
-            _logger.debug(u"Unrecognized attribute `%s' ignored.", key)
+            _logger.info(u"Unrecognized attribute `%s' ignored.", key)
             return {}
 
     def _getTextElement(self):

From 13dde78139d20770085dc140f3207367444b7cf6 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 22:07:50 +0300
Subject: [PATCH 15/18] Append warnings to story metadata.

---
 fanficfare/adapters/adapter_masseffect2in.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index 8eb97bd2..b8c924af 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -197,6 +197,9 @@ class MassEffect2InAdapter(BaseSiteAdapter):
                 # If any chapter is adult, consider the whole story adult.
                 if chapter.isAdult():
                     self.story.setMetadata('is_adult', True)
+                    warning = chapter.getWarning()
+                    if warning:
+                        self.story.extendList('warnings', [warning])
 
                 chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:])
 
@@ -346,6 +349,11 @@ class Chapter(object):
             return True
         return False
 
+    def getWarning(self):
+        attributes = self.__getAttributes()
+        if 'warning' in attributes:
+            return attributes['warning']
+
     def getCharacters(self):
         return self._getListAttribute('characters')
 

From db1cf8587c4315845dead91160c62bd454d97381 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Sun, 26 Jul 2015 23:08:14 +0300
Subject: [PATCH 16/18] Raise `slow_down_sleep_time' to prevent IP ban for
 excessive requests.

---
 calibre-plugin/plugin-defaults.ini | 5 +++++
 fanficfare/defaults.ini            | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index a68a5d99..a0832d90 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1226,6 +1226,11 @@ extracategories:Harry Potter
 ## Site dedicated to this fandom.
 extracategories: Mass Effect
 
+## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency.
+## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
+## With default settings, a several-hours IP ban may follow, so set it lifted.
+slow_down_sleep_time: 2
+
 ## Whether to exclude editor signature from the bottom if chapter text.
 exclude_editor_signature: false
 
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index d9069cf8..856bfa33 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1827,6 +1827,11 @@ extracategories:Lord of the Rings
 ## Site dedicated to this fandom.
 extracategories: Mass Effect
 
+## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency.
+## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
+## With default settings, a several-hours IP ban may follow, so set it lifted.
+slow_down_sleep_time: 2
+
 ## Whether to exclude editor signature from the bottom if chapter text.
 exclude_editor_signature: false
 

From 1ad45299db34dc6598ded351eb8122441396d963 Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Mon, 27 Jul 2015 18:15:10 +0300
Subject: [PATCH 17/18] Fix mistakes and typos in configuration per JimmXinu's
 suggestion.

See GitHub PR comments:
    https://github.com/JimmXinu/FanFicFare/pull/103#discussion_r35535523
    https://github.com/JimmXinu/FanFicFare/pull/103#discussion_r35535396
---
 calibre-plugin/plugin-defaults.ini | 10 +++++-----
 fanficfare/defaults.ini            | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini
index ebaa3f73..c8161ee9 100644
--- a/calibre-plugin/plugin-defaults.ini
+++ b/calibre-plugin/plugin-defaults.ini
@@ -1227,15 +1227,15 @@ extracategories:Harry Potter
 extracategories: Mass Effect
 
 ## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency.
-## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
-## With default settings, a several-hours IP ban may follow, so set it lifted.
+## Reports vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
+## With default settings, a several-hours IP ban may follow, so set it higher.
 slow_down_sleep_time: 2
 
-## Whether to exclude editor signature from the bottom if chapter text.
+## Whether to exclude editor signature from the bottom of chapter text.
 exclude_editor_signature: false
 
-## Stories on the site almost never have cover image.
-## May be adjusted in `personal.ini' on per-story basis.
+## Stories on the site almost never have cover image, and for the stories which do,
+## this may be adjusted in `personal.ini' before downloading.
 never_make_cover: true
 
 ## Titles for ratings identified by 1- or 2-letter codes from `ERATING system'
diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini
index b0a4754d..335d0462 100644
--- a/fanficfare/defaults.ini
+++ b/fanficfare/defaults.ini
@@ -1828,15 +1828,15 @@ extracategories:Lord of the Rings
 extracategories: Mass Effect
 
 ## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency.
-## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
-## With default settings, a several-hours IP ban may follow, so set it lifted.
+## Reports vary from `5 requests per second' to `2 requests per second for more than 10 per minute'.
+## With default settings, a several-hours IP ban may follow, so set it higher.
 slow_down_sleep_time: 2
 
-## Whether to exclude editor signature from the bottom if chapter text.
+## Whether to exclude editor signature from the bottom of chapter text.
 exclude_editor_signature: false
 
-## Stories on the site almost never have cover image.
-## May be adjusted in `personal.ini' on per-story basis.
+## Stories on the site almost never have cover image, and for the stories which do,
+## this may be adjusted in `personal.ini' before downloading.
 never_make_cover: true
 
 ## Titles for ratings identified by 1- or 2-letter codes from `ERATING system'

From 0ec1e8b779fbc74d1786982f199d60e776d8ebbf Mon Sep 17 00:00:00 2001
From: Dmitry Kozliuk <Dmitry.Kozliuk@gmail.com>
Date: Mon, 27 Jul 2015 19:55:49 +0300
Subject: [PATCH 18/18] Switch MassEffect2.in adapter to BeautifulSoup 4.3.

---
 fanficfare/adapters/adapter_masseffect2in.py | 30 +++++++++-----------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py
index b8c924af..e2b77cb3 100644
--- a/fanficfare/adapters/adapter_masseffect2in.py
+++ b/fanficfare/adapters/adapter_masseffect2in.py
@@ -15,12 +15,12 @@
 # limitations under the License.
 #
 
+import bs4
 import datetime
 import logging
 import re
 import urllib2
 
-from .. import BeautifulSoup as bs
 from ..htmlcleanup import removeEntities, stripHTML
 from .. import exceptions as exceptions
 from base_adapter import BaseSiteAdapter, makeDate
@@ -233,7 +233,7 @@ class MassEffect2InAdapter(BaseSiteAdapter):
 
     def _makeChapter(self, url):
         """Creates a chapter object given a URL."""
-        document = self._loadDocument(url)
+        document = self.make_soup(self._fetchUrl(url))
         chapter = Chapter(self._getParsingConfiguration(), url, document)
         return chapter
 
@@ -295,11 +295,6 @@ class MassEffect2InAdapter(BaseSiteAdapter):
         """Make a chapter URL given a document ID."""
         return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId)
 
-    def _loadDocument(self, url):
-        """Fetch URL content and return its element tree with parsing settings tuned for MassEffect2.in."""
-        return bs.BeautifulStoneSoup(
-            self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img'))
-
 
 class Chapter(object):
     """Represents a lazily-parsed chapter of a story."""
@@ -495,7 +490,7 @@ class Chapter(object):
             def processElement(element):
                 """Return textual representation an *inline* element of chapter attribute block."""
                 result = u''
-                if isinstance(element, bs.Tag):
+                if isinstance(element, bs4.Tag):
                     if element.name in ('b', 'strong', 'font', 'br'):
                         result += u"\n"
                     if element.name == 's':
@@ -508,7 +503,7 @@ class Chapter(object):
 
             elements = starter.nextSiblingGenerator()
             for element in elements:
-                if isinstance(element, bs.Tag):
+                if isinstance(element, bs4.Tag):
                     if element == bound:
                         break
                     else:
@@ -521,7 +516,7 @@ class Chapter(object):
 
             elements = starter.nextGenerator()
             for element in elements:
-                if isinstance(element, bs.Tag):
+                if isinstance(element, bs4.Tag):
                     if element == bound:
                         break
                     elif element.name == 'img':
@@ -560,7 +555,7 @@ class Chapter(object):
     def _parseRatingFromImage(self, element):
         """Given an image element, try to parse story rating from it."""
         # Although deprecated, `has_key()' is required here.
-        if not element.has_key('src'):
+        if not element.has_attr('src'):
             return
         source = element['src']
         if 'REITiNG' in source:
@@ -659,19 +654,20 @@ class Chapter(object):
         starter = self._document.find('div', {'id': u'article'})
         if starter is None:
             # FIXME: This will occur if the method is called more than once.
-            # The reason is elements appended to `root' are removed from
-            # the document. BS 4.4 implements cloning via `copy.copy()',
-            # but supporting it for earlier versions is error-prone
-            # (due to relying on BS internals).
+            # The reason is that elements appended to `root' are removed from the document.
+            # BS 4.4 implements cloning via `copy.copy()', but supporting it for BS 4.3
+            # would be error-prone (due to relying on BS internals) and is not needed.
+            if self._textElement:
+                _logger.debug(u"You may not call this function more than once!")
             raise ParsingError(u'Failed to locate text.')
         collection = [starter]
         for element in starter.nextSiblingGenerator():
             if element is None:
                 break
-            if isinstance(element, bs.Tag) and element.name == 'tr':
+            if isinstance(element, bs4.Tag) and element.name == 'tr':
                 break
             collection.append(element)
-        root = bs.Tag(self._document, 'td')
+        root = bs4.Tag(name='td')
         for element in collection:
             root.append(element)