From 707f7a347bfffd80a12e806c255c4b4e24f29dfa Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Tue, 21 Jul 2015 19:59:02 +0300 Subject: [PATCH 01/18] Add rudimentary support for `www.masseffect2.in'. Status: usable, but needs various enhancements and refactoring. Implemented: * Downloading of whole stories given a chapter URL. * Automatic chapter numbering extraction and title generation. * Author identification. * Word and chapter (not chapter parts) counting. * Genre, character, and rating detection (in basic cases). --- fanficfare/adapters/__init__.py | 1 + fanficfare/adapters/adapter_masseffect2in.py | 559 +++++++++++++++++++ fanficfare/defaults.ini | 10 + 3 files changed, 570 insertions(+) create mode 100644 fanficfare/adapters/adapter_masseffect2in.py diff --git a/fanficfare/adapters/__init__.py b/fanficfare/adapters/__init__.py index 606e0c50..d0fd6512 100644 --- a/fanficfare/adapters/__init__.py +++ b/fanficfare/adapters/__init__.py @@ -135,6 +135,7 @@ import adapter_fanfictionjunkiesde import adapter_devianthearts import adapter_tgstorytimecom import adapter_itcouldhappennet +import adapter_masseffect2in ## This bit of complexity allows adapters to be added by just adding ## importing. It eliminates the long if/else clauses we used to need diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py new file mode 100644 index 00000000..23476951 --- /dev/null +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -0,0 +1,559 @@ +# -*- coding: utf-8 -*- + +# Copyright 2011 Fanficdownloader team, +# 2015 FanFicFare team, +# 2015 Dmitry Kozliuk +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import datetime +import logging +import re +import urllib2 +import codecs + +from .. import BeautifulSoup as bs +from ..htmlcleanup import stripHTML +from .. import exceptions as exceptions +from base_adapter import BaseSiteAdapter, makeDate + + +_logger = logging.getLogger(__name__) + + +def getClass(): + """Returns adapter class defined in this module.""" + return MassEffect2InAdapter + + +class ParsingError(Exception): + """Indicates an error while parsing web page content.""" + def __init__(self, message): + Exception.__init__(self) + self.message = message + + +class MassEffect2InAdapter(BaseSiteAdapter): + """Provides support for masseffect2.in site as story source. + Can be used as a template for sites build upon Ucoz.com engine. + Specializations: + 1) Russian content (date format, genre names, etc.); + 2) original `R.A.T.I.N.G.' rating scale, used by masseffect2.in + and some affiliated sites.""" + + WORD_PATTERN = re.compile(u'\w+', re.UNICODE) + + DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+') + + # Various `et cetera' and `et al' forms in Russian texts. + # Intended to be used with whole strings! + ETC_PATTERN = re.compile( + u'''[и&]\s(?: + (?:т\.?\s?[пд]\.?)| + (?:др(?:угие|\.)?)| + (?:пр(?:очие|\.)?)| + # Note: identically looking letters `K' and `o' + # below are from Latin and Cyrillic alphabets. + (?:ко(?:мпания)?|[KК][oо°]) + )$ + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + CHAPTER_NUMBER_PATTERN = re.compile( + u'''[\.:\s]* + (?:глава)? # `Chapter' in Russian. 
+ \s + (?P\d+) + (?: + (?: + # For `X.Y' and `X-Y' numbering styles: + [\-\.]| + # For `Chapter X (part Y)' and similar numbering styles: + [\.,]?\s + (?P\()? + (?:часть)? # `Part' in Russian. + \s + ) + (?P\d+) + (?(brace)\)) + )? + [\.:\s]* + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + PROLOGUE_EPILOGUE_PATTERN = re.compile( + u'''[\.:\s]* # Optional separators. + (пролог|эпилог) # `Prologue' or `epilogue' in Russian. + [\.:\s]* # Optional separators. + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + def __init__(self, config, url): + BaseSiteAdapter.__init__(self, config, url) + + self.decode = ["utf8"] + self.dateformat = "%d.%m.%Y" + + self.story.setMetadata('siteabbrev', 'me2in') + + self.story.setMetadata('storyId', self._extractDocumentId(self.url)) + + self._setURL(self._makeUrl(self.story.getMetadata('storyId'))) + + self._transient_metadata = {} + + # Memory cache of document HTML parsing results. Increases performance + # drastically, because all downloaded pages are parsed at least twice. + # FIXME: Can be simplified when BS is updated to 4.4 with cloning. + self._parsing_cache = {} + + @classmethod + def _makeUrl(cls, chapterId): + """Makes a chapter URL given a chapter ID.""" + return 'http://%s/publ/%s' % (cls.getSiteDomain(), chapterId) + + # Must be @staticmethod, not @classmethod! + @staticmethod + def getSiteDomain(): + return 'www.masseffect2.in' + + @classmethod + def getSiteExampleURLs(cls): + return u' '.join([cls._makeUrl('19-1-0-1234'), + cls._makeUrl('24-1-0-4321')]) + + def getSiteURLPattern(self): + return re.escape(self._makeUrl('')) + self.DOCUMENT_ID_PATTERN.pattern + + def use_pagecache(self): + """Allows use of downloaded page cache. It is essential for this + adapter, because the site does not offers chapter URL list, and many + pages have to be fetched and parsed repeatedly.""" + return True + + def extractChapterUrlsAndMetadata(self): + """Extracts chapter URLs and story metadata. 
Actually downloads all + chapters, which is not exactly right, but necessary due to technical + limitations of the site.""" + + def followLinks(document, selector): + """Downloads chapters one by one by locating and following links + specified by a selector. Returns chapters' URLs in order they + were found.""" + block = document\ + .find('td', {'class': 'eDetails1'})\ + .find('div', selector) + if not block: + return + link = block.find('a') + if not link: + return + chapterId = self._extractDocumentId(link['href']) + url = self._makeUrl(chapterId) + try: + chapter = self._loadDocument(url) + except urllib2.HTTPError, error: + if error.code == 404: + raise exceptions.FailedToDownload( + u'Error downloading chapter: %s!' % url) + raise + yield url + for url in followLinks(chapter, selector): + yield url + + def followPreviousLinks(document): + """Downloads chapters following `Previous chapter' links. + Returns a list of chapters' URLs.""" + urls = list(followLinks(document, {'class': 'fl tal'})) + return list(reversed(urls)) + + def followNextLinks(document): + """Downloads chapters following `Next chapter' links. + Returns a list of chapters' URLs.""" + return list(followLinks(document, {'class': 'tar fr'})) + + try: + document = self._loadDocument(self.url) + except urllib2.HTTPError, error: + if error.code == 404: + raise exceptions.StoryDoesNotExist(self.url) + raise + # There is no convenient mechanism to obtain URLs of all chapters + # other than navigating to previous and next chapters using links + # located on each chapter page. + chapters = \ + followPreviousLinks(document) + \ + [self.url] + \ + followNextLinks(document) + + # Transient metadata is updated when parsing each chapter, + # then converted and saved to story metadata. + self._transient_metadata = { + # We only have one date for each chapter and assume the oldest one + # to be publication date and the most recent one to be update date. 
+ 'datePublished': datetime.datetime.max, + 'dateUpdated': datetime.datetime.min, + + 'numWords': 0, + + # We aim at counting chapters, not chapter parts. + 'numChapters': 0 + } + + for url in chapters: + chapter = self._loadDocument(url) + _logger.debug(u"Parsing chapter `%s'", url) + self._parseChapterMetadata(url, chapter) + + # Attributes are handled separately due to format conversions. + self.story.setMetadata( + 'datePublished', self._transient_metadata['datePublished']) + self.story.setMetadata( + 'dateUpdated', self._transient_metadata['dateUpdated']) + self.story.setMetadata( + 'numWords', str(self._transient_metadata['numWords'])) + self.story.setMetadata( + 'numChapters', self._transient_metadata['numChapters']) + + def getChapterText(self, url): + """Grabs the text for an individual chapter.""" + element = self._getChapterTextElement(url) + return self.utf8FromSoup(url, element) + + def _parseChapterMetadata(self, url, document): + try: + self._parseTitle(url, document) + infoBar = document.find('td', {'class': 'eDetails2'}) + if not infoBar: + raise ParsingError(u'No informational bar found.') + if not self.story.getMetadata('authorId'): + self._parseAuthor(infoBar) + self._parseDates(infoBar) + self._parseTextForWordCount(url) + self._parseAttributes(document) + except ParsingError, error: + raise exceptions.FailedToDownload( + u"Error parsing `%s'. %s" % (url, error.message)) + + def _parseAttributes(self, document): + try: + elements = document \ + .find('div', {'class': 'comm-div'}) \ + .findNextSibling('div', {'class': 'cb'}) \ + .nextGenerator() + attributesText = u'' + for element in elements: + if not element: + _logger.warning(u'Attribute block not terminated!') + break + if isinstance(element, bs.Tag): + # Although deprecated, `has_key()' is required here. 
+ if element.name == 'div' and \ + element.has_key('class') and \ + element['class'] == 'cb': + break + elif element.name == 'img': + self._parseRatingFromImage(element) + else: + attributesText += stripHTML(element) + except AttributeError or TypeError: + raise ParsingError(u'Failed to locate and collect attributes.') + + for record in re.split(u';|\.', attributesText): + parts = record.split(u':', 1) + if len(parts) < 2: + continue + key = parts[0].strip().lower() + value = parts[1].strip().strip(u'.') + self._parseAttribute(key, value) + + def _parseRatingFromImage(self, element): + """Given an image element, tries to parse story rating from it.""" + # FIXME: This should probably be made adjustable via settings. + ratings = { + 'E': u'Exempt (18+)', + 'R': u'Restricted (16+)', + 'A': u'Иная история', + 'T': u'To every', + 'I': u'Art house', + 'Nn': u'Новый мир', + 'G': u'О, господи!', + } + ratings['IN'] = ratings['A'] + + # Although deprecated, `has_key()' is required here. + if not element.has_key('src'): + return + source = element['src'] + if 'REITiNG' not in source: + return + match = re.search(u'/(?P[ERATINnG]+)\.png$', source) + if not match: + return + symbol = match.group('rating') + if symbol == 'IN': + symbol = 'A' + if symbol in ratings: + rating = ratings[symbol] + self.story.setMetadata('rating', rating) + if symbol in ('R', 'E'): + self.is_adult = True + + def _parseAttribute(self, key, value): + """Parses a single known attribute value for chapter metadata.""" + + def refineCharacter(name): + """Refines character name from stop-words and distortions.""" + strippedName = name.strip() + nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName) + # TODO: extract canonical name (even ME-specific?). 
+ canonicalName = nameOnly + return canonicalName + + if key == u'жанр': + definitions = value.split(u',') + if len(definitions) > 4: + _logger.warning(u'Possibly incorrect genre detection!') + for definition in definitions: + genres = definition.split(u'/') + self.story.extendList('genre', genres) + elif key == u'статус': + status = 'In-Progress' if value == u'в процессе' else 'Completed' + self.story.setMetadata('status', status) + elif key == u'персонажи': + characters = [refineCharacter(name) for name in value.split(u',')] + self.story.extendList('characters', characters) + else: + _logger.debug(u"Unrecognized attribute `%s'.", key) + + def _parseTextForWordCount(self, url): + element = self._getChapterTextElement(url) + text = stripHTML(element) + count = len(re.findall(self.WORD_PATTERN, text)) + self._transient_metadata['numWords'] += count + pass + + def _parseDates(self, infoBar): + try: + dateText = infoBar \ + .find('i', {'class': 'icon-eye'}) \ + .findPreviousSibling(text=True) \ + .strip(u'| \n') + except AttributeError: + raise ParsingError(u'Failed to locate date.') + date = makeDate(dateText, self.dateformat) + if date > self._transient_metadata['dateUpdated']: + self._transient_metadata['dateUpdated'] = date + if date < self._transient_metadata['datePublished']: + self._transient_metadata['datePublished'] = date + + def _parseAuthor(self, strip): + try: + authorLink = strip \ + .find('i', {'class': 'icon-user'}) \ + .findNextSibling('a') + except AttributeError: + raise ParsingError(u'Failed to locate author link.') + match = re.search(u'(8-\d+)', authorLink['onclick']) + if not match: + raise ParsingError(u'Failed to extract author ID.') + authorId = match.group(0) + authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) + authorName = stripHTML(authorLink.text) + self.story.setMetadata('authorId', authorId) + self.story.setMetadata('authorUrl', authorUrl) + self.story.setMetadata('author', authorName) + + def _parseTitle(self, url, 
document): + try: + fullTitle = stripHTML( + document.find('div', {'class': 'eTitle'}).string) + except AttributeError: + raise ParsingError(u'Failed to locate title.') + parsedHeading = self._parseHeading(fullTitle) + if not self.story.getMetadata('title'): + self.story.setMetadata('title', parsedHeading['storyTitle']) + if 'chapterIndex' in parsedHeading: + self._transient_metadata['numChapters'] = max( + self._transient_metadata['numChapters'], + parsedHeading['chapterIndex']) + else: + self._transient_metadata['numChapters'] += 1 + self.chapterUrls.append((parsedHeading['chapterTitle'], url)) + + def _parseHeading(self, fullTitle): + """Extracts meaningful parts from full chapter heading with. + Returns a dictionary containing `storyTitle', `chapterTitle' + (including numbering if allowed by settings, may be the same as + `storyTitle' for short stories), `chapterIndex' (optional, may be + zero), and `partIndex' (optional, chapter part, may be zero). + When no dedicated chapter title is present, generates one based on + chapter and part indices. Correctly handles `prologue' and `epilogue' + cases.""" + match = re.search(self.CHAPTER_NUMBER_PATTERN, fullTitle) + if match: + chapterIndex = int(match.group('chapterIndex')) + # There are cases with zero chapter or part number (e. g.: + # numbered prologue, not to be confused with just `Prologue'). + if match.group('partIndex'): + partIndex = int(match.group('partIndex')) + else: + partIndex = None + chapterTitle = fullTitle[match.end():].strip() + if chapterTitle: + if self.getConfig('strip_chapter_numbers', False) \ + and not self.getConfig('add_chapter_numbers', False): + if partIndex is not None: + title = u'%d.%d %s' % \ + (chapterIndex, partIndex, chapterTitle) + else: + title = u'%d. 
%s' % (chapterIndex, chapterTitle) + else: + title = chapterTitle + else: + title = u'Глава %d' % chapterIndex + if partIndex: + title += u' (часть %d)' % partIndex + + # For seldom found cases like `Story: prologue and chapter 1'. + storyTitle = fullTitle[:match.start()] + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle) + if match: + matches = list( + re.finditer(u'[:\.]', storyTitle)) + if matches: + realStoryTitleEnd = matches[-1].start() + if realStoryTitleEnd >= 0: + storyTitle = storyTitle[:realStoryTitleEnd] + else: + _logger.warning( + u"Title contains `%s', suspected to be part of " + u"numbering, but no period (`.') before it. " + u"Full title is preserved." % storyTitle) + + result = { + 'storyTitle': storyTitle, + 'chapterTitle': title, + 'chapterIndex': chapterIndex + } + if partIndex is not None: + result['partIndex'] = partIndex + return result + + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, fullTitle) + if match: + storyTitle = fullTitle[:match.start()] + chapterTitle = fullTitle[match.end():].strip() + matchedText = fullTitle[match.start():match.end()] + if chapterTitle: + title = u'%s. 
%s' % (matchedText, chapterTitle) + else: + title = matchedText + return { + 'storyTitle': storyTitle, + 'chapterTitle': title + } + + return { + 'storyTitle': fullTitle, + 'chapterTitle': fullTitle + } + + def _loadDocument(self, url): + """Fetches URL content and returns its element tree + with parsing settings tuned for MassEffect2.in.""" + documentId = self._extractDocumentId(url) + if documentId in self._parsing_cache: + _logger.debug(u"Memory cache HIT for parsed `%s'", url) + return self._parsing_cache[documentId]['document'] + else: + _logger.debug(u"Memory cache MISS for parsed `%s'", url) + document = bs.BeautifulStoneSoup( + self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) + self._parsing_cache[documentId] = {'document': document} + return document + + def _fetchUrl(self, url, + parameters=None, + usecache=True, + extrasleep=None): + """Fetches URL contents, see BaseSiteAdapter for details. + Overridden to support on-disk cache when debugging Calibre.""" + from calibre.constants import DEBUG + if DEBUG: + import os + documentId = self._extractDocumentId(url) + path = u'./cache/%s' % documentId + if os.path.isfile(path) and os.access(path, os.R_OK): + _logger.debug(u"On-disk cache HIT for `%s'.", url) + with codecs.open(path, encoding='utf-8') as input: + return input.read() + else: + _logger.debug(u"On-disk cache MISS for `%s'.", url) + + content = BaseSiteAdapter._fetchUrl( + self, url, parameters, usecache, extrasleep) + + if DEBUG: + import os + if os.path.isdir(os.path.dirname(path)): + _logger.debug(u"Caching `%s' content on disk.", url) + with codecs.open(path, mode='w', encoding='utf-8') as output: + output.write(content) + + return content + + def _extractDocumentId(self, url): + """Extracts document ID from MassEffect2.in URL.""" + match = re.search(self.DOCUMENT_ID_PATTERN, url) + if not match: + raise ValueError(u"Failed to extract document ID from `'" % url) + documentId = url[match.start():match.end()] + return documentId + + def 
_getChapterTextElement(self, url): + """Fetches URL content and extracts an element containing text body. + Shall be used instead of `__collectTextElements'.""" + documentId = self._extractDocumentId(url) + document = self._loadDocument(url) + cache = self._parsing_cache[documentId] + if 'body' in cache: + return cache['body'] + else: + body = self.__collectTextElements(document) + cache['body'] = body + return body + + def __collectTextElements(self, document): + """Returns all elements containing parts of chapter text (which may be +

<p>aragraphs, <div>
isions or plain text nodes) under a single root.""" + starter = document.find('div', {'id': u'article'}) + if starter is None: + # FIXME: This will occur if the method is called more than once. + # The reason is elements appended to `root' are removed from + # the document. BS 4.4 implements cloning via `copy.copy()', + # but supporting it for earlier versions is error-prone + # (due to relying on BS internals). + raise ParsingError(u'Failed to locate text.') + collection = [starter] + for element in starter.nextSiblingGenerator(): + if element is None: + break + if isinstance(element, bs.Tag) and element.name == 'tr': + break + collection.append(element) + root = bs.Tag(document, 'td') + for element in collection: + root.append(element) + return root diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 0663c12e..fe753705 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1727,6 +1727,16 @@ extraships:InuYasha/Kagome ## Site dedicated to these categories/characters/ships extracategories:Lord of the Rings +[www.masseffect2.in] +## Site with stories in this language. +language:Russian +## Site dedicated to this fandom. +extracategories:Mass Effect + +## Stories on the site almost never have cover image. +## May be adjusted in `personal.ini' on per-story basis. +cover_exclusion_regexp:.* + [www.mediaminer.org] [www.midnightwhispers.ca] From aa93d4bb2df983b4268147a3d3f63b4cf0cc1f34 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Wed, 22 Jul 2015 14:31:06 +0300 Subject: [PATCH 02/18] Set site-specific language and category correctly. 
--- calibre-plugin/plugin-defaults.ini | 11 +++++++++++ fanficfare/adapters/adapter_masseffect2in.py | 5 +++++ fanficfare/defaults.ini | 5 +++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index da2f0782..94cd796e 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1132,6 +1132,17 @@ extracategories:Lord of the Rings ## Site dedicated to these categories/characters/ships extracategories:Harry Potter +[www.masseffect2.in] +## Site dedicated to this fandom. +extracategories: Mass Effect + +## Stories on the site almost never have cover image. +## May be adjusted in `personal.ini' on per-story basis. +cover_exclusion_regexp:.* + +my_custom_label: Some text +my_custom_setting: true + [merlinfic.dtwins.co.uk] ## Some sites require login (or login for some rated stories) The ## program can prompt you, or you can save it in config. In diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 23476951..80d5446a 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -99,6 +99,8 @@ class MassEffect2InAdapter(BaseSiteAdapter): ''', re.IGNORECASE + re.UNICODE + re.VERBOSE) + SITE_LANGUAGE = u'Russian' + def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) @@ -225,6 +227,9 @@ class MassEffect2InAdapter(BaseSiteAdapter): 'numWords', str(self._transient_metadata['numWords'])) self.story.setMetadata( 'numChapters', self._transient_metadata['numChapters']) + # Site-specific metadata. 
+ self.story.setMetadata( + 'language', self.SITE_LANGUAGE) def getChapterText(self, url): """Grabs the text for an individual chapter.""" diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index fe753705..231b154d 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1728,8 +1728,6 @@ extraships:InuYasha/Kagome extracategories:Lord of the Rings [www.masseffect2.in] -## Site with stories in this language. -language:Russian ## Site dedicated to this fandom. extracategories:Mass Effect @@ -1737,6 +1735,9 @@ extracategories:Mass Effect ## May be adjusted in `personal.ini' on per-story basis. cover_exclusion_regexp:.* +my_custom_label:Some text +my_custom_setting:true + [www.mediaminer.org] [www.midnightwhispers.ca] From 6a13323c9294d0b99da9529f725fd29b5341c610 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Wed, 22 Jul 2015 19:33:28 +0300 Subject: [PATCH 03/18] Improved configuration per JimmXinu's suggestion. https://github.com/PlushBeaver/FanFicFare/commit/707f7a347bfffd80a12e806c255c4b4e24f29dfa#commitcomment-12298782 --- calibre-plugin/plugin-defaults.ini | 2 +- fanficfare/defaults.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index a545e982..1ba2900a 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1228,7 +1228,7 @@ extracategories: Mass Effect ## Stories on the site almost never have cover image. ## May be adjusted in `personal.ini' on per-story basis. -cover_exclusion_regexp:.* +never_make_cover: true my_custom_label: Some text my_custom_setting: true diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 765d0e3d..e35ad4fd 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1829,7 +1829,7 @@ extracategories:Mass Effect ## Stories on the site almost never have cover image. ## May be adjusted in `personal.ini' on per-story basis. 
-cover_exclusion_regexp:.* +never_make_cover: true my_custom_label:Some text my_custom_setting:true From 79b56c872f1ab21430cd9e5a46dcf6c263e43c8a Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Thu, 23 Jul 2015 02:25:43 +0300 Subject: [PATCH 04/18] Refactor MassEffect2.in adapter and improve it. Refactoring: * New `Chapter' class extracted to separate HTML parsing from adapter output preparation and story-wide metadata collection. * Lazy-parsing and fragments caching is done at Chapter level. Improvements: * Rating "adultness" and label-to-title mapping made configurable. * Fix chapter number detection when title contains large numbers (ex.: http://www.masseffect2.in/publ/19-1-0-2934). * Add mechanism for detecting series-like stories with no chapter numbering and extracting correct titles. * Fix number format for generated chapter titles. --- calibre-plugin/plugin-defaults.ini | 7 +- fanficfare/adapters/adapter_masseffect2in.py | 957 +++++++++++-------- fanficfare/configurable.py | 2 + fanficfare/defaults.ini | 9 +- 4 files changed, 577 insertions(+), 398 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index 1ba2900a..ac27065e 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1230,8 +1230,11 @@ extracategories: Mass Effect ## May be adjusted in `personal.ini' on per-story basis. never_make_cover: true -my_custom_label: Some text -my_custom_setting: true +## Titles for ratings identified by 1- or 2-letter codes from `ERATING system' +## (`система Р.Е.Й.Т.И.Н.Г.'). MassEffect2.in and some other sites adopted it, +## but changed titles and update them occasionally. +rating_titles: R=RESTRICTED (16+), E=EXEMPT (18+), I=ART HOUSE, T=To every, A=IN=Иной мир, Nn=Новый мир, G=О\, Господи! 
+adult_ratings: E,R [merlinfic.dtwins.co.uk] ## Some sites require login (or login for some rated stories) The diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 80d5446a..815ae191 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -43,6 +43,9 @@ class ParsingError(Exception): Exception.__init__(self) self.message = message + def __str__(self): + return self.message + class MassEffect2InAdapter(BaseSiteAdapter): """Provides support for masseffect2.in site as story source. @@ -53,77 +56,21 @@ class MassEffect2InAdapter(BaseSiteAdapter): and some affiliated sites.""" WORD_PATTERN = re.compile(u'\w+', re.UNICODE) - DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+') - - # Various `et cetera' and `et al' forms in Russian texts. - # Intended to be used with whole strings! - ETC_PATTERN = re.compile( - u'''[и&]\s(?: - (?:т\.?\s?[пд]\.?)| - (?:др(?:угие|\.)?)| - (?:пр(?:очие|\.)?)| - # Note: identically looking letters `K' and `o' - # below are from Latin and Cyrillic alphabets. - (?:ко(?:мпания)?|[KК][oо°]) - )$ - ''', - re.IGNORECASE + re.UNICODE + re.VERBOSE) - - CHAPTER_NUMBER_PATTERN = re.compile( - u'''[\.:\s]* - (?:глава)? # `Chapter' in Russian. - \s - (?P\d+) - (?: - (?: - # For `X.Y' and `X-Y' numbering styles: - [\-\.]| - # For `Chapter X (part Y)' and similar numbering styles: - [\.,]?\s - (?P\()? - (?:часть)? # `Part' in Russian. - \s - ) - (?P\d+) - (?(brace)\)) - )? - [\.:\s]* - ''', - re.IGNORECASE + re.UNICODE + re.VERBOSE) - - PROLOGUE_EPILOGUE_PATTERN = re.compile( - u'''[\.:\s]* # Optional separators. - (пролог|эпилог) # `Prologue' or `epilogue' in Russian. - [\.:\s]* # Optional separators. 
- ''', - re.IGNORECASE + re.UNICODE + re.VERBOSE) - SITE_LANGUAGE = u'Russian' def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.decode = ["utf8"] - self.dateformat = "%d.%m.%Y" self.story.setMetadata('siteabbrev', 'me2in') + self.story.setMetadata('storyId', self._getDocumentId(self.url)) - self.story.setMetadata('storyId', self._extractDocumentId(self.url)) + self._setURL(self._makeDocumentUrl(self.story.getMetadata('storyId'))) - self._setURL(self._makeUrl(self.story.getMetadata('storyId'))) - - self._transient_metadata = {} - - # Memory cache of document HTML parsing results. Increases performance - # drastically, because all downloaded pages are parsed at least twice. - # FIXME: Can be simplified when BS is updated to 4.4 with cloning. - self._parsing_cache = {} - - @classmethod - def _makeUrl(cls, chapterId): - """Makes a chapter URL given a chapter ID.""" - return 'http://%s/publ/%s' % (cls.getSiteDomain(), chapterId) + self._chapters = {} + self._parsingConfiguration = None # Must be @staticmethod, not @classmethod! @staticmethod @@ -132,11 +79,11 @@ class MassEffect2InAdapter(BaseSiteAdapter): @classmethod def getSiteExampleURLs(cls): - return u' '.join([cls._makeUrl('19-1-0-1234'), - cls._makeUrl('24-1-0-4321')]) + return u' '.join([cls._makeDocumentUrl('19-1-0-1234'), + cls._makeDocumentUrl('24-1-0-4321')]) def getSiteURLPattern(self): - return re.escape(self._makeUrl('')) + self.DOCUMENT_ID_PATTERN.pattern + return re.escape(self._makeDocumentUrl('')) + self.DOCUMENT_ID_PATTERN.pattern def use_pagecache(self): """Allows use of downloaded page cache. It is essential for this @@ -149,345 +96,197 @@ class MassEffect2InAdapter(BaseSiteAdapter): chapters, which is not exactly right, but necessary due to technical limitations of the site.""" - def followLinks(document, selector): - """Downloads chapters one by one by locating and following links - specified by a selector. 
Returns chapters' URLs in order they - were found.""" - block = document\ - .find('td', {'class': 'eDetails1'})\ - .find('div', selector) - if not block: - return - link = block.find('a') - if not link: - return - chapterId = self._extractDocumentId(link['href']) - url = self._makeUrl(chapterId) - try: - chapter = self._loadDocument(url) - except urllib2.HTTPError, error: - if error.code == 404: - raise exceptions.FailedToDownload( - u'Error downloading chapter: %s!' % url) - raise - yield url - for url in followLinks(chapter, selector): - yield url - - def followPreviousLinks(document): - """Downloads chapters following `Previous chapter' links. - Returns a list of chapters' URLs.""" - urls = list(followLinks(document, {'class': 'fl tal'})) - return list(reversed(urls)) - - def followNextLinks(document): - """Downloads chapters following `Next chapter' links. - Returns a list of chapters' URLs.""" - return list(followLinks(document, {'class': 'tar fr'})) + def followChapters(starting, forward=True): + if forward: + url = starting.getNextChapterUrl() + else: + url = starting.getPreviousChapterUrl() + if url: + url = self._makeDocumentUrl(self._getDocumentId(url)) + following = self._makeChapter(url) + if forward: + yield following + for chapter in followChapters(following, forward): + yield chapter + if not forward: + yield following try: - document = self._loadDocument(self.url) + startingChapter = self._makeChapter(self.url) except urllib2.HTTPError, error: if error.code == 404: raise exceptions.StoryDoesNotExist(self.url) raise - # There is no convenient mechanism to obtain URLs of all chapters - # other than navigating to previous and next chapters using links - # located on each chapter page. 
+ + try: + self.story.setMetadata('title', startingChapter.getStoryTitle()) + self.story.setMetadata('author', startingChapter.getAuthorName()) + authorId = startingChapter.getAuthorId() + authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) + self.story.setMetadata('authorId', authorId) + self.story.setMetadata('authorUrl', authorUrl) + self.story.setMetadata('rating', startingChapter.getRatingTitle()) + except ParsingError, error: + raise exceptions.FailedToDownload( + u"Failed to parse story metadata for `%s': %s" % (self.url, error)) + + # We only have one date for each chapter and assume the oldest one + # to be publication date and the most recent one to be update date. + datePublished = datetime.datetime.max + dateUpdated = datetime.datetime.min + wordCount = 0 + # We aim at counting chapters, not chapter parts. + chapterCount = 0 + storyInProgress = False + chapters = \ - followPreviousLinks(document) + \ - [self.url] + \ - followNextLinks(document) + list(followChapters(startingChapter, forward=False)) + \ + [startingChapter] + \ + list(followChapters(startingChapter, forward=True)) - # Transient metadata is updated when parsing each chapter, - # then converted and saved to story metadata. - self._transient_metadata = { - # We only have one date for each chapter and assume the oldest one - # to be publication date and the most recent one to be update date. - 'datePublished': datetime.datetime.max, - 'dateUpdated': datetime.datetime.min, + try: + for chapter in chapters: + url = chapter.getUrl() + self._chapters[url] = chapter + _logger.debug(u"Processing chapter `%s'.", url) - 'numWords': 0, + datePublished = min(datePublished, chapter.getDate()) + dateUpdated = max(dateUpdated, chapter.getDate()) - # We aim at counting chapters, not chapter parts. 
- 'numChapters': 0 - } + self.story.extendList('genre', chapter.getGenres()) + self.story.extendList('characters', chapter.getCharacters()) - for url in chapters: - chapter = self._loadDocument(url) - _logger.debug(u"Parsing chapter `%s'", url) - self._parseChapterMetadata(url, chapter) + wordCount += self._getWordCount(chapter.getTextElement()) - # Attributes are handled separately due to format conversions. + index = chapter.getIndex() + if index: + chapterCount = max(chapterCount, index) + else: + chapterCount += 1 + + # Story is in progress if any chapter is in progress. + # Some chapters may have no status attribute. + chapterInProgress = chapter.isInProgress() + if chapterInProgress is not None: + storyInProgress |= chapterInProgress + + # If any chapter is adult, consider the whole story adult. + if chapter.isRatingAdult(): + self.story.setMetadata('is_adult', True) + + titles = [chapter.getTitle() for chapter in chapters] + hasNumbering = any([chapter.getIndex() is not None for chapter in chapters]) + if not hasNumbering: + # There are stories without chapter numbering, but under single title, + # which is heading prefix (such stories are not series). We identify + # common prefix for all chapters and use it as story title, trimming + # chapter titles the length of this prefix. + largestCommonPrefix = _getLargestCommonPrefix(*titles) + prefixLength = len(largestCommonPrefix) + storyTitle = re.sub(u'[:\.\s]*$', u'', largestCommonPrefix, re.UNICODE) + self.story.setMetadata('title', storyTitle) + for chapter in chapters: + self.chapterUrls.append( + (chapter.getTitle()[prefixLength:], chapter.getUrl())) + else: + # Simple processing for common cases. + for chapter in chapters: + self.chapterUrls.append( + (chapter.getTitle(), chapter.getUrl())) + + except ParsingError, error: + raise exceptions.FailedToDownload( + u"Failed to download chapter `%s': %s" % (url, error)) + + # Some metadata are handled separately due to format conversions. 
self.story.setMetadata( - 'datePublished', self._transient_metadata['datePublished']) - self.story.setMetadata( - 'dateUpdated', self._transient_metadata['dateUpdated']) - self.story.setMetadata( - 'numWords', str(self._transient_metadata['numWords'])) - self.story.setMetadata( - 'numChapters', self._transient_metadata['numChapters']) + 'status', 'In Progress' if storyInProgress else 'Completed') + self.story.setMetadata('datePublished', datePublished) + self.story.setMetadata('dateUpdated', dateUpdated) + self.story.setMetadata('numWords', str(wordCount)) + self.story.setMetadata('numChapters', chapterCount) + # Site-specific metadata. - self.story.setMetadata( - 'language', self.SITE_LANGUAGE) + self.story.setMetadata('language', self.SITE_LANGUAGE) def getChapterText(self, url): """Grabs the text for an individual chapter.""" - element = self._getChapterTextElement(url) - return self.utf8FromSoup(url, element) + if url not in self._chapters: + raise exceptions.FailedToDownload(u"No chapter `%s' present!" % url) + chapter = self._chapters[url] + return self.utf8FromSoup(url, chapter.getTextElement()) - def _parseChapterMetadata(self, url, document): - try: - self._parseTitle(url, document) - infoBar = document.find('td', {'class': 'eDetails2'}) - if not infoBar: - raise ParsingError(u'No informational bar found.') - if not self.story.getMetadata('authorId'): - self._parseAuthor(infoBar) - self._parseDates(infoBar) - self._parseTextForWordCount(url) - self._parseAttributes(document) - except ParsingError, error: - raise exceptions.FailedToDownload( - u"Error parsing `%s'. 
%s" % (url, error.message)) + def _makeChapter(self, url): + """Creates a chapter object given a URL.""" + document = self._loadDocument(url) + chapter = Chapter(self._getParsingConfiguration(), url, document) + return chapter - def _parseAttributes(self, document): - try: - elements = document \ - .find('div', {'class': 'comm-div'}) \ - .findNextSibling('div', {'class': 'cb'}) \ - .nextGenerator() - attributesText = u'' - for element in elements: - if not element: - _logger.warning(u'Attribute block not terminated!') - break - if isinstance(element, bs.Tag): - # Although deprecated, `has_key()' is required here. - if element.name == 'div' and \ - element.has_key('class') and \ - element['class'] == 'cb': - break - elif element.name == 'img': - self._parseRatingFromImage(element) - else: - attributesText += stripHTML(element) - except AttributeError or TypeError: - raise ParsingError(u'Failed to locate and collect attributes.') - - for record in re.split(u';|\.', attributesText): - parts = record.split(u':', 1) - if len(parts) < 2: - continue - key = parts[0].strip().lower() - value = parts[1].strip().strip(u'.') - self._parseAttribute(key, value) - - def _parseRatingFromImage(self, element): - """Given an image element, tries to parse story rating from it.""" - # FIXME: This should probably be made adjustable via settings. - ratings = { - 'E': u'Exempt (18+)', - 'R': u'Restricted (16+)', - 'A': u'Иная история', - 'T': u'To every', - 'I': u'Art house', - 'Nn': u'Новый мир', - 'G': u'О, господи!', - } - ratings['IN'] = ratings['A'] - - # Although deprecated, `has_key()' is required here. 
- if not element.has_key('src'): - return - source = element['src'] - if 'REITiNG' not in source: - return - match = re.search(u'/(?P[ERATINnG]+)\.png$', source) - if not match: - return - symbol = match.group('rating') - if symbol == 'IN': - symbol = 'A' - if symbol in ratings: - rating = ratings[symbol] - self.story.setMetadata('rating', rating) - if symbol in ('R', 'E'): - self.is_adult = True - - def _parseAttribute(self, key, value): - """Parses a single known attribute value for chapter metadata.""" - - def refineCharacter(name): - """Refines character name from stop-words and distortions.""" - strippedName = name.strip() - nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName) - # TODO: extract canonical name (even ME-specific?). - canonicalName = nameOnly - return canonicalName - - if key == u'жанр': - definitions = value.split(u',') - if len(definitions) > 4: - _logger.warning(u'Possibly incorrect genre detection!') - for definition in definitions: - genres = definition.split(u'/') - self.story.extendList('genre', genres) - elif key == u'статус': - status = 'In-Progress' if value == u'в процессе' else 'Completed' - self.story.setMetadata('status', status) - elif key == u'персонажи': - characters = [refineCharacter(name) for name in value.split(u',')] - self.story.extendList('characters', characters) - else: - _logger.debug(u"Unrecognized attribute `%s'.", key) - - def _parseTextForWordCount(self, url): - element = self._getChapterTextElement(url) + def _getWordCount(self, element): + """Returns word count in plain text extracted from chapter body.""" text = stripHTML(element) count = len(re.findall(self.WORD_PATTERN, text)) - self._transient_metadata['numWords'] += count - pass + return count - def _parseDates(self, infoBar): - try: - dateText = infoBar \ - .find('i', {'class': 'icon-eye'}) \ - .findPreviousSibling(text=True) \ - .strip(u'| \n') - except AttributeError: - raise ParsingError(u'Failed to locate date.') - date = makeDate(dateText, 
self.dateformat) - if date > self._transient_metadata['dateUpdated']: - self._transient_metadata['dateUpdated'] = date - if date < self._transient_metadata['datePublished']: - self._transient_metadata['datePublished'] = date + def _getParsingConfiguration(self): + if not self._parsingConfiguration: + self._parsingConfiguration = {} - def _parseAuthor(self, strip): - try: - authorLink = strip \ - .find('i', {'class': 'icon-user'}) \ - .findNextSibling('a') - except AttributeError: - raise ParsingError(u'Failed to locate author link.') - match = re.search(u'(8-\d+)', authorLink['onclick']) - if not match: - raise ParsingError(u'Failed to extract author ID.') - authorId = match.group(0) - authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) - authorName = stripHTML(authorLink.text) - self.story.setMetadata('authorId', authorId) - self.story.setMetadata('authorUrl', authorUrl) - self.story.setMetadata('author', authorName) + adultRatings = self.getConfigList('adult_ratings') + if not adultRatings: + raise exceptions.PersonalIniFailed( + u"Missing `adult_ratings' setting", u"MassEffect2.in", u"?") + adultRatings = set(adultRatings) + self._parsingConfiguration['adultRatings'] = adultRatings - def _parseTitle(self, url, document): - try: - fullTitle = stripHTML( - document.find('div', {'class': 'eTitle'}).string) - except AttributeError: - raise ParsingError(u'Failed to locate title.') - parsedHeading = self._parseHeading(fullTitle) - if not self.story.getMetadata('title'): - self.story.setMetadata('title', parsedHeading['storyTitle']) - if 'chapterIndex' in parsedHeading: - self._transient_metadata['numChapters'] = max( - self._transient_metadata['numChapters'], - parsedHeading['chapterIndex']) - else: - self._transient_metadata['numChapters'] += 1 - self.chapterUrls.append((parsedHeading['chapterTitle'], url)) - - def _parseHeading(self, fullTitle): - """Extracts meaningful parts from full chapter heading with. 
- Returns a dictionary containing `storyTitle', `chapterTitle' - (including numbering if allowed by settings, may be the same as - `storyTitle' for short stories), `chapterIndex' (optional, may be - zero), and `partIndex' (optional, chapter part, may be zero). - When no dedicated chapter title is present, generates one based on - chapter and part indices. Correctly handles `prologue' and `epilogue' - cases.""" - match = re.search(self.CHAPTER_NUMBER_PATTERN, fullTitle) - if match: - chapterIndex = int(match.group('chapterIndex')) - # There are cases with zero chapter or part number (e. g.: - # numbered prologue, not to be confused with just `Prologue'). - if match.group('partIndex'): - partIndex = int(match.group('partIndex')) - else: - partIndex = None - chapterTitle = fullTitle[match.end():].strip() - if chapterTitle: - if self.getConfig('strip_chapter_numbers', False) \ - and not self.getConfig('add_chapter_numbers', False): - if partIndex is not None: - title = u'%d.%d %s' % \ - (chapterIndex, partIndex, chapterTitle) - else: - title = u'%d. %s' % (chapterIndex, chapterTitle) - else: - title = chapterTitle - else: - title = u'Глава %d' % chapterIndex - if partIndex: - title += u' (часть %d)' % partIndex - - # For seldom found cases like `Story: prologue and chapter 1'. - storyTitle = fullTitle[:match.start()] - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle) - if match: - matches = list( - re.finditer(u'[:\.]', storyTitle)) - if matches: - realStoryTitleEnd = matches[-1].start() - if realStoryTitleEnd >= 0: - storyTitle = storyTitle[:realStoryTitleEnd] - else: + ratingTitleDescriptions = self.getConfigList('rating_titles') + if ratingTitleDescriptions: + ratingTitles = {} + for ratingDescription in ratingTitleDescriptions: + parts = ratingDescription.split(u'=') + if len(parts) < 2: _logger.warning( - u"Title contains `%s', suspected to be part of " - u"numbering, but no period (`.') before it. " - u"Full title is preserved." 
% storyTitle) - - result = { - 'storyTitle': storyTitle, - 'chapterTitle': title, - 'chapterIndex': chapterIndex - } - if partIndex is not None: - result['partIndex'] = partIndex - return result - - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, fullTitle) - if match: - storyTitle = fullTitle[:match.start()] - chapterTitle = fullTitle[match.end():].strip() - matchedText = fullTitle[match.start():match.end()] - if chapterTitle: - title = u'%s. %s' % (matchedText, chapterTitle) + u"Invalid `rating_titles' setting, missing `=' in `%s'." + % ratingDescription) + continue + labels = parts[:-1] + title = parts[-1] + for label in labels: + ratingTitles[label] = title + # Duplicate label aliasing in adult rating set. + if label in adultRatings: + adultRatings.add(*labels) + self._parsingConfiguration['adultRatings'] = list(adultRatings) + self._parsingConfiguration['ratingTitles'] = ratingTitles else: - title = matchedText - return { - 'storyTitle': storyTitle, - 'chapterTitle': title - } + raise exceptions.PersonalIniFailed( + u"Missing `rating_titles' setting", u"MassEffect2.in", u"?") - return { - 'storyTitle': fullTitle, - 'chapterTitle': fullTitle - } + self._parsingConfiguration['needsChapterNumbering'] = \ + self.getConfig('strip_chapter_numbers', False) \ + and not self.getConfig('add_chapter_numbers', False) + + + return self._parsingConfiguration + + def _getDocumentId(self, url): + """Extracts document ID from MassEffect2.in URL.""" + match = re.search(self.DOCUMENT_ID_PATTERN, url) + if not match: + raise ValueError(u"Failed to extract document ID from `'" % url) + documentId = url[match.start():match.end()] + return documentId + + @classmethod + def _makeDocumentUrl(cls, documentId): + """Makes a chapter URL given a chapter ID.""" + return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId) def _loadDocument(self, url): """Fetches URL content and returns its element tree with parsing settings tuned for MassEffect2.in.""" - documentId = 
self._extractDocumentId(url) - if documentId in self._parsing_cache: - _logger.debug(u"Memory cache HIT for parsed `%s'", url) - return self._parsing_cache[documentId]['document'] - else: - _logger.debug(u"Memory cache MISS for parsed `%s'", url) - document = bs.BeautifulStoneSoup( - self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) - self._parsing_cache[documentId] = {'document': document} - return document + return bs.BeautifulStoneSoup( + self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) def _fetchUrl(self, url, parameters=None, @@ -498,7 +297,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): from calibre.constants import DEBUG if DEBUG: import os - documentId = self._extractDocumentId(url) + documentId = self._getDocumentId(url) path = u'./cache/%s' % documentId if os.path.isfile(path) and os.access(path, os.R_OK): _logger.debug(u"On-disk cache HIT for `%s'.", url) @@ -519,31 +318,380 @@ class MassEffect2InAdapter(BaseSiteAdapter): return content - def _extractDocumentId(self, url): - """Extracts document ID from MassEffect2.in URL.""" - match = re.search(self.DOCUMENT_ID_PATTERN, url) - if not match: - raise ValueError(u"Failed to extract document ID from `'" % url) - documentId = url[match.start():match.end()] - return documentId - def _getChapterTextElement(self, url): - """Fetches URL content and extracts an element containing text body. 
- Shall be used instead of `__collectTextElements'.""" - documentId = self._extractDocumentId(url) - document = self._loadDocument(url) - cache = self._parsing_cache[documentId] - if 'body' in cache: - return cache['body'] +class Chapter(object): + """Represents a lazily-parsed chapter of a story.""" + def __init__(self, configuration, url, document): + self._configuration = configuration + self._url = url + self._document = document + # Lazy-loaded: + self._parsedHeading = None + self._date = None + self._author = None + self._attributes = None + self._textElement = None + self._infoBar = None + + def getIndex(self): + parsedHeading = self._getHeading() + if 'chapterIndex' in parsedHeading: + return parsedHeading['chapterIndex'] + + def getPartIndex(self): + parsedHeading = self._getHeading() + if 'partIndex' in parsedHeading: + return parsedHeading['partIndex'] + + def getStoryTitle(self): + return self._getHeading()['storyTitle'] + + def getTitle(self): + return self._getHeading()['chapterTitle'] + + def getAuthorId(self): + return self._getAuthor()['id'] + + def getAuthorName(self): + return self._getAuthor()['name'] + + def getDate(self): + return self._getDate() + + def getRatingTitle(self): + return self._getAttributes()['rating']['title'] + + def isRatingAdult(self): + return self._getAttributes()['rating']['isAdult'] + + def getCharacters(self): + attributes = self._getAttributes() + if 'characters' in attributes: + return attributes['characters'] + return [] + + def getGenres(self): + attributes = self._getAttributes() + if 'genres' in attributes: + return attributes['genres'] + return [] + + def isInProgress(self): + attributes = self._getAttributes() + if 'isInProgress' in attributes: + return attributes['isInProgress'] + + def getUrl(self): + return self._url + + def getTextElement(self): + return self._getTextElement() + + def getPreviousChapterUrl(self): + """Downloads chapters following `Previous chapter' links. 
+ Returns a list of chapters' URLs.""" + return self._getSiblingChapterUrl({'class': 'fl tal'}) + + def getNextChapterUrl(self): + """Downloads chapters following `Next chapter' links. + Returns a list of chapters' URLs.""" + return self._getSiblingChapterUrl({'class': 'tar fr'}) + else: - body = self.__collectTextElements(document) - cache['body'] = body - return body + return storyTitle != thisStoryTitle - def __collectTextElements(self, document): + CHAPTER_NUMBER_PATTERN = re.compile( + u'''[\.:\s]* + (?:глава)? # `Chapter' in Russian. + \s + (?:(?P\d{1,3})(?=\D|$)) + (?: + (?: + # For `X.Y' and `X-Y' numbering styles: + [\-\.]| + # For `Chapter X (part Y)' and similar numbering styles: + [\.,]?\s + (?P\()? + (?:часть)? # `Part' in Russian. + \s + ) + (?P\d{1,3}) + (?(brace)\)) + )? + [\.:\s]* + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + PROLOGUE_EPILOGUE_PATTERN = re.compile( + u'''[\.:\s]* # Optional separators. + (пролог|эпилог) # `Prologue' or `epilogue' in Russian. + [\.:\s]* # Optional separators. + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + def _getHeading(self): + if not self._parsedHeading: + self._parsedHeading = self._parseHeading() + return self._parsedHeading + + def _parseHeading(self): + """Extracts meaningful parts from full chapter heading with. + Returns a dictionary containing `storyTitle', `chapterTitle' + (including numbering if allowed by settings, may be the same as + `storyTitle' for short stories), `chapterIndex' (optional, may be + zero), and `partIndex' (optional, chapter part, may be zero). + When no dedicated chapter title is present, generates one based on + chapter and part indices. 
Correctly handles `prologue' and `epilogue' + cases.""" + try: + heading = stripHTML( + self._document.find('div', {'class': 'eTitle'}).string) + except AttributeError: + raise ParsingError(u'Failed to locate title.') + + match = re.search(self.CHAPTER_NUMBER_PATTERN, heading) + if match: + chapterIndex = int(match.group('chapterIndex')) + # There are cases with zero chapter or part number (e. g.: + # numbered prologue, not to be confused with just `Prologue'). + if match.group('partIndex'): + partIndex = int(match.group('partIndex')) + else: + partIndex = None + chapterTitle = heading[match.end():].strip() + if chapterTitle: + if self._configuration['needsChapterNumbering']: + if partIndex is not None: + title = u'%d.%d. %s' % \ + (chapterIndex, partIndex, chapterTitle) + else: + title = u'%d. %s' % (chapterIndex, chapterTitle) + else: + title = chapterTitle + else: + title = u'Глава %d' % chapterIndex + if partIndex: + title += u' (часть %d)' % partIndex + + # For seldom found cases like `Story: prologue and chapter 1'. + storyTitle = heading[:match.start()] + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle) + if match: + matches = list( + re.finditer(u'[:\.]', storyTitle)) + if matches: + realStoryTitleEnd = matches[-1].start() + if realStoryTitleEnd >= 0: + storyTitle = storyTitle[:realStoryTitleEnd] + else: + _logger.warning( + u"Title contains `%s', suspected to be part of " + u"numbering, but no period (`.') before it. " + u"Full title is preserved." % storyTitle) + + self._parsedHeading = { + 'storyTitle': unicode(storyTitle), + 'chapterTitle': unicode(title), + 'chapterIndex': chapterIndex + } + if partIndex is not None: + self._parsedHeading['partIndex'] = partIndex + return self._parsedHeading + + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading) + if match: + storyTitle = heading[:match.start()] + chapterTitle = heading[match.end():].strip() + matchedText = heading[match.start():match.end()] + if chapterTitle: + title = u'%s. 
%s' % (matchedText, chapterTitle) + else: + title = matchedText + self._parsedHeading = { + 'storyTitle': unicode(storyTitle), + 'chapterTitle': unicode(title) + } + return self._parsedHeading + + self._parsedHeading = { + 'storyTitle': unicode(heading), + 'chapterTitle': unicode(heading) + } + return self._parsedHeading + + def _getAuthor(self): + if not self._author: + self._author = self._parseAuthor() + return self._author + + def _parseAuthor(self): + try: + authorLink = self._getInfoBarElement() \ + .find('i', {'class': 'icon-user'}) \ + .findNextSibling('a') + except AttributeError: + raise ParsingError(u'Failed to locate author link.') + match = re.search(u'(8-\d+)', authorLink['onclick']) + if not match: + raise ParsingError(u'Failed to extract author ID.') + authorId = match.group(0) + authorName = stripHTML(authorLink.text) + return { + 'id': authorId, + 'name': authorName + } + + def _getDate(self): + if not self._date: + self._date = self._parseDate() + return self._date + + def _parseDate(self): + try: + dateText = self._getInfoBarElement() \ + .find('i', {'class': 'icon-eye'}) \ + .findPreviousSibling(text=True) \ + .strip(u'| \n') + except AttributeError: + raise ParsingError(u'Failed to locate date.') + date = makeDate(dateText, '%d.%m.%Y') + return date + + def _getInfoBarElement(self): + if not self._infoBar: + self._infoBar = self._document.find('td', {'class': 'eDetails2'}) + if not self._infoBar: + raise ParsingError(u'No informational bar found.') + return self._infoBar + + def _getAttributes(self): + if not self._attributes: + self._attributes = self._parseAttributes() + return self._attributes + + def _parseAttributes(self): + attributes = {} + try: + elements = self._document \ + .find('div', {'class': 'comm-div'}) \ + .findNextSibling('div', {'class': 'cb'}) \ + .nextGenerator() + attributesText = u'' + for element in elements: + if not element: + _logger.warning(u'Attribute block not terminated!') + break + if isinstance(element, 
bs.Tag): + # Although deprecated, `has_key()' is required here. + if element.name == 'div' and \ + element.has_key('class') and \ + element['class'] == 'cb': + break + elif element.name == 'img': + rating = self._parseRatingFromImage(element) + if rating: + attributes['rating'] = rating + else: + attributesText += stripHTML(element) + except AttributeError or TypeError: + raise ParsingError(u'Failed to locate and collect attributes.') + + for record in re.split(u';|\.', attributesText): + parts = record.split(u':', 1) + if len(parts) < 2: + continue + key = parts[0].strip().lower() + value = parts[1].strip().strip(u'.') + parsed = self._parseAttribute(key, value) + if parsed: + attributes[parsed[0]] = parsed[1] + + if 'rating' not in attributes: + raise ParsingError(u'Failed to locate or recognize rating!') + + return attributes + + RATING_LABEL_PATTERN = re.compile(u'/(?P[ERATINnG]+)\.png$') + + def _parseRatingFromImage(self, element): + """Given an image element, tries to parse story rating from it.""" + # Although deprecated, `has_key()' is required here. + if not element.has_key('src'): + return + source = element['src'] + if 'REITiNG' in source: + match = re.search(self.RATING_LABEL_PATTERN, source) + if not match: + return + label = match.group('rating') + if label in self._configuration['ratingTitles']: + return { + 'label': label, + 'title': self._configuration['ratingTitles'][label], + 'isAdult': label in self._configuration['adultRatings'] + } + else: + _logger.warning(u"No title found for rating label `%s'!" % label) + # FIXME: It seems, rating has to be optional due to such URLs. + elif source == 'http://www.masseffect2.in/_fr/10/1360399.png': + label = 'Nn' + return { + 'label': 'Nn', + 'title': self._configuration['ratingTitles'][label], + 'isAdult': label in self._configuration['adultRatings'] + } + + # Various `et cetera' and `et al' forms in Russian texts. + # Intended to be used with whole strings! 
+ ETC_PATTERN = re.compile( + u'''[и&]\s(?: + (?:т\.?\s?[пд]\.?)| + (?:др(?:угие|\.)?)| + (?:пр(?:очие|\.)?)| + # Note: identically looking letters `K' and `o' + # below are from Latin and Cyrillic alphabets. + (?:ко(?:мпания)?|[KК][oо°]) + )$ + ''', + re.IGNORECASE + re.UNICODE + re.VERBOSE) + + def _parseAttribute(self, key, value): + """Parses a single known attribute value for chapter metadata.""" + + def refineCharacter(name): + """Refines character name from stop-words and distortions.""" + strippedName = name.strip() + nameOnly = re.sub(self.ETC_PATTERN, u'', strippedName) + # TODO: extract canonical name (even ME-specific?). + canonicalName = nameOnly + return canonicalName + + if re.match(u'жанры?', key, re.UNICODE): + definitions = value.split(u',') + if len(definitions) > 4: + _logger.warning(u'Possibly incorrect genre detection!') + genres = [] + for definition in definitions: + genres += definition.split(u'/') + return 'genres', genres + elif key == u'статус': + isInProgress = value == u'в процессе' + return 'isInProgress', isInProgress + elif key == u'персонажи': + characters = [refineCharacter(name) for name in value.split(u',')] + return 'characters', characters + else: + _logger.debug(u"Unrecognized attribute `%s' ignored.", key) + + def _getTextElement(self): + if not self._textElement: + self._textElement = self.__collectTextElements() + return self._textElement + + def __collectTextElements(self): """Returns all elements containing parts of chapter text (which may be

<p>aragraphs, <div>isions or plain text nodes) under a single root.""" - starter = document.find('div', {'id': u'article'}) + starter = self._document.find('div', {'id': u'article'}) if starter is None: # FIXME: This will occur if the method is called more than once. # The reason is elements appended to `root' are removed from @@ -558,7 +706,30 @@ class MassEffect2InAdapter(BaseSiteAdapter): if isinstance(element, bs.Tag) and element.name == 'tr': break collection.append(element) - root = bs.Tag(document, 'td') + root = bs.Tag(self._document, 'td') for element in collection: root.append(element) return root + + def _getSiblingChapterUrl(self, selector): + """Downloads chapters one by one by locating and following links + specified by a selector. Returns chapters' URLs in order they + were found.""" + block = self._document\ + .find('td', {'class': 'eDetails1'})\ + .find('div', selector) + if not block: + return + link = block.find('a') + if not link: + return + return link['href'] + + +def _getLargestCommonPrefix(*args): + """Returns largest common prefix of all unicode(!) arguments.
+ :rtype : unicode + """ + from itertools import takewhile, izip + allSame = lambda xs: len(set(xs)) == 1 + return u''.join([i[0] for i in takewhile(allSame, izip(*args))]) diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 97fc399d..5dd2a561 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -205,6 +205,7 @@ def get_valid_keywords(): return list(['(in|ex)clude_metadata_(pre|post)', 'add_chapter_numbers', 'add_genre_when_multi_category', + 'adult_ratings', 'allow_unsafe_filename', 'always_overwrite', 'anthology_tags', @@ -285,6 +286,7 @@ def get_valid_keywords(): 'output_filename_safepattern', 'password', 'post_process_cmd', + 'rating_titles', 'remove_transparency', 'replace_br_with_p', 'replace_hr', diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index e35ad4fd..6dbb44ba 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1825,14 +1825,17 @@ extracategories:Lord of the Rings [www.masseffect2.in] ## Site dedicated to this fandom. -extracategories:Mass Effect +extracategories: Mass Effect ## Stories on the site almost never have cover image. ## May be adjusted in `personal.ini' on per-story basis. never_make_cover: true -my_custom_label:Some text -my_custom_setting:true +## Titles for ratings identified by 1- or 2-letter codes from `ERATING system' +## (`система Р.Е.Й.Т.И.Н.Г.'). MassEffect2.in and some other sites adopted it, +## but changed titles and update them occasionally. +rating_titles: R=RESTRICTED (16+), E=EXEMPT (18+), I=ART HOUSE, T=To every, A=IN=Иной мир, Nn=Новый мир, G=О\, Господи! +adult_ratings: E,R [www.mediaminer.org] From a8ce9d5711378a538c1f50af75ce0f85b64f0ba2 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Thu, 23 Jul 2015 02:44:50 +0300 Subject: [PATCH 05/18] Add editor signature removal capability. All chapters have editor signature in the end. Users wishing to remove it can switch `exclude_editor_signature' option in `personal.ini'. 
--- calibre-plugin/plugin-defaults.ini | 3 +++ fanficfare/adapters/adapter_masseffect2in.py | 24 ++++++++++++++++++++ fanficfare/configurable.py | 1 + fanficfare/defaults.ini | 3 +++ 4 files changed, 31 insertions(+) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index ac27065e..a68a5d99 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1226,6 +1226,9 @@ extracategories:Harry Potter ## Site dedicated to this fandom. extracategories: Mass Effect +## Whether to exclude editor signature from the bottom if chapter text. +exclude_editor_signature: false + ## Stories on the site almost never have cover image. ## May be adjusted in `personal.ini' on per-story basis. never_make_cover: true diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 815ae191..dd8444bd 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -266,6 +266,8 @@ class MassEffect2InAdapter(BaseSiteAdapter): self.getConfig('strip_chapter_numbers', False) \ and not self.getConfig('add_chapter_numbers', False) + self._parsingConfiguration['excludeEditorSignature'] = \ + self.getConfig('exclude_editor_signature', False) return self._parsingConfiguration @@ -709,6 +711,10 @@ class Chapter(object): root = bs.Tag(self._document, 'td') for element in collection: root.append(element) + + if self._configuration['excludeEditorSignature']: + root = self._excludeEditorSignature(root) + return root def _getSiblingChapterUrl(self, selector): @@ -725,6 +731,24 @@ class Chapter(object): return return link['href'] + SIGNED_PATTERN = re.compile(u'отредактирова(?:но|ла?)[:.\s]', re.IGNORECASE + re.UNICODE) + + def _excludeEditorSignature(self, root): + for textNode in root.findAll(text=True): + if re.match(self.SIGNED_PATTERN, textNode.string): + editorLink = textNode.findNext('a') + if editorLink: + editorLink.extract() + # Seldom 
editor link has inner formatting, which is sibling DOM-wise. + editorName = textNode.findNext('i') + if editorName: + editorName.extract() + textNode.extract() + # We could try removing container element, but there is a risk + # of removing text ending with it. Better play safe here. + break + return root + def _getLargestCommonPrefix(*args): """Returns largest common prefix of all unicode(!) arguments. diff --git a/fanficfare/configurable.py b/fanficfare/configurable.py index 5dd2a561..61b306df 100644 --- a/fanficfare/configurable.py +++ b/fanficfare/configurable.py @@ -235,6 +235,7 @@ def get_valid_keywords(): 'description_limit', 'do_update_hook', 'exclude_notes', + 'exclude_editor_signature', 'extra_logpage_entries', 'extra_subject_tags', 'extra_titlepage_entries', diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index 6dbb44ba..d9069cf8 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1827,6 +1827,9 @@ extracategories:Lord of the Rings ## Site dedicated to this fandom. extracategories: Mass Effect +## Whether to exclude editor signature from the bottom if chapter text. +exclude_editor_signature: false + ## Stories on the site almost never have cover image. ## May be adjusted in `personal.ini' on per-story basis. never_make_cover: true From d809ac9f6c9a64e6d4effc43fbe23265ca6e848b Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Thu, 23 Jul 2015 02:47:49 +0300 Subject: [PATCH 06/18] Prevent capturing chapters from related stories. When prequel or sequel is posted, editors sometimes make `Next/Previous chapter' links between them. Since it is the only mechanism for chapter detection, additional title check was added. It is heuristical and may need improvement one day. 
--- fanficfare/adapters/adapter_masseffect2in.py | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index dd8444bd..73047abd 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -104,6 +104,10 @@ class MassEffect2InAdapter(BaseSiteAdapter): if url: url = self._makeDocumentUrl(self._getDocumentId(url)) following = self._makeChapter(url) + # Do not follow links to related, but different stories (prequels or sequels). + startingStoryTitle = self.story.getMetadata('title') + if not following.isFromStory(startingStoryTitle): + return if forward: yield following for chapter in followChapters(following, forward): @@ -399,6 +403,26 @@ class Chapter(object): Returns a list of chapters' URLs.""" return self._getSiblingChapterUrl({'class': 'tar fr'}) + def isFromStory(self, storyTitle, prefixThreshold=-1): + """Checks if this chapter is from a story different from the given one. 
+ Prefix threshold specifies how long common story title prefix shall be + for chapters from one story: negative value means implementation-defined + optimum, zero inhibits the check, and positive value adjusts threshold.""" + + def getFirstWord(string): + match = re.search(u'^\s*\w+', string, re.UNICODE) + return string[match.start():match.end()] + + thisStoryTitle = self.getStoryTitle() + if prefixThreshold != 0: + if prefixThreshold < 0: + prefixThreshold = min( + len(getFirstWord(storyTitle)), len(getFirstWord(thisStoryTitle))) + else: + prefixThreshold = min( + prefixThreshold, len(storyTitle), len(thisStoryTitle)) + result = len(_getLargestCommonPrefix(storyTitle, thisStoryTitle)) >= prefixThreshold + return result else: return storyTitle != thisStoryTitle From d91d4b8c3c430c1235a07abcf9837f47fa68b926 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Fri, 24 Jul 2015 03:33:39 +0300 Subject: [PATCH 07/18] Refactor and improve heading parsing. Now several (relatively) simple REs are used instead of a complex one. New heading variants are supported: * Story title. Chapter X. Chapter title (part Y) * Story title. Chapter X (continued) * Story title. First chapter Potentially overridable method `Chapter._extractHeading()' extracted. 
--- fanficfare/adapters/adapter_masseffect2in.py | 253 +++++++++++-------- 1 file changed, 141 insertions(+), 112 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 73047abd..70e9ce0b 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -340,20 +340,20 @@ class Chapter(object): self._infoBar = None def getIndex(self): - parsedHeading = self._getHeading() + parsedHeading = self.__getHeading() if 'chapterIndex' in parsedHeading: return parsedHeading['chapterIndex'] def getPartIndex(self): - parsedHeading = self._getHeading() + parsedHeading = self.__getHeading() if 'partIndex' in parsedHeading: return parsedHeading['partIndex'] def getStoryTitle(self): - return self._getHeading()['storyTitle'] + return self.__getHeading()['storyTitle'] def getTitle(self): - return self._getHeading()['chapterTitle'] + return self.__getHeading()['chapterTitle'] def getAuthorId(self): return self._getAuthor()['id'] @@ -426,124 +426,153 @@ class Chapter(object): else: return storyTitle != thisStoryTitle - CHAPTER_NUMBER_PATTERN = re.compile( + def _extractHeading(self): + """Extracts header text from the document.""" + return stripHTML( + self._document.find('div', {'class': 'eTitle'}).string) + + def __getHeading(self): + if not self._parsedHeading: + self._parsedHeading = self.__parseHeading() + return self._parsedHeading + + NUMBERING_TITLE_PATTERN = re.compile( + u'''(?P\()? + (?Pначало|продолжение|окончание| + часть\s(?:первая|вторая|третья|четвертая|пятая|шестая|седьмая|восьмая|девятая|десятая)) + (?(brace)\)|\.)? + ''', + re.IGNORECASE | re.UNICODE | re.VERBOSE) + + def __parseHeading(self): + """Locates chapter heading and extracts meaningful parts from it. 
+ Returns a dictionary containing `storyTitle', `chapterTitle' (including numbering if allowed by settings, + may be the same as `storyTitle' for short stories, or generated from indices), `chapterIndex' (optional, + may be zero), and `partIndex' (optional, chapter part, may be zero).""" + try: + heading = self._extractHeading() + except Exception, error: + raise ParsingError(u'Failed to locate title: %s.' % error) + + chapterIndex, partIndex, storyTitle, chapterTitle = self.__splitHeading(heading) + if chapterTitle: + match = re.search(self.NUMBERING_TITLE_PATTERN, chapterTitle) + if match: + chapterTitle = u'Глава %d. %s' % (chapterIndex, match.group('essence').capitalize()) + elif self._configuration['needsChapterNumbering']: + if partIndex is not None: + chapterTitle = u'%d.%d. %s' % (chapterIndex, partIndex, chapterTitle) + else: + chapterTitle = u'%d. %s' % (chapterIndex, chapterTitle) + else: + chapterTitle = u'Глава %d' % chapterIndex + if partIndex is not None: + chapterTitle += u' (часть %d)' % partIndex + + self._parsedHeading = { + 'storyTitle': storyTitle, + 'chapterTitle': chapterTitle + } + if chapterIndex is not None: + self._parsedHeading['chapterIndex'] = chapterIndex + if partIndex is not None: + self._parsedHeading['partIndex'] = partIndex + return self._parsedHeading + return self._parsedHeading + + # Patterns below start end end with the same optional separator characters (to filter them) + # and allow only freestanding groups of 1--3 digits (ti filter long numbers in titles). + + OUTLINE_PATTERN = re.compile( u'''[\.:\s]* - (?:глава)? # `Chapter' in Russian. - \s - (?:(?P\d{1,3})(?=\D|$)) - (?: - (?: - # For `X.Y' and `X-Y' numbering styles: - [\-\.]| - # For `Chapter X (part Y)' and similar numbering styles: - [\.,]?\s - (?P\()? - (?:часть)? # `Part' in Russian. - \s - ) - (?P\d{1,3}) - (?(brace)\)) - )? + (?:глава\s)? 
+ (?:(?\d{1,3})(?=\D)) + [\.-] + (?:(?P\d{1,3})(?=\D|$)) + [\.:\s]* + ''', + re.IGNORECASE | re.UNICODE | re.VERBOSE) + + CHAPTER_PATTERN = re.compile( + u'''[\.:\s]* + (?:глава\s)?(?:(?\d{1,3})(?=\D|$)) + [\.:\s]* + ''', + re.IGNORECASE | re.UNICODE | re.VERBOSE) + + PART_PATTERN = re.compile( + u'''[\.:\s]* + (?:[\.,]?\s)? + (?P\()? + (?:часть\s)? + (?:(?\d{1,3})(?=\D|$)) + (?(brace)\)) + [\.:\s]* + ''', + re.IGNORECASE | re.UNICODE | re.VERBOSE) + + PROLOGUE_EPILOGUE_PATTERN = re.compile( + u'''[\.:\s]* + (?Pпролог|эпилог) # `Prologue' or `epilogue' in Russian. [\.:\s]* ''', re.IGNORECASE + re.UNICODE + re.VERBOSE) - PROLOGUE_EPILOGUE_PATTERN = re.compile( - u'''[\.:\s]* # Optional separators. - (пролог|эпилог) # `Prologue' or `epilogue' in Russian. - [\.:\s]* # Optional separators. - ''', - re.IGNORECASE + re.UNICODE + re.VERBOSE) - - def _getHeading(self): - if not self._parsedHeading: - self._parsedHeading = self._parseHeading() - return self._parsedHeading - - def _parseHeading(self): - """Extracts meaningful parts from full chapter heading with. - Returns a dictionary containing `storyTitle', `chapterTitle' - (including numbering if allowed by settings, may be the same as - `storyTitle' for short stories), `chapterIndex' (optional, may be - zero), and `partIndex' (optional, chapter part, may be zero). - When no dedicated chapter title is present, generates one based on - chapter and part indices. Correctly handles `prologue' and `epilogue' - cases.""" - try: - heading = stripHTML( - self._document.find('div', {'class': 'eTitle'}).string) - except AttributeError: - raise ParsingError(u'Failed to locate title.') - - match = re.search(self.CHAPTER_NUMBER_PATTERN, heading) - if match: - chapterIndex = int(match.group('chapterIndex')) - # There are cases with zero chapter or part number (e. g.: - # numbered prologue, not to be confused with just `Prologue'). 
- if match.group('partIndex'): - partIndex = int(match.group('partIndex')) - else: - partIndex = None - chapterTitle = heading[match.end():].strip() - if chapterTitle: - if self._configuration['needsChapterNumbering']: - if partIndex is not None: - title = u'%d.%d. %s' % \ - (chapterIndex, partIndex, chapterTitle) - else: - title = u'%d. %s' % (chapterIndex, chapterTitle) - else: - title = chapterTitle - else: - title = u'Глава %d' % chapterIndex - if partIndex: - title += u' (часть %d)' % partIndex - - # For seldom found cases like `Story: prologue and chapter 1'. - storyTitle = heading[:match.start()] - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, storyTitle) + def __splitHeading(self, heading): + """Parses chapter heading text into meaningful parts. + Returns a tuple(chapter index, part index, story title, chapter title). + Any or both of the indices may be None if absent, chapter title may be empty (only if chapter index is None).""" + def filterPrologueOrEpilogue(title): + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, title) if match: - matches = list( - re.finditer(u'[:\.]', storyTitle)) + matches = list(re.finditer(u'[:\.]', title)) if matches: realStoryTitleEnd = matches[-1].start() - if realStoryTitleEnd >= 0: - storyTitle = storyTitle[:realStoryTitleEnd] + return title[:realStoryTitleEnd] + else: + _logger.warning( + u"Title contains `%s', suspected to be part of numbering, but no period (`.') before it. " + u"Full title is preserved." 
% title) + return title + + outline_match = re.search(self.OUTLINE_PATTERN, heading) + if outline_match: + chapter_index = int(outline_match.group('chapterIndex')) + part_index = int(outline_match.group('partIndex')) + story = heading[:outline_match.start()] + story = filterPrologueOrEpilogue(story) + chapter = heading[outline_match.end():] + return chapter_index, part_index, story, chapter + else: + chapter_match = re.search(self.CHAPTER_PATTERN, heading) + if chapter_match: + chapter_index = int(chapter_match.group('chapterIndex')) + story = heading[:chapter_match.start()] + story = filterPrologueOrEpilogue(story) + suffix = heading[chapter_match.end():] + part_match = re.search(self.PART_PATTERN, suffix) + if part_match: + part_index = int(part_match.group('partIndex')) + if part_match.start() == 0: + chapter = suffix[part_match.end():] else: - _logger.warning( - u"Title contains `%s', suspected to be part of " - u"numbering, but no period (`.') before it. " - u"Full title is preserved." % storyTitle) - - self._parsedHeading = { - 'storyTitle': unicode(storyTitle), - 'chapterTitle': unicode(title), - 'chapterIndex': chapterIndex - } - if partIndex is not None: - self._parsedHeading['partIndex'] = partIndex - return self._parsedHeading - - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading) - if match: - storyTitle = heading[:match.start()] - chapterTitle = heading[match.end():].strip() - matchedText = heading[match.start():match.end()] - if chapterTitle: - title = u'%s. 
%s' % (matchedText, chapterTitle) + chapter = suffix[:part_match.start()] + return chapter_index, part_index, story, chapter + else: + chapter = heading[chapter_match.end():] + return chapter_index, None, story, chapter else: - title = matchedText - self._parsedHeading = { - 'storyTitle': unicode(storyTitle), - 'chapterTitle': unicode(title) - } - return self._parsedHeading - - self._parsedHeading = { - 'storyTitle': unicode(heading), - 'chapterTitle': unicode(heading) - } - return self._parsedHeading + match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading) + if match: + story = heading[:match.start()] + chapter = heading[match.end():] + keyword = match.group('keyword') + if chapter: + chapter = u"%s. %s" % (keyword.title(), chapter) + else: + chapter = keyword + return None, None, story, chapter + return None, None, heading, heading def _getAuthor(self): if not self._author: From 2516e617e44a193ee82d47829b9dc78a24703b32 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Fri, 24 Jul 2015 03:37:58 +0300 Subject: [PATCH 08/18] Remove more stop-words variants. --- fanficfare/adapters/adapter_masseffect2in.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 70e9ce0b..ede12ae2 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -700,7 +700,7 @@ class Chapter(object): # Intended to be used with whole strings! ETC_PATTERN = re.compile( u'''[и&]\s(?: - (?:т\.?\s?[пд]\.?)| + (?:т\.?\s?[пд]?\.?)| (?:др(?:угие|\.)?)| (?:пр(?:очие|\.)?)| # Note: identically looking letters `K' and `o' From 9c84c7201ccaeb68ec9fa00a39766dc86ddb6126 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Fri, 24 Jul 2015 14:23:46 +0300 Subject: [PATCH 09/18] Replace intricate numbering parsing with simpler approach. 
Instead of locating numbering elements in headings, extracting titles and indices and then combining them into chapter titles, we employ a much simpler approach: * The longest common prefix of all headings is story title. * Everything after it in every heading is chapter title. * If `chapter X' is found in heading, prefix length is corrected. * If chapter title contains numbering prefix, chapter index is extracted (but not part index and not chapter title separately). --- fanficfare/adapters/adapter_masseffect2in.py | 246 ++++--------------- 1 file changed, 44 insertions(+), 202 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index ede12ae2..0ee6209e 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -105,8 +105,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): url = self._makeDocumentUrl(self._getDocumentId(url)) following = self._makeChapter(url) # Do not follow links to related, but different stories (prequels or sequels).
- startingStoryTitle = self.story.getMetadata('title') - if not following.isFromStory(startingStoryTitle): + if not following.isFromStory(starting.getHeading()): return if forward: yield following @@ -123,7 +122,6 @@ class MassEffect2InAdapter(BaseSiteAdapter): raise try: - self.story.setMetadata('title', startingChapter.getStoryTitle()) self.story.setMetadata('author', startingChapter.getAuthorName()) authorId = startingChapter.getAuthorId() authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) @@ -148,12 +146,28 @@ class MassEffect2InAdapter(BaseSiteAdapter): [startingChapter] + \ list(followChapters(startingChapter, forward=True)) - try: - for chapter in chapters: - url = chapter.getUrl() - self._chapters[url] = chapter - _logger.debug(u"Processing chapter `%s'.", url) + headings = [chapter.getHeading() for chapter in chapters] + largestCommonPrefix = _getLargestCommonPrefix(*headings) + prefixLength = len(largestCommonPrefix) + storyTitleEnd, chapterTitleStart = prefixLength, prefixLength + match = re.search(u'[:\.\s]*(?Pглава\s+)?$', largestCommonPrefix, re.IGNORECASE | re.UNICODE) + if match: + storyTitleEnd -= len(match.group()) + label = match.group('chapter') + if label: + chapterTitleStart -= len(label) + storyTitle = largestCommonPrefix[:storyTitleEnd] + self.story.setMetadata('title', storyTitle) + garbagePattern = re.compile(u'(?P^)?[:\.\s]*(?(start)|$)', re.UNICODE) + indexPattern = re.compile(u'(?:глава\s)?(?:(?\d{1,3})(?=\D|$))', re.IGNORECASE | re.UNICODE) + + for chapter in chapters: + url = chapter.getUrl() + self._chapters[url] = chapter + _logger.debug(u"Processing chapter `%s'.", url) + + try: datePublished = min(datePublished, chapter.getDate()) dateUpdated = max(dateUpdated, chapter.getDate()) @@ -162,14 +176,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): wordCount += self._getWordCount(chapter.getTextElement()) - index = chapter.getIndex() - if index: - chapterCount = max(chapterCount, index) - else: - chapterCount += 1 - - 
# Story is in progress if any chapter is in progress. - # Some chapters may have no status attribute. + # Story is in progress if any chapter is in progress. Some chapters may have no status attribute. chapterInProgress = chapter.isInProgress() if chapterInProgress is not None: storyInProgress |= chapterInProgress @@ -178,29 +185,18 @@ class MassEffect2InAdapter(BaseSiteAdapter): if chapter.isRatingAdult(): self.story.setMetadata('is_adult', True) - titles = [chapter.getTitle() for chapter in chapters] - hasNumbering = any([chapter.getIndex() is not None for chapter in chapters]) - if not hasNumbering: - # There are stories without chapter numbering, but under single title, - # which is heading prefix (such stories are not series). We identify - # common prefix for all chapters and use it as story title, trimming - # chapter titles the length of this prefix. - largestCommonPrefix = _getLargestCommonPrefix(*titles) - prefixLength = len(largestCommonPrefix) - storyTitle = re.sub(u'[:\.\s]*$', u'', largestCommonPrefix, re.UNICODE) - self.story.setMetadata('title', storyTitle) - for chapter in chapters: - self.chapterUrls.append( - (chapter.getTitle()[prefixLength:], chapter.getUrl())) - else: - # Simple processing for common cases. - for chapter in chapters: - self.chapterUrls.append( - (chapter.getTitle(), chapter.getUrl())) + chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:]) - except ParsingError, error: - raise exceptions.FailedToDownload( - u"Failed to download chapter `%s': %s" % (url, error)) + match = re.search(indexPattern, chapterTitle) + if match: + index = int(match.group('index')) + chapterCount = max(chapterCount, index) + else: + chapterCount += 1 + + self.chapterUrls.append((chapterTitle, url)) + except ParsingError, error: + raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error)) # Some metadata are handled separately due to format conversions. 
self.story.setMetadata( @@ -266,10 +262,6 @@ class MassEffect2InAdapter(BaseSiteAdapter): raise exceptions.PersonalIniFailed( u"Missing `rating_titles' setting", u"MassEffect2.in", u"?") - self._parsingConfiguration['needsChapterNumbering'] = \ - self.getConfig('strip_chapter_numbers', False) \ - and not self.getConfig('add_chapter_numbers', False) - self._parsingConfiguration['excludeEditorSignature'] = \ self.getConfig('exclude_editor_signature', False) @@ -332,28 +324,15 @@ class Chapter(object): self._url = url self._document = document # Lazy-loaded: - self._parsedHeading = None + self._heading = None self._date = None self._author = None self._attributes = None self._textElement = None self._infoBar = None - def getIndex(self): - parsedHeading = self.__getHeading() - if 'chapterIndex' in parsedHeading: - return parsedHeading['chapterIndex'] - - def getPartIndex(self): - parsedHeading = self.__getHeading() - if 'partIndex' in parsedHeading: - return parsedHeading['partIndex'] - - def getStoryTitle(self): - return self.__getHeading()['storyTitle'] - - def getTitle(self): - return self.__getHeading()['chapterTitle'] + def getHeading(self): + return self._extractHeading() def getAuthorId(self): return self._getAuthor()['id'] @@ -413,7 +392,7 @@ class Chapter(object): match = re.search(u'^\s*\w+', string, re.UNICODE) return string[match.start():match.end()] - thisStoryTitle = self.getStoryTitle() + thisStoryTitle = self.getHeading() if prefixThreshold != 0: if prefixThreshold < 0: prefixThreshold = min( @@ -432,147 +411,9 @@ class Chapter(object): self._document.find('div', {'class': 'eTitle'}).string) def __getHeading(self): - if not self._parsedHeading: - self._parsedHeading = self.__parseHeading() - return self._parsedHeading - - NUMBERING_TITLE_PATTERN = re.compile( - u'''(?P\()? - (?Pначало|продолжение|окончание| - часть\s(?:первая|вторая|третья|четвертая|пятая|шестая|седьмая|восьмая|девятая|десятая)) - (?(brace)\)|\.)? 
- ''', - re.IGNORECASE | re.UNICODE | re.VERBOSE) - - def __parseHeading(self): - """Locates chapter heading and extracts meaningful parts from it. - Returns a dictionary containing `storyTitle', `chapterTitle' (including numbering if allowed by settings, - may be the same as `storyTitle' for short stories, or generated from indices), `chapterIndex' (optional, - may be zero), and `partIndex' (optional, chapter part, may be zero).""" - try: - heading = self._extractHeading() - except Exception, error: - raise ParsingError(u'Failed to locate title: %s.' % error) - - chapterIndex, partIndex, storyTitle, chapterTitle = self.__splitHeading(heading) - if chapterTitle: - match = re.search(self.NUMBERING_TITLE_PATTERN, chapterTitle) - if match: - chapterTitle = u'Глава %d. %s' % (chapterIndex, match.group('essence').capitalize()) - elif self._configuration['needsChapterNumbering']: - if partIndex is not None: - chapterTitle = u'%d.%d. %s' % (chapterIndex, partIndex, chapterTitle) - else: - chapterTitle = u'%d. %s' % (chapterIndex, chapterTitle) - else: - chapterTitle = u'Глава %d' % chapterIndex - if partIndex is not None: - chapterTitle += u' (часть %d)' % partIndex - - self._parsedHeading = { - 'storyTitle': storyTitle, - 'chapterTitle': chapterTitle - } - if chapterIndex is not None: - self._parsedHeading['chapterIndex'] = chapterIndex - if partIndex is not None: - self._parsedHeading['partIndex'] = partIndex - return self._parsedHeading - return self._parsedHeading - - # Patterns below start end end with the same optional separator characters (to filter them) - # and allow only freestanding groups of 1--3 digits (ti filter long numbers in titles). - - OUTLINE_PATTERN = re.compile( - u'''[\.:\s]* - (?:глава\s)? 
- (?:(?\d{1,3})(?=\D)) - [\.-] - (?:(?P\d{1,3})(?=\D|$)) - [\.:\s]* - ''', - re.IGNORECASE | re.UNICODE | re.VERBOSE) - - CHAPTER_PATTERN = re.compile( - u'''[\.:\s]* - (?:глава\s)?(?:(?\d{1,3})(?=\D|$)) - [\.:\s]* - ''', - re.IGNORECASE | re.UNICODE | re.VERBOSE) - - PART_PATTERN = re.compile( - u'''[\.:\s]* - (?:[\.,]?\s)? - (?P\()? - (?:часть\s)? - (?:(?\d{1,3})(?=\D|$)) - (?(brace)\)) - [\.:\s]* - ''', - re.IGNORECASE | re.UNICODE | re.VERBOSE) - - PROLOGUE_EPILOGUE_PATTERN = re.compile( - u'''[\.:\s]* - (?Pпролог|эпилог) # `Prologue' or `epilogue' in Russian. - [\.:\s]* - ''', - re.IGNORECASE + re.UNICODE + re.VERBOSE) - - def __splitHeading(self, heading): - """Parses chapter heading text into meaningful parts. - Returns a tuple(chapter index, part index, story title, chapter title). - Any or both of the indices may be None if absent, chapter title may be empty (only if chapter index is None).""" - def filterPrologueOrEpilogue(title): - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, title) - if match: - matches = list(re.finditer(u'[:\.]', title)) - if matches: - realStoryTitleEnd = matches[-1].start() - return title[:realStoryTitleEnd] - else: - _logger.warning( - u"Title contains `%s', suspected to be part of numbering, but no period (`.') before it. " - u"Full title is preserved." 
% title) - return title - - outline_match = re.search(self.OUTLINE_PATTERN, heading) - if outline_match: - chapter_index = int(outline_match.group('chapterIndex')) - part_index = int(outline_match.group('partIndex')) - story = heading[:outline_match.start()] - story = filterPrologueOrEpilogue(story) - chapter = heading[outline_match.end():] - return chapter_index, part_index, story, chapter - else: - chapter_match = re.search(self.CHAPTER_PATTERN, heading) - if chapter_match: - chapter_index = int(chapter_match.group('chapterIndex')) - story = heading[:chapter_match.start()] - story = filterPrologueOrEpilogue(story) - suffix = heading[chapter_match.end():] - part_match = re.search(self.PART_PATTERN, suffix) - if part_match: - part_index = int(part_match.group('partIndex')) - if part_match.start() == 0: - chapter = suffix[part_match.end():] - else: - chapter = suffix[:part_match.start()] - return chapter_index, part_index, story, chapter - else: - chapter = heading[chapter_match.end():] - return chapter_index, None, story, chapter - else: - match = re.search(self.PROLOGUE_EPILOGUE_PATTERN, heading) - if match: - story = heading[:match.start()] - chapter = heading[match.end():] - keyword = match.group('keyword') - if chapter: - chapter = u"%s. %s" % (keyword.title(), chapter) - else: - chapter = keyword - return None, None, story, chapter - return None, None, heading, heading + if not self._heading: + self._heading = self._extractHeading() + return self._heading def _getAuthor(self): if not self._author: @@ -804,9 +645,10 @@ class Chapter(object): def _getLargestCommonPrefix(*args): - """Returns largest common prefix of all unicode(!) arguments. + """Returns largest common prefix of all unicode arguments, ignoring case. 
:rtype : unicode """ from itertools import takewhile, izip - allSame = lambda xs: len(set(xs)) == 1 + toLower = lambda xs: map(lambda x: x.lower(), xs) + allSame = lambda xs: len(set(toLower(xs))) == 1 return u''.join([i[0] for i in takewhile(allSame, izip(*args))]) From fb6a8fc9315a5fa6c00add215bd2f655522d2bdd Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 21:09:24 +0300 Subject: [PATCH 10/18] Support more variants of chapter and story attribute formats. Collaterally, parse characters and pairings to separate lists. --- fanficfare/adapters/adapter_masseffect2in.py | 121 ++++++++++++------- 1 file changed, 80 insertions(+), 41 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 0ee6209e..844ebb1f 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -24,7 +24,7 @@ import urllib2 import codecs from .. import BeautifulSoup as bs -from ..htmlcleanup import stripHTML +from ..htmlcleanup import removeEntities, stripHTML from .. 
import exceptions as exceptions from base_adapter import BaseSiteAdapter, makeDate @@ -173,6 +173,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): self.story.extendList('genre', chapter.getGenres()) self.story.extendList('characters', chapter.getCharacters()) + self.story.extendList('ships', chapter.getPairings()) wordCount += self._getWordCount(chapter.getTextElement()) @@ -350,16 +351,13 @@ class Chapter(object): return self._getAttributes()['rating']['isAdult'] def getCharacters(self): - attributes = self._getAttributes() - if 'characters' in attributes: - return attributes['characters'] - return [] + return self._getListAttribute('characters') + + def getPairings(self): + return self._getListAttribute('pairings') def getGenres(self): - attributes = self._getAttributes() - if 'genres' in attributes: - return attributes['genres'] - return [] + return self._getListAttribute('genres') def isInProgress(self): attributes = self._getAttributes() @@ -405,6 +403,13 @@ class Chapter(object): else: return storyTitle != thisStoryTitle + def _getListAttribute(self, name): + """Return an attribute value as a list or an empty list if the attribute is absent.""" + attributes = self._getAttributes() + if name in attributes: + return attributes[name] + return [] + def _extractHeading(self): """Extracts header text from the document.""" return stripHTML( @@ -466,41 +471,66 @@ class Chapter(object): return self._attributes def _parseAttributes(self): + """Parse chapter attribute block and return it as a dictionary with standard entries.""" + attributes = {} + attributesText = u'' try: - elements = self._document \ + starter = self._document \ .find('div', {'class': 'comm-div'}) \ - .findNextSibling('div', {'class': 'cb'}) \ - .nextGenerator() - attributesText = u'' - for element in elements: - if not element: - _logger.warning(u'Attribute block not terminated!') - break + .findNextSibling('div', {'class': 'cb'}) + bound = starter.findNextSibling('div', {'class': 'cb'}) + + def 
processElement(element): + """Return textual representation an *inline* element of chapter attribute block.""" + result = u'' if isinstance(element, bs.Tag): - # Although deprecated, `has_key()' is required here. - if element.name == 'div' and \ - element.has_key('class') and \ - element['class'] == 'cb': + if element.name in ('b', 'strong', 'font', 'br'): + result += u"\n" + if element.name == 's': + result += u"%s" % stripHTML(element) + else: + result += stripHTML(element) + else: + result += removeEntities(element) + return result + + elements = starter.nextSiblingGenerator() + for element in elements: + if isinstance(element, bs.Tag): + if element == bound: + break + else: + if element.name in ('div', 'p'): + attributesText += u"\n" + for child in element.childGenerator(): + attributesText += processElement(child) + continue + attributesText += processElement(element) + + elements = starter.nextGenerator() + for element in elements: + if isinstance(element, bs.Tag): + if element == bound: break elif element.name == 'img': rating = self._parseRatingFromImage(element) if rating: attributes['rating'] = rating - else: - attributesText += stripHTML(element) + break except AttributeError or TypeError: raise ParsingError(u'Failed to locate and collect attributes.') - for record in re.split(u';|\.', attributesText): - parts = record.split(u':', 1) - if len(parts) < 2: + separators = u"\r\n :;." 
+ for line in attributesText.split(u'\n'): + if line.count(u':') != 1: continue - key = parts[0].strip().lower() - value = parts[1].strip().strip(u'.') + key, value = line.split(u':', 1) + key = key.strip(separators).lower() + value = value.strip().strip(separators) parsed = self._parseAttribute(key, value) - if parsed: - attributes[parsed[0]] = parsed[1] + for parsedKey, parsedValue in parsed.iteritems(): + attributes[parsedKey] = parsedValue if 'rating' not in attributes: raise ParsingError(u'Failed to locate or recognize rating!') @@ -552,7 +582,10 @@ class Chapter(object): re.IGNORECASE + re.UNICODE + re.VERBOSE) def _parseAttribute(self, key, value): - """Parses a single known attribute value for chapter metadata.""" + """ + Parse a single a single record in chapter attributes for chapter metadata. + Return a dictionary of canonical attributes and values (i. e. multiple attributes may be discovered). + """ def refineCharacter(name): """Refines character name from stop-words and distortions.""" @@ -563,21 +596,27 @@ class Chapter(object): return canonicalName if re.match(u'жанры?', key, re.UNICODE): - definitions = value.split(u',') - if len(definitions) > 4: - _logger.warning(u'Possibly incorrect genre detection!') - genres = [] - for definition in definitions: - genres += definition.split(u'/') - return 'genres', genres + genres = filter(bool, map(unicode.strip, re.split(u'[,;/]', value))) + return {'genres': genres} elif key == u'статус': isInProgress = value == u'в процессе' - return 'isInProgress', isInProgress + return {'isInProgress': isInProgress} elif key == u'персонажи': - characters = [refineCharacter(name) for name in value.split(u',')] - return 'characters', characters + participants = map(refineCharacter, re.split(u'[,;]', value)) + characters = [] + pairings = [] + for participant in participants: + if u'/' in participant: + pairings.append(participant) + else: + characters.append(participant) + return { + 'characters': characters, + 'pairings': 
pairings + } else: _logger.debug(u"Unrecognized attribute `%s' ignored.", key) + return {} def _getTextElement(self): if not self._textElement: From e2e4590f1d20e943a86363ccb22b9d2764831ec6 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 21:13:47 +0300 Subject: [PATCH 11/18] Consider story in progress if the last, not any, chapter is in progress. --- fanficfare/adapters/adapter_masseffect2in.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 844ebb1f..076ca378 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -177,10 +177,11 @@ class MassEffect2InAdapter(BaseSiteAdapter): wordCount += self._getWordCount(chapter.getTextElement()) - # Story is in progress if any chapter is in progress. Some chapters may have no status attribute. + # Chapter status usually represents the story status, so we want the last chapter status. + # Some chapters may have no status attribute. chapterInProgress = chapter.isInProgress() if chapterInProgress is not None: - storyInProgress |= chapterInProgress + storyInProgress = chapterInProgress # If any chapter is adult, consider the whole story adult. if chapter.isRatingAdult(): @@ -200,8 +201,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error)) # Some metadata are handled separately due to format conversions. 
- self.story.setMetadata( - 'status', 'In Progress' if storyInProgress else 'Completed') + self.story.setMetadata('status', 'In Progress' if storyInProgress else 'Completed') self.story.setMetadata('datePublished', datePublished) self.story.setMetadata('dateUpdated', dateUpdated) self.story.setMetadata('numWords', str(wordCount)) From b8710eba970c680b258835dbb33317637e9ecb2d Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 21:21:28 +0300 Subject: [PATCH 12/18] Collect story-wide metadata across all chapters. * Support multiple authors for story and no author for chapter. * Make chapter rating optional. * Detect chapter (and thus, story) "adultness" by either rating or editor warning, whichever is present. * Add first chapter summary as story summary, parse as summary either a dedicated attribute or freestanding text. --- fanficfare/adapters/adapter_masseffect2in.py | 87 +++++++++++++++----- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 076ca378..11199b1c 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -121,17 +121,6 @@ class MassEffect2InAdapter(BaseSiteAdapter): raise exceptions.StoryDoesNotExist(self.url) raise - try: - self.story.setMetadata('author', startingChapter.getAuthorName()) - authorId = startingChapter.getAuthorId() - authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) - self.story.setMetadata('authorId', authorId) - self.story.setMetadata('authorUrl', authorUrl) - self.story.setMetadata('rating', startingChapter.getRatingTitle()) - except ParsingError, error: - raise exceptions.FailedToDownload( - u"Failed to parse story metadata for `%s': %s" % (self.url, error)) - # We only have one date for each chapter and assume the oldest one # to be publication date and the most recent one to be update date. 
datePublished = datetime.datetime.max @@ -168,6 +157,28 @@ class MassEffect2InAdapter(BaseSiteAdapter): _logger.debug(u"Processing chapter `%s'.", url) try: + authorName = chapter.getAuthorName() + if authorName: + self.story.extendList('author', [authorName]) + authorId = chapter.getAuthorId() + if authorId: + authorUrl = 'http://%s/index/%s' % (self.getSiteDomain(), authorId) + else: + authorId = u'' + authorUrl = u'' + self.story.extendList('authorId', [authorId]) + self.story.extendList('authorUrl', [authorUrl]) + + if not self.story.getMetadata('rating'): + ratingTitle = chapter.getRatingTitle() + if ratingTitle: + self.story.setMetadata('rating', ratingTitle) + + if not self.story.getMetadata('description'): + summary = chapter.getSummary() + if summary: + self.story.setMetadata('description', summary) + datePublished = min(datePublished, chapter.getDate()) dateUpdated = max(dateUpdated, chapter.getDate()) @@ -184,7 +195,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): storyInProgress = chapterInProgress # If any chapter is adult, consider the whole story adult. - if chapter.isRatingAdult(): + if chapter.isAdult(): self.story.setMetadata('is_adult', True) chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:]) @@ -198,7 +209,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): self.chapterUrls.append((chapterTitle, url)) except ParsingError, error: - raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error)) + raise exceptions.FailedToDownload(u"Failed to download chapter `%s': %s" % (url, error)) # Some metadata are handled separately due to format conversions. 
self.story.setMetadata('status', 'In Progress' if storyInProgress else 'Completed') @@ -335,20 +346,36 @@ class Chapter(object): def getHeading(self): return self._extractHeading() + def getSummary(self): + attributes = self._getAttributes() + if 'summary' in attributes: + return attributes['summary'] + def getAuthorId(self): - return self._getAuthor()['id'] + author = self._getAuthor() + if author: + return author['id'] def getAuthorName(self): - return self._getAuthor()['name'] + author = self._getAuthor() + if author: + return author['name'] def getDate(self): return self._getDate() def getRatingTitle(self): - return self._getAttributes()['rating']['title'] + attributes = self._getAttributes() + if 'rating' in attributes: + return attributes['rating']['title'] - def isRatingAdult(self): - return self._getAttributes()['rating']['isAdult'] + def isAdult(self): + attributes = self._getAttributes() + if 'rating' in attributes and attributes['rating']['isAdult']: + return True + if 'warning' in attributes: + return True + return False def getCharacters(self): return self._getListAttribute('characters') @@ -522,8 +549,10 @@ class Chapter(object): raise ParsingError(u'Failed to locate and collect attributes.') separators = u"\r\n :;." + freestandingText = u'' for line in attributesText.split(u'\n'): if line.count(u':') != 1: + freestandingText += line continue key, value = line.split(u':', 1) key = key.strip(separators).lower() @@ -532,15 +561,20 @@ class Chapter(object): for parsedKey, parsedValue in parsed.iteritems(): attributes[parsedKey] = parsedValue + freestandingText = freestandingText.strip() + if 'summary' not in attributes and freestandingText: + attributes['summary'] = freestandingText + if 'rating' not in attributes: - raise ParsingError(u'Failed to locate or recognize rating!') + _logger.warning(u"Failed to locate or recognize rating for `%s'!", self.getUrl()) return attributes + # Most, but not all, URLs of rating icons match this. 
RATING_LABEL_PATTERN = re.compile(u'/(?P[ERATINnG]+)\.png$') def _parseRatingFromImage(self, element): - """Given an image element, tries to parse story rating from it.""" + """Given an image element, try to parse story rating from it.""" # Although deprecated, `has_key()' is required here. if not element.has_key('src'): return @@ -558,7 +592,7 @@ class Chapter(object): } else: _logger.warning(u"No title found for rating label `%s'!" % label) - # FIXME: It seems, rating has to be optional due to such URLs. + # TODO: conduct a research on such abnormal URLs. elif source == 'http://www.masseffect2.in/_fr/10/1360399.png': label = 'Nn' return { @@ -581,6 +615,9 @@ class Chapter(object): ''', re.IGNORECASE + re.UNICODE + re.VERBOSE) + # `Author's Notes' and its variants in Russian. + ANNOTATION_PATTERN = re.compile(u'аннотация|описание|(?:(?:за|при)мечание\s)?(?:от\s)?автора', re.UNICODE) + def _parseAttribute(self, key, value): """ Parse a single a single record in chapter attributes for chapter metadata. @@ -614,6 +651,14 @@ class Chapter(object): 'characters': characters, 'pairings': pairings } + elif key == u'предупреждение': + return {'warning': value} + elif re.match(self.ANNOTATION_PATTERN, key): + if not value.endswith(u'.'): + value += u'.' + # Capitalize would make value[1:] lowercase, which we don't want. + value = value[:1].upper() + value[1:] + return {'summary': value} else: _logger.debug(u"Unrecognized attribute `%s' ignored.", key) return {} From 5b01eef4958798a659500ed8be7bce044f9e00b0 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 21:26:41 +0300 Subject: [PATCH 13/18] Add documentation strings, make some methods private (__*). 
--- fanficfare/adapters/adapter_masseffect2in.py | 53 +++++++++++--------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 11199b1c..488dba87 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -280,7 +280,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): return self._parsingConfiguration def _getDocumentId(self, url): - """Extracts document ID from MassEffect2.in URL.""" + """Extract document ID from MassEffect2.in URL.""" match = re.search(self.DOCUMENT_ID_PATTERN, url) if not match: raise ValueError(u"Failed to extract document ID from `'" % url) @@ -289,12 +289,11 @@ class MassEffect2InAdapter(BaseSiteAdapter): @classmethod def _makeDocumentUrl(cls, documentId): - """Makes a chapter URL given a chapter ID.""" + """Make a chapter URL given a document ID.""" return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId) def _loadDocument(self, url): - """Fetches URL content and returns its element tree - with parsing settings tuned for MassEffect2.in.""" + """Fetch URL content and return its element tree with parsing settings tuned for MassEffect2.in.""" return bs.BeautifulStoneSoup( self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) @@ -302,7 +301,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): parameters=None, usecache=True, extrasleep=None): - """Fetches URL contents, see BaseSiteAdapter for details. + """Fetch URL contents, see BaseSiteAdapter for details. 
Overridden to support on-disk cache when debugging Calibre.""" from calibre.constants import DEBUG if DEBUG: @@ -347,7 +346,7 @@ class Chapter(object): return self._extractHeading() def getSummary(self): - attributes = self._getAttributes() + attributes = self.__getAttributes() if 'summary' in attributes: return attributes['summary'] @@ -362,15 +361,15 @@ class Chapter(object): return author['name'] def getDate(self): - return self._getDate() + return self.__getDate() def getRatingTitle(self): - attributes = self._getAttributes() + attributes = self.__getAttributes() if 'rating' in attributes: return attributes['rating']['title'] def isAdult(self): - attributes = self._getAttributes() + attributes = self.__getAttributes() if 'rating' in attributes and attributes['rating']['isAdult']: return True if 'warning' in attributes: @@ -387,7 +386,7 @@ class Chapter(object): return self._getListAttribute('genres') def isInProgress(self): - attributes = self._getAttributes() + attributes = self.__getAttributes() if 'isInProgress' in attributes: return attributes['isInProgress'] @@ -398,17 +397,17 @@ class Chapter(object): return self._getTextElement() def getPreviousChapterUrl(self): - """Downloads chapters following `Previous chapter' links. + """Download chapters following `Previous chapter' links. Returns a list of chapters' URLs.""" return self._getSiblingChapterUrl({'class': 'fl tal'}) def getNextChapterUrl(self): - """Downloads chapters following `Next chapter' links. + """Download chapters following `Next chapter' links. Returns a list of chapters' URLs.""" return self._getSiblingChapterUrl({'class': 'tar fr'}) def isFromStory(self, storyTitle, prefixThreshold=-1): - """Checks if this chapter is from a story different from the given one. + """Check if this chapter is from a story different from the given one. 
Prefix threshold specifies how long common story title prefix shall be for chapters from one story: negative value means implementation-defined optimum, zero inhibits the check, and positive value adjusts threshold.""" @@ -432,27 +431,30 @@ class Chapter(object): def _getListAttribute(self, name): """Return an attribute value as a list or an empty list if the attribute is absent.""" - attributes = self._getAttributes() + attributes = self.__getAttributes() if name in attributes: return attributes[name] return [] def _extractHeading(self): - """Extracts header text from the document.""" + """Extract header text from the document.""" return stripHTML( self._document.find('div', {'class': 'eTitle'}).string) def __getHeading(self): + """Lazily parse and return heading.""" if not self._heading: self._heading = self._extractHeading() return self._heading def _getAuthor(self): + """Lazily parse and return author's information.""" if not self._author: self._author = self._parseAuthor() return self._author def _parseAuthor(self): + """Locate and parse chapter author's information to a dictionary with author's `id' and `name'.""" try: authorLink = self._getInfoBarElement() \ .find('i', {'class': 'icon-user'}) \ @@ -469,12 +471,14 @@ class Chapter(object): 'name': authorName } - def _getDate(self): + def __getDate(self): + """Lazily parse chapter date.""" if not self._date: self._date = self._parseDate() return self._date def _parseDate(self): + """Locate and parse chapter date.""" try: dateText = self._getInfoBarElement() \ .find('i', {'class': 'icon-eye'}) \ @@ -486,13 +490,15 @@ class Chapter(object): return date def _getInfoBarElement(self): + """Locate informational bar element, containing chapter date and author, on the page.""" if not self._infoBar: self._infoBar = self._document.find('td', {'class': 'eDetails2'}) if not self._infoBar: raise ParsingError(u'No informational bar found.') return self._infoBar - def _getAttributes(self): + def __getAttributes(self): + 
"""Lazily parse attributes.""" if not self._attributes: self._attributes = self._parseAttributes() return self._attributes @@ -664,12 +670,13 @@ class Chapter(object): return {} def _getTextElement(self): + """Locate chapter body text element on the page.""" if not self._textElement: self._textElement = self.__collectTextElements() return self._textElement def __collectTextElements(self): - """Returns all elements containing parts of chapter text (which may be + """Return all elements containing parts of chapter text (which may be

<p>aragraphs, <div>
isions or plain text nodes) under a single root.""" starter = self._document.find('div', {'id': u'article'}) if starter is None: @@ -696,11 +703,9 @@ class Chapter(object): return root def _getSiblingChapterUrl(self, selector): - """Downloads chapters one by one by locating and following links - specified by a selector. Returns chapters' URLs in order they - were found.""" - block = self._document\ - .find('td', {'class': 'eDetails1'})\ + """Locate a link to a sibling chapter, either previous or next one, and return its URL.""" + block = self._document \ + .find('td', {'class': 'eDetails1'}) \ .find('div', selector) if not block: return @@ -709,9 +714,11 @@ class Chapter(object): return return link['href'] + # Editor signature always starts with something like this. SIGNED_PATTERN = re.compile(u'отредактирова(?:но|ла?)[:.\s]', re.IGNORECASE + re.UNICODE) def _excludeEditorSignature(self, root): + """Exclude editor signature from within `root' element.""" for textNode in root.findAll(text=True): if re.match(self.SIGNED_PATTERN, textNode.string): editorLink = textNode.findNext('a') From 9d5c64b5dbf47b3c0093a50f1d437fc5edd49e99 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 22:00:08 +0300 Subject: [PATCH 14/18] Remove development and debugging facilities. --- fanficfare/adapters/adapter_masseffect2in.py | 48 ++++---------------- 1 file changed, 9 insertions(+), 39 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 488dba87..8eb97bd2 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2011 Fanficdownloader team, -# 2015 FanFicFare team, -# 2015 Dmitry Kozliuk +# Copyright 2015 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -21,7 +19,6 @@ import datetime import logging import re import urllib2 -import codecs from .. import BeautifulSoup as bs from ..htmlcleanup import removeEntities, stripHTML @@ -48,12 +45,15 @@ class ParsingError(Exception): class MassEffect2InAdapter(BaseSiteAdapter): - """Provides support for masseffect2.in site as story source. - Can be used as a template for sites build upon Ucoz.com engine. + """ + Provides support for MassEffect2.in site as story source. + Can be used as a template for sites build upon Ucoz.com engine (until no base class extracted). Specializations: 1) Russian content (date format, genre names, etc.); - 2) original `R.A.T.I.N.G.' rating scale, used by masseffect2.in - and some affiliated sites.""" + 2) original `E.R.A.T.I.N.G.' rating scale, used by masseffect2.in + and some affiliated sites, denoted with images; + 3) editor signatures an an option to remove them. + """ WORD_PATTERN = re.compile(u'\w+', re.UNICODE) DOCUMENT_ID_PATTERN = re.compile(u'\d+-\d+-\d+-\d+') @@ -297,36 +297,6 @@ class MassEffect2InAdapter(BaseSiteAdapter): return bs.BeautifulStoneSoup( self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) - def _fetchUrl(self, url, - parameters=None, - usecache=True, - extrasleep=None): - """Fetch URL contents, see BaseSiteAdapter for details. 
- Overridden to support on-disk cache when debugging Calibre.""" - from calibre.constants import DEBUG - if DEBUG: - import os - documentId = self._getDocumentId(url) - path = u'./cache/%s' % documentId - if os.path.isfile(path) and os.access(path, os.R_OK): - _logger.debug(u"On-disk cache HIT for `%s'.", url) - with codecs.open(path, encoding='utf-8') as input: - return input.read() - else: - _logger.debug(u"On-disk cache MISS for `%s'.", url) - - content = BaseSiteAdapter._fetchUrl( - self, url, parameters, usecache, extrasleep) - - if DEBUG: - import os - if os.path.isdir(os.path.dirname(path)): - _logger.debug(u"Caching `%s' content on disk.", url) - with codecs.open(path, mode='w', encoding='utf-8') as output: - output.write(content) - - return content - class Chapter(object): """Represents a lazily-parsed chapter of a story.""" @@ -666,7 +636,7 @@ class Chapter(object): value = value[:1].upper() + value[1:] return {'summary': value} else: - _logger.debug(u"Unrecognized attribute `%s' ignored.", key) + _logger.info(u"Unrecognized attribute `%s' ignored.", key) return {} def _getTextElement(self): From 13dde78139d20770085dc140f3207367444b7cf6 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 22:07:50 +0300 Subject: [PATCH 15/18] Append warnings to story metadata. --- fanficfare/adapters/adapter_masseffect2in.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index 8eb97bd2..b8c924af 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -197,6 +197,9 @@ class MassEffect2InAdapter(BaseSiteAdapter): # If any chapter is adult, consider the whole story adult. 
if chapter.isAdult(): self.story.setMetadata('is_adult', True) + warning = chapter.getWarning() + if warning: + self.story.extendList('warnings', [warning]) chapterTitle = re.sub(garbagePattern, u'', chapter.getHeading()[chapterTitleStart:]) @@ -346,6 +349,11 @@ class Chapter(object): return True return False + def getWarning(self): + attributes = self.__getAttributes() + if 'warning' in attributes: + return attributes['warning'] + def getCharacters(self): return self._getListAttribute('characters') From db1cf8587c4315845dead91160c62bd454d97381 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Sun, 26 Jul 2015 23:08:14 +0300 Subject: [PATCH 16/18] Set lifted `slow_down_sleep_time' to prevent IP ban for excessive requests. --- calibre-plugin/plugin-defaults.ini | 5 +++++ fanficfare/defaults.ini | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index a68a5d99..a0832d90 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1226,6 +1226,11 @@ extracategories:Harry Potter ## Site dedicated to this fandom. extracategories: Mass Effect +## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency. +## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. +## With default settings, a several-hours IP ban may follow, so set it lifted. +slow_down_sleep_time: 2 + ## Whether to exclude editor signature from the bottom if chapter text. exclude_editor_signature: false diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index d9069cf8..856bfa33 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1827,6 +1827,11 @@ extracategories:Lord of the Rings ## Site dedicated to this fandom. extracategories: Mass Effect +## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency. 
+## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. +## With default settings, a several-hours IP ban may follow, so set it lifted. +slow_down_sleep_time: 2 + ## Whether to exclude editor signature from the bottom if chapter text. exclude_editor_signature: false From 1ad45299db34dc6598ded351eb8122441396d963 Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Mon, 27 Jul 2015 18:15:10 +0300 Subject: [PATCH 17/18] Fix mistakes and typos in configuration per JimmXinu's suggestion. See GitHub PR comments: https://github.com/JimmXinu/FanFicFare/pull/103#discussion_r35535523 https://github.com/JimmXinu/FanFicFare/pull/103#discussion_r35535396 --- calibre-plugin/plugin-defaults.ini | 10 +++++----- fanficfare/defaults.ini | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/calibre-plugin/plugin-defaults.ini b/calibre-plugin/plugin-defaults.ini index ebaa3f73..c8161ee9 100644 --- a/calibre-plugin/plugin-defaults.ini +++ b/calibre-plugin/plugin-defaults.ini @@ -1227,15 +1227,15 @@ extracategories:Harry Potter extracategories: Mass Effect ## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency. -## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. -## With default settings, a several-hours IP ban may follow, so set it lifted. +## Reports vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. +## With default settings, a several-hours IP ban may follow, so set it higher. slow_down_sleep_time: 2 -## Whether to exclude editor signature from the bottom if chapter text. +## Whether to exclude editor signature from the bottom of chapter text. exclude_editor_signature: false -## Stories on the site almost never have cover image. -## May be adjusted in `personal.ini' on per-story basis. 
+## Stories on the site almost never have cover image, and for the stories which do, +## this may be adjusted in `personal.ini' before downloading. never_make_cover: true ## Titles for ratings identified by 1- or 2-letter codes from `ERATING system' diff --git a/fanficfare/defaults.ini b/fanficfare/defaults.ini index b0a4754d..335d0462 100644 --- a/fanficfare/defaults.ini +++ b/fanficfare/defaults.ini @@ -1828,15 +1828,15 @@ extracategories:Lord of the Rings extracategories: Mass Effect ## Ucoz.com engine, upon which MassEffect2.in is based, imposes an unspecified limit on request frequency. -## Sources vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. -## With default settings, a several-hours IP ban may follow, so set it lifted. +## Reports vary from `5 requests per second' to `2 requests per second for more than 10 per minute'. +## With default settings, a several-hours IP ban may follow, so set it higher. slow_down_sleep_time: 2 -## Whether to exclude editor signature from the bottom if chapter text. +## Whether to exclude editor signature from the bottom of chapter text. exclude_editor_signature: false -## Stories on the site almost never have cover image. -## May be adjusted in `personal.ini' on per-story basis. +## Stories on the site almost never have cover image, and for the stories which do, +## this may be adjusted in `personal.ini' before downloading. never_make_cover: true ## Titles for ratings identified by 1- or 2-letter codes from `ERATING system' From 0ec1e8b779fbc74d1786982f199d60e776d8ebbf Mon Sep 17 00:00:00 2001 From: Dmitry Kozliuk Date: Mon, 27 Jul 2015 19:55:49 +0300 Subject: [PATCH 18/18] Switch MassEffect2.in adapter to BeautifulSoup 4.3. 
--- fanficfare/adapters/adapter_masseffect2in.py | 30 +++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/fanficfare/adapters/adapter_masseffect2in.py b/fanficfare/adapters/adapter_masseffect2in.py index b8c924af..e2b77cb3 100644 --- a/fanficfare/adapters/adapter_masseffect2in.py +++ b/fanficfare/adapters/adapter_masseffect2in.py @@ -15,12 +15,12 @@ # limitations under the License. # +import bs4 import datetime import logging import re import urllib2 -from .. import BeautifulSoup as bs from ..htmlcleanup import removeEntities, stripHTML from .. import exceptions as exceptions from base_adapter import BaseSiteAdapter, makeDate @@ -233,7 +233,7 @@ class MassEffect2InAdapter(BaseSiteAdapter): def _makeChapter(self, url): """Creates a chapter object given a URL.""" - document = self._loadDocument(url) + document = self.make_soup(self._fetchUrl(url)) chapter = Chapter(self._getParsingConfiguration(), url, document) return chapter @@ -295,11 +295,6 @@ class MassEffect2InAdapter(BaseSiteAdapter): """Make a chapter URL given a document ID.""" return 'http://%s/publ/%s' % (cls.getSiteDomain(), documentId) - def _loadDocument(self, url): - """Fetch URL content and return its element tree with parsing settings tuned for MassEffect2.in.""" - return bs.BeautifulStoneSoup( - self._fetchUrl(url), selfClosingTags=('br', 'hr', 'img')) - class Chapter(object): """Represents a lazily-parsed chapter of a story.""" @@ -495,7 +490,7 @@ class Chapter(object): def processElement(element): """Return textual representation an *inline* element of chapter attribute block.""" result = u'' - if isinstance(element, bs.Tag): + if isinstance(element, bs4.Tag): if element.name in ('b', 'strong', 'font', 'br'): result += u"\n" if element.name == 's': @@ -508,7 +503,7 @@ class Chapter(object): elements = starter.nextSiblingGenerator() for element in elements: - if isinstance(element, bs.Tag): + if isinstance(element, bs4.Tag): if element == bound: break else: @@ -521,7 
+516,7 @@ class Chapter(object): elements = starter.nextGenerator() for element in elements: - if isinstance(element, bs.Tag): + if isinstance(element, bs4.Tag): if element == bound: break elif element.name == 'img': @@ -560,7 +555,7 @@ class Chapter(object): def _parseRatingFromImage(self, element): """Given an image element, try to parse story rating from it.""" # Although deprecated, `has_key()' is required here. - if not element.has_key('src'): + if not element.has_attr('src'): return source = element['src'] if 'REITiNG' in source: @@ -659,19 +654,20 @@ class Chapter(object): starter = self._document.find('div', {'id': u'article'}) if starter is None: # FIXME: This will occur if the method is called more than once. - # The reason is elements appended to `root' are removed from - # the document. BS 4.4 implements cloning via `copy.copy()', - # but supporting it for earlier versions is error-prone - # (due to relying on BS internals). + # The reason is elements appended to `root' are removed from the document. + # BS 4.4 implements cloning via `copy.copy()', but supporting it for BS 4.3 + # would be error-prone (due to relying on BS internals) and is not needed. + if self._textElement: + _logger.debug(u"You may not call this function more than once!") raise ParsingError(u'Failed to locate text.') collection = [starter] for element in starter.nextSiblingGenerator(): if element is None: break - if isinstance(element, bs.Tag) and element.name == 'tr': + if isinstance(element, bs4.Tag) and element.name == 'tr': break collection.append(element) - root = bs.Tag(self._document, 'td') + root = bs4.Tag(name='td') for element in collection: root.append(element)