mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
add wuxiaworld.site adapter
This commit is contained in:
parent
12a5208ab2
commit
c991f3cd3a
6 changed files with 8533 additions and 0 deletions
|
|
@ -171,6 +171,7 @@ from . import adapter_silmarillionwritersguildorg
|
|||
from . import adapter_chireadscom
|
||||
from . import adapter_scribblehubcom
|
||||
from . import adapter_fictionlive
|
||||
from . import adapter_wuxiaworldsite
|
||||
|
||||
## This bit of complexity allows adapters to be added by just
## importing them. It eliminates the long if/else clauses we used to need
|
||||
|
|
|
|||
145
fanficfare/adapters/adapter_wuxiaworldsite.py
Normal file
145
fanficfare/adapters/adapter_wuxiaworldsite.py
Normal file
|
|
@ -0,0 +1,145 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2016 Fanficdownloader team
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Adapted by GComyn on December 14. 2016
|
||||
|
||||
from __future__ import absolute_import
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
# py2 vs py3 transition
|
||||
from ..six import text_type as unicode
|
||||
from ..six.moves.urllib import parse as urlparse
|
||||
from ..six.moves.urllib.error import HTTPError
|
||||
from ..dateutils import parse_relative_date_string
|
||||
|
||||
from .base_adapter import BaseSiteAdapter, makeDate
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def getClass():
    """Registry hook: return the adapter class implemented by this module.

    FanFicFare's adapter loader calls each adapter module's ``getClass()``
    to discover which class handles the module's site.
    """
    return WuxiaWorldSiteSiteAdapter
|
||||
|
||||
|
||||
class WuxiaWorldSiteSiteAdapter(BaseSiteAdapter):
    """Adapter for stories hosted on wuxiaworld.site.

    Story pages are WordPress/Madara-style: metadata comes from OpenGraph
    meta tags, an ``application/ld+json`` linked-data block, and the
    ``.wp-manga-chapter`` chapter listing.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'wuxsite')
        # Dates in the ld+json block look like ISO timestamps; we keep only
        # the date part (see _parse_date), hence this format.
        self._dateformat = '%Y-%m-%d'

        # get storyId from url--url validation guarantees query correct
        match = re.match(self.getSiteURLPattern(), url)
        if not match:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())

        story_id = match.group('id')
        self.story.setMetadata('storyId', story_id)
        # Normalize to the canonical https story URL.
        self._setURL('https://%s/novel/%s' % (self.getSiteDomain(), story_id))

    @staticmethod
    def getSiteDomain():
        return 'wuxiaworld.site'

    @classmethod
    def getSiteExampleURLs(cls):
        return 'https://%s/novel/story-name' % cls.getSiteDomain()

    def getSiteURLPattern(self):
        # <id> is the story slug; trailing slash is optional.
        return r'https?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())

    def use_pagecache(self):
        # Allow fetched pages to be reused from FanFicFare's page cache.
        return True

    def _parse_linked_data(self, soup):
        """Return the page's JSON-LD payload as a dict, or {} if absent.

        See https://json-ld.org
        """
        tag = soup.find('script', type='application/ld+json')
        if not tag:
            return {}
        return json.loads(tag.string)

    def _parse_date(self, text):
        """Parse an ISO-ish date string, discarding any time-of-day part."""
        # Strip time from date--site doesn't seem to have it anymore.
        text = re.sub(r'T.*$', '', text)
        return makeDate(text, self._dateformat)

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate metadata and the chapter list.

        Raises StoryDoesNotExist on a 404; re-raises other HTTP errors.
        """
        logger.debug('URL: %s', self.url)
        try:
            data = self._fetchUrl(self.url)
        except HTTPError as exception:
            if exception.code == 404:
                raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url))
            raise exception

        soup = self.make_soup(data)

        # og:title is "<story title> - <site suffix>"; keep only the title.
        self.story.setMetadata('title', soup.find('meta', {'property': 'og:title'})['content'].split(' - ')[0])

        author_name = soup.select_one('.author-content > a').get_text()
        self.story.setMetadata('author', author_name)
        self.story.setMetadata('authorId', author_name.lower())

        ld = self._parse_linked_data(soup)
        # BUGFIX: _parse_linked_data() returns {} when the page carries no
        # ld+json block, so ld['@graph'] would raise KeyError; likewise a
        # graph node may lack '@type'.  Use .get() defaults so date handling
        # degrades gracefully instead of crashing.
        webpage_graph = [g for g in ld.get('@graph', []) if g.get('@type') == 'WebPage']
        date_updated_webpage = None
        if webpage_graph:
            webpage = webpage_graph[0]

            str_date_published = webpage['datePublished']
            date_published = self._parse_date(str_date_published)
            self.story.setMetadata('datePublished', date_published)

            str_date_updated_webpage = webpage['dateModified']
            date_updated_webpage = self._parse_date(str_date_updated_webpage)

        # The newest chapter's release date is shown either as a relative
        # string ("2 days ago") or as an absolute "Month DD, YYYY" date.
        str_date_updated_last_chapter = soup.select_one('.chapter-release-date').i.get_text()
        if str_date_updated_last_chapter[-4:] == ' ago':
            date_updated_last_chapter = parse_relative_date_string(str_date_updated_last_chapter[:-4])
        else:
            date_updated_last_chapter = makeDate(str_date_updated_last_chapter, '%B %d, %Y')

        # Prefer whichever "updated" signal is most recent.
        date_updated = date_updated_last_chapter if date_updated_webpage is None else max(date_updated_webpage, date_updated_last_chapter)
        self.story.setMetadata('dateUpdated', date_updated)

        # Completion status lives among the summary-content blurbs.
        tags = [stripHTML(a) for a in soup.select('.post-status .summary-content')]
        for tag in tags:
            if 'Completed' == tag:
                self.story.setMetadata('status', 'Completed')
            elif 'OnGoing' == tag:
                self.story.setMetadata('status', 'In-Progress')

        self.setCoverImage(self.url, soup.find('meta', {'property': "og:image"})['content'])

        description = ' '.join([stripHTML(a) for a in soup.select('.summary__content p')])
        self.setDescription(self.url, description)

        # Site lists chapters newest-first; reverse into reading order.
        chapter_list = soup.select('.wp-manga-chapter > a')
        chapter_list.reverse()
        for a in chapter_list:
            title = stripHTML(a)
            url = urlparse.urljoin(self.url, a['href'])
            self.add_chapter(title, url)

    def getChapterText(self, url):
        """Fetch one chapter page and return its cleaned-up HTML body."""
        logger.debug('Getting chapter text from: %s', url)
        data = self._fetchUrl(url)
        soup = self.make_soup(data)
        content = soup.select_one('.reading-content')

        return self.utf8FromSoup(url, content)
|
||||
|
|
@ -49,6 +49,7 @@ def parse_relative_date_string(string_):
|
|||
'weeks': 'weeks',
|
||||
'second': 'seconds',
|
||||
'minute': 'minutes',
|
||||
'mins': 'minutes',
|
||||
'hour': 'hours',
|
||||
'day': 'days',
|
||||
'week': 'weeks',
|
||||
|
|
|
|||
70
tests/adapters/test_adapter_wuxiaworldsite.py
Normal file
70
tests/adapters/test_adapter_wuxiaworldsite.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
import pytest
|
||||
|
||||
from unittest.mock import patch
|
||||
|
||||
from fanficfare.adapters.adapter_wuxiaworldsite import WuxiaWorldSiteSiteAdapter as adapter_for_tests
|
||||
from datetime import datetime
|
||||
|
||||
from tests.adapters.generic_adapter_test import GenericAdapterTestExtractChapterUrlsAndMetadata, GenericAdapterTestGetChapterText
|
||||
from tests.conftest import wuxiaworldsite_html_return, wuxiaworldsite_html_chapter_return
|
||||
|
||||
# Expected values for the wuxiaworld.site adapter tests, consumed by the
# generic adapter-test mixins below (metadata extraction + chapter text).
SPECIFIC_TEST_DATA = {
    # Adapter under test, the story URL used to instantiate it, the config
    # sections to enable, and the dotted patch path for mocking fetches.
    'adapter': adapter_for_tests,
    'url': 'https://wuxiaworld.site/novel/some-story',
    'sections': ["wuxiaworld.site"],
    'specific_path_adapter': 'adapter_wuxiaworldsite.WuxiaWorldSiteSiteAdapter',

    # Metadata expected after extractChapterUrlsAndMetadata() against the
    # recorded story-page fixture.
    'title': 'The Tutorial Is Too Hard',
    'cover_image': 'https://wuxiaworld.site/wp-content/uploads/2019/04/the-tutorial-is-too-hard-193x278.jpg',
    'author': 'Gandara',
    'authorId': 'gandara',
    # NOTE(review): the fixture's last-chapter date is presumably a relative
    # "... ago" string, which resolves to the current day — hence comparing
    # against today's date here; confirm against the fixture HTML.
    'dateUpdated': datetime.utcnow().strftime('%Y-%m-%d'),
    'intro': "Read The Tutorial Is Too Hard Novel at WuxiaWorld.Site On a normal boring day, a message appears, inviting him to a Tutorial. A tale about Lee Ho Jae and his escape from the Tutorial. But he just happened to choose the hardest possible difficulty: Hell. Disclaimer:Neither the picture nor the content belongs to me. They are uploaded here, not for any bad purpose but for entertainment only. Disclaimer:If this novel is yours, please let us share this novel to everyone else and send us your credit. We display your credit to this novel! If you don’t please tell us too, We respect your decision.",
    # Spot-checked entries of the (reading-order) chapter list, keyed by index.
    'expected_chapters': {
        0: {'title': 'Chapter 1',
            'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-1'},
        10: {'title': 'Chapter 11',
             'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-11'},
        100: {'title': 'Chapter 101',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-101'},
        190: {'title': 'Chapter 191 - Tutorial 35th Floor (10) (part 1)',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-191'},
        191: {'title': 'Chapter 191B - Tutorial 35th Floor (10) (part 2)',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-191b'},
    },
    # Fixture returning the recorded story-page HTML.
    'list_chapters_fixture': wuxiaworldsite_html_return,

    # Chapter-text test: chapter URL, sentences that must appear in the
    # cleaned output, and the fixture returning the recorded chapter HTML.
    'chapter_url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapitre-1-some-title/2020/02/08/',
    'expected_sentences': [
        "Life is a series of choices.",
        "I always loved novels and cartoons. I’ve dreamt of those fantasy-like events happening to me.",
        "[Will you enter the Tutorial world?]",
        "[Choose the Tutorial difficulty. Depending on the difficulty, the dangers of the Tutorial stages increase along with the growth rate and reward.]"
    ],
    'chapter_fixture': wuxiaworldsite_html_chapter_return,
}
|
||||
|
||||
|
||||
class TestExtractChapterUrlsAndMetadata(GenericAdapterTestExtractChapterUrlsAndMetadata):
    """Metadata-extraction tests for the wuxiaworld.site adapter.

    All expectations come from SPECIFIC_TEST_DATA; the generic base class
    drives the adapter against the recorded story-page fixture.
    """

    def setup_method(self):
        data = SPECIFIC_TEST_DATA
        self.expected_data = data

        super().setup_method(
            data['adapter'],
            data['url'],
            data['sections'],
            data['specific_path_adapter'],
            data['list_chapters_fixture'],
        )
|
||||
|
||||
|
||||
class TestGetChapterText(GenericAdapterTestGetChapterText):
    """Chapter-text tests for the wuxiaworld.site adapter.

    All expectations come from SPECIFIC_TEST_DATA; the generic base class
    drives the adapter against the recorded chapter-page fixture.
    """

    def setup_method(self):
        data = SPECIFIC_TEST_DATA
        self.expected_data = data

        super().setup_method(
            data['adapter'],
            data['url'],
            data['sections'],
            data['specific_path_adapter'],
            data['chapter_fixture'],
        )
|
||||
|
||||
|
|
@ -1 +1,2 @@
|
|||
from fixtures_chireads import *
|
||||
from fixtures_wuxiaworldsite import *
|
||||
|
|
|
|||
8315
tests/fixtures_wuxiaworldsite.py
Normal file
8315
tests/fixtures_wuxiaworldsite.py
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in a new issue