add wuxiaworld.site adapter

This commit is contained in:
Kolbo 2020-09-09 11:51:33 +02:00
parent 12a5208ab2
commit c991f3cd3a
6 changed files with 8533 additions and 0 deletions

View file

@ -171,6 +171,7 @@ from . import adapter_silmarillionwritersguildorg
from . import adapter_chireadscom
from . import adapter_scribblehubcom
from . import adapter_fictionlive
from . import adapter_wuxiaworldsite
## This bit of complexity allows adapters to be added just by
## importing them. It eliminates the long if/else clauses we used to need

View file

@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
# Copyright 2016 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Adapted by GComyn on December 14, 2016
from __future__ import absolute_import
import json
import logging
import re
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib import parse as urlparse
from ..six.moves.urllib.error import HTTPError
from ..dateutils import parse_relative_date_string
from .base_adapter import BaseSiteAdapter, makeDate
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
logger = logging.getLogger(__name__)
def getClass():
    """Return the adapter class defined by this module."""
    return WuxiaWorldSiteSiteAdapter
class WuxiaWorldSiteSiteAdapter(BaseSiteAdapter):
    """Site adapter for novels hosted on wuxiaworld.site.

    Story URLs have the form ``https://wuxiaworld.site/novel/<story-id>``.
    Metadata is read from the story page's meta tags, its JSON-LD block
    (when present) and the chapter list markup.
    """

    def __init__(self, config, url):
        """Set up the adapter for the story at *url*.

        Raises:
            exceptions.InvalidStoryURL: if *url* does not match
                getSiteURLPattern().
        """
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'wuxsite')

        # Format of the date part of the JSON-LD timestamps (see _parse_date).
        self._dateformat = '%Y-%m-%d'

        # The story id is the path segment that follows /novel/.
        match = re.match(self.getSiteURLPattern(), url)
        if not match:
            raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())

        story_id = match.group('id')
        self.story.setMetadata('storyId', story_id)
        # Normalize whatever URL was given to the canonical story URL.
        self._setURL('https://%s/novel/%s' % (self.getSiteDomain(), story_id))

    @staticmethod
    def getSiteDomain():
        """Return the domain served by this adapter."""
        return 'wuxiaworld.site'

    @classmethod
    def getSiteExampleURLs(cls):
        """Return an example story URL for this site."""
        return 'https://%s/novel/story-name' % cls.getSiteDomain()

    def getSiteURLPattern(self):
        """Return the regex for story URLs; the ``id`` group is the story id."""
        return r'https?://%s/novel/(?P<id>[^/]+)(/)?' % re.escape(self.getSiteDomain())

    def use_pagecache(self):
        """Return True to opt in to page caching.

        NOTE(review): presumably consulted by the base adapter -- confirm.
        """
        return True

    def _parse_linked_data(self, soup):
        """Return the page's JSON-LD payload, or ``{}`` when there is none.

        See https://json-ld.org
        """
        tag = soup.find('script', type='application/ld+json')
        # tag.string is None when the script element has no single text
        # child; json.loads(None) would raise TypeError.
        if not tag or not tag.string:
            return {}
        return json.loads(tag.string)

    def _parse_date(self, text):
        """Parse the date part of an ISO-8601 timestamp such as
        ``2020-02-08T10:00:00+00:00``."""
        # Strip time from date--site doesn't seem to have it anymore.
        text = re.sub(r'T.*$', '', text)
        return makeDate(text, self._dateformat)

    @staticmethod
    def _find_webpage_node(linked_data):
        """Return the first ``WebPage`` node of the JSON-LD ``@graph``, or None.

        Tolerates a missing or non-dict payload, a missing ``@graph`` and
        nodes without ``@type``, so that pages lacking this metadata do not
        abort the download.
        """
        if not isinstance(linked_data, dict):
            return None
        for node in linked_data.get('@graph', []):
            if isinstance(node, dict) and node.get('@type') == 'WebPage':
                return node
        return None

    def _parse_last_chapter_date(self, soup):
        """Return the release date shown in the chapter list, or None if absent.

        The site shows either a relative date ("2 hours ago") or an absolute
        one ("September 9, 2020").
        """
        release = soup.select_one('.chapter-release-date')
        if release is None or release.i is None:
            return None
        text = release.i.get_text().strip()
        if not text:
            return None
        suffix = ' ago'
        if text.endswith(suffix):
            return parse_relative_date_string(text[:-len(suffix)])
        return makeDate(text, '%B %d, %Y')

    def extractChapterUrlsAndMetadata(self):
        """Fetch the story page and populate story metadata and chapter list.

        Raises:
            exceptions.StoryDoesNotExist: if the story page returns HTTP 404.
            HTTPError: for any other HTTP failure while fetching the page.
        """
        logger.debug('URL: %s', self.url)
        try:
            data = self._fetchUrl(self.url)
        except HTTPError as exception:
            if exception.code == 404:
                raise exceptions.StoryDoesNotExist('404 error: {}'.format(self.url))
            # Bare raise is the idiomatic re-raise and keeps the original
            # traceback (notably on Python 2).
            raise

        soup = self.make_soup(data)

        # og:title looks like "<story title> - <site suffix>"; keep the first part.
        og_title = soup.find('meta', {'property': 'og:title'})['content']
        self.story.setMetadata('title', og_title.split(' - ')[0])

        author_name = soup.select_one('.author-content > a').get_text()
        self.story.setMetadata('author', author_name)
        self.story.setMetadata('authorId', author_name.lower())

        # Dates from JSON-LD are optional; the chapter list is the fallback.
        date_updated_webpage = None
        webpage = self._find_webpage_node(self._parse_linked_data(soup))
        if webpage is not None:
            if webpage.get('datePublished'):
                self.story.setMetadata('datePublished', self._parse_date(webpage['datePublished']))
            if webpage.get('dateModified'):
                date_updated_webpage = self._parse_date(webpage['dateModified'])

        date_updated_last_chapter = self._parse_last_chapter_date(soup)

        # Use the most recent of the dates that could be determined.
        known_dates = [d for d in (date_updated_webpage, date_updated_last_chapter) if d is not None]
        if known_dates:
            self.story.setMetadata('dateUpdated', max(known_dates))

        # Map the site's status labels onto the status metadata values.
        status_by_label = {'Completed': 'Completed', 'OnGoing': 'In-Progress'}
        for node in soup.select('.post-status .summary-content'):
            status = status_by_label.get(stripHTML(node))
            if status is not None:
                self.story.setMetadata('status', status)

        self.setCoverImage(self.url, soup.find('meta', {'property': "og:image"})['content'])

        description = ' '.join([stripHTML(a) for a in soup.select('.summary__content p')])
        self.setDescription(self.url, description)

        # Chapters are added in the reverse of page order.
        # NOTE(review): presumably the page lists newest first -- confirm.
        for link in reversed(soup.select('.wp-manga-chapter > a')):
            self.add_chapter(stripHTML(link), urlparse.urljoin(self.url, link['href']))

    def getChapterText(self, url):
        """Fetch the chapter at *url* and return its ``.reading-content`` markup."""
        logger.debug('Getting chapter text from: %s', url)
        data = self._fetchUrl(url)
        soup = self.make_soup(data)
        content = soup.select_one('.reading-content')
        return self.utf8FromSoup(url, content)

View file

@ -49,6 +49,7 @@ def parse_relative_date_string(string_):
'weeks': 'weeks',
'second': 'seconds',
'minute': 'minutes',
'mins': 'minutes',
'hour': 'hours',
'day': 'days',
'week': 'weeks',

View file

@ -0,0 +1,70 @@
import pytest
from unittest.mock import patch
from fanficfare.adapters.adapter_wuxiaworldsite import WuxiaWorldSiteSiteAdapter as adapter_for_tests
from datetime import datetime
from tests.adapters.generic_adapter_test import GenericAdapterTestExtractChapterUrlsAndMetadata, GenericAdapterTestGetChapterText
from tests.conftest import wuxiaworldsite_html_return, wuxiaworldsite_html_chapter_return
# Expected values and fixtures shared by both test classes below.
SPECIFIC_TEST_DATA = {
    'adapter': adapter_for_tests,
    'url': 'https://wuxiaworld.site/novel/some-story',
    'sections': ["wuxiaworld.site"],
    'specific_path_adapter': 'adapter_wuxiaworldsite.WuxiaWorldSiteSiteAdapter',
    'title': 'The Tutorial Is Too Hard',
    'cover_image': 'https://wuxiaworld.site/wp-content/uploads/2019/04/the-tutorial-is-too-hard-193x278.jpg',
    'author': 'Gandara',
    'authorId': 'gandara',
    # Evaluated once at import time as today's UTC date.
    # NOTE(review): presumably the fixture's latest chapter carries a
    # relative "... ago" date -- confirm against the fixture.
    'dateUpdated': datetime.utcnow().strftime('%Y-%m-%d'),
    'intro': "Read The Tutorial Is Too Hard Novel at WuxiaWorld.Site On a normal boring day, a message appears, inviting him to a Tutorial. A tale about Lee Ho Jae and his escape from the Tutorial. But he just happened to choose the hardest possible difficulty: Hell. Disclaimer:Neither the picture nor the content belongs to me. They are uploaded here, not for any bad purpose but for entertainment only. Disclaimer:If this novel is yours, please let us share this novel to everyone else and send us your credit. We display your credit to this novel! If you dont please tell us too, We respect your decision.",
    # Keyed by integer index; each entry gives a chapter's expected title and URL.
    'expected_chapters': {
        0: {'title': 'Chapter 1',
            'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-1'},
        10: {'title': 'Chapter 11',
             'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-11'},
        100: {'title': 'Chapter 101',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-101'},
        190: {'title': 'Chapter 191 - Tutorial 35th Floor (10) (part 1)',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-191'},
        191: {'title': 'Chapter 191B - Tutorial 35th Floor (10) (part 2)',
              'url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapter-191b'},
    },
    'list_chapters_fixture': wuxiaworldsite_html_return,
    'chapter_url': 'https://wuxiaworld.site/novel/the-tutorial-is-too-hard/chapitre-1-some-title/2020/02/08/',
    'expected_sentences': [
        "Life is a series of choices.",
        "I always loved novels and cartoons. Ive dreamt of those fantasy-like events happening to me.",
        "[Will you enter the Tutorial world?]",
        "[Choose the Tutorial difficulty. Depending on the difficulty, the dangers of the Tutorial stages increase along with the growth rate and reward.]"
    ],
    'chapter_fixture': wuxiaworldsite_html_chapter_return,
}
class TestExtractChapterUrlsAndMetadata(GenericAdapterTestExtractChapterUrlsAndMetadata):
    """Binds the wuxiaworld.site test data to the generic metadata test case."""

    def setup_method(self):
        """Expose the expected data and run the parent setup with the
        story-page fixture."""
        self.expected_data = SPECIFIC_TEST_DATA
        # Positional arguments of the parent setup_method, in order.
        setup_keys = (
            'adapter',
            'url',
            'sections',
            'specific_path_adapter',
            'list_chapters_fixture',
        )
        setup_args = [SPECIFIC_TEST_DATA[key] for key in setup_keys]
        super().setup_method(*setup_args)
class TestGetChapterText(GenericAdapterTestGetChapterText):
    """Binds the wuxiaworld.site test data to the generic chapter-text test case."""

    def setup_method(self):
        """Expose the expected data and run the parent setup with the
        chapter-page fixture."""
        self.expected_data = SPECIFIC_TEST_DATA
        # Positional arguments of the parent setup_method, in order.
        setup_keys = (
            'adapter',
            'url',
            'sections',
            'specific_path_adapter',
            'chapter_fixture',
        )
        setup_args = [SPECIFIC_TEST_DATA[key] for key in setup_keys]
        super().setup_method(*setup_args)

View file

@ -1 +1,2 @@
from fixtures_chireads import *
from fixtures_wuxiaworldsite import *

File diff suppressed because one or more lines are too long