# -*- coding: utf-8 -*- # Copyright 2024 FanFicFare team # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import absolute_import import io import logging import re import zipfile from bs4 import BeautifulSoup # py2 vs py3 transition from .base_adapter import BaseSiteAdapter, makeDate from fanficfare.htmlcleanup import stripHTML from .. import exceptions as exceptions logger = logging.getLogger(__name__) def getClass(): return FanfictionsFrSiteAdapter class FanfictionsFrSiteAdapter(BaseSiteAdapter): def __init__(self, config, url): BaseSiteAdapter.__init__(self, config, url) self.story.setMetadata('siteabbrev', 'fanfictionsfr') self.story.setMetadata('langcode','fr') self.story.setMetadata('language','Français') # get storyId from url--url validation guarantees query correct match = re.match(self.getSiteURLPattern(), url) if not match: raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs()) story_id = match.group('id') self.story.setMetadata('storyId', story_id) fandom_name = match.group('fandom') self._setURL('https://%s/fanfictions/%s/%s/chapters.html' % (self.getSiteDomain(), fandom_name, story_id)) @staticmethod def getSiteDomain(): return 'www.fanfictions.fr' @classmethod def getSiteExampleURLs(cls): return 'https://%s/fanfictions/fandom/fanfiction-id/chapters.html' % cls.getSiteDomain() def getSiteURLPattern(self): return r'https?://(?:www\.)?fanfictions\.fr/fanfictions/(?P[^/]+)/(?P[^/]+)(/chapters.html)?' def extractChapterUrlsAndMetadata(self): logger.debug('URL: %s', self.url) data = self.get_request(self.url) soup = self.make_soup(data) # detect if the fanfiction is 'suspended' (chapters unavailable) alert_div = soup.find('div', id='alertInactiveFic') if alert_div: raise exceptions.FailedToDownload("Failed to download the fanfiction, most likely because it is suspended.") title_element = soup.find('h1', itemprop='name') self.story.setMetadata('title', stripHTML(title_element)) author_div = soup.find('div', itemprop='author') author_name = stripHTML(author_div.a) author_id = author_div.a['href'].split('/')[-1].replace('.html', '') self.story.setMetadata('author', author_name) self.story.setMetadata('authorId', author_id) published_date_element = soup.find('span', class_='date-distance') published_date_text = published_date_element['data-date'] published_date = makeDate(published_date_text, '%Y-%m-%d %H:%M:%S') if published_date: self.story.setMetadata('datePublished', published_date) status_element = soup.find('p', title="Statut de la fanfiction").find('span', class_='badge') french_status = stripHTML(status_element) status_translation = { "En cours": "In-Progress", "Terminée": "Completed", "One-shot": "Completed", } self.story.setMetadata('status', status_translation.get(french_status, french_status)) genre_elements = soup.find('div', title="Format et genres").find_all('span', class_="highlightable") self.story.extendList('genre', [ stripHTML(genre) for genre in genre_elements[1:] ]) category_elements = soup.find_all('li', class_="breadcrumb-item") self.story.extendList('category', [ stripHTML(category) for category in category_elements[-2].find_all('a') ]) first_description = soup.find('p', itemprop='abstract') self.setDescription(self.url, first_description) chapter_cards = soup.find_all(class_=['card', 'chapter']) for chapter_card in chapter_cards: chapter_title_tag = chapter_card.find('h2') if chapter_title_tag: chapter_title = stripHTML(chapter_title_tag) chapter_link = 'https://'+self.getSiteDomain()+chapter_title_tag.find('a')['href'] # Clean up the chapter title by replacing multiple spaces and newline characters with a single space chapter_title = re.sub(r'\s+', ' ', chapter_title) self.add_chapter(chapter_title, chapter_link) last_chapter_div = chapter_cards[-1] updated_date_element = last_chapter_div.find('span', class_='date-distance') last_chapter_update_date = updated_date_element['data-date'] date = makeDate(last_chapter_update_date, '%Y-%m-%d %H:%M:%S') if date: self.story.setMetadata('dateUpdated', date) def getChapterText(self, url): logger.debug('Getting chapter text from: %s' % url) response, redirection_url = self.get_request_redirected(url) if "telecharger_pdf.html" in redirection_url: with zipfile.ZipFile(io.BytesIO(response.encode('latin1'))) as z: # Assuming there's only one text file inside the zip file_list = z.namelist() if len(file_list) != 1: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Zip file should contain exactly one text file!" % url) text_filename = file_list[0] with z.open(text_filename) as text_file: # Decode the text file with windows-1252 encoding text = text_file.read().decode('windows-1252') return text.replace("\r\n", "
\r\n") else: soup = self.make_soup(response) div_content = soup.find('div', id='readarea') if div_content is None: raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url) return self.utf8FromSoup(url, div_content)