mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-05-03 04:14:35 +02:00
New Site: SpiritFanfiction.com (#1078)
Add support for spiritfanfiction.com
This commit is contained in:
parent
c468c26208
commit
33b2b10bf3
4 changed files with 369 additions and 0 deletions
|
|
@ -3856,3 +3856,18 @@ use_basic_cache:true
|
|||
## specific to wuxiaworld.xyz order and dedup chapters.
|
||||
dedup_order_chapter_list:false
|
||||
|
||||
[www.spiritfanfiction.com]
|
||||
extra_valid_entries: freeformtags,bookmarks,comments,hits,kudos
|
||||
|
||||
freeformtags_label:Freeform Tags
|
||||
bookmarks_label:Bookmarks
|
||||
comments_label:Comments
|
||||
hits_label:Hits
|
||||
kudos_label:Kudos
|
||||
|
||||
## All notes are included by default, but this allows
|
||||
## you to customize which you include. Copy this parameter to your
|
||||
## personal.ini and list the ones you don't want.
|
||||
#exclude_notes:chapterheadnotes,chapterfootnotes
|
||||
|
||||
website_encodings: utf8, Windows-1252, iso-8859-1
|
||||
|
|
@ -140,6 +140,7 @@ from . import adapter_syosetucom
|
|||
from . import adapter_kakuyomujp
|
||||
from . import adapter_fanfictionsfr
|
||||
from . import adapter_touchfluffytail
|
||||
from . import adapter_spiritfanfictioncom
|
||||
|
||||
## This bit of complexity allows adapters to be added by just adding
|
||||
## importing. It eliminates the long if/else clauses we used to need
|
||||
|
|
|
|||
338
fanficfare/adapters/adapter_spiritfanfictioncom.py
Normal file
338
fanficfare/adapters/adapter_spiritfanfictioncom.py
Normal file
|
|
@ -0,0 +1,338 @@
|
|||
from __future__ import absolute_import
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from ..htmlcleanup import stripHTML
|
||||
from .. import exceptions as exceptions
|
||||
|
||||
# py2 vs py3 transition
|
||||
from ..six import text_type as unicode
|
||||
|
||||
from .base_adapter import BaseSiteAdapter, makeDate
|
||||
|
||||
def getClass():
|
||||
return SpiritFanfictionComAdapter
|
||||
|
||||
class SpiritFanfictionComAdapter(BaseSiteAdapter):
|
||||
|
||||
def __init__(self, config, url):
|
||||
BaseSiteAdapter.__init__(self, config, url)
|
||||
|
||||
# get storyId from url--url validation guarantees query is only sid=1234
|
||||
self.storyId = unicode(self.getStoryId(url))
|
||||
self.story.setMetadata('storyId', self.storyId)
|
||||
|
||||
# normalized story URL
|
||||
self._setURL('https://' + self.getSiteDomain() + '/historia/'+self.story.getMetadata('storyId'))
|
||||
|
||||
# Each adapter needs to have a unique site abbreviation.
|
||||
self.story.setMetadata('siteabbrev',self.getSiteAbbrev())
|
||||
|
||||
# The date format will vary from site to site.
|
||||
# http://docs.python.org/library/datetime.html#strftime-strptime-behavior
|
||||
self.dateformat = "%Y-%m-%dT%H:%M:%S%z"
|
||||
|
||||
self.chapter_photoUrl = {}
|
||||
|
||||
|
||||
@staticmethod
|
||||
def getSiteDomain():
|
||||
return 'www.spiritfanfiction.com'
|
||||
|
||||
|
||||
@classmethod
|
||||
def getAcceptDomains(cls):
|
||||
return ['www.spiritfanfiction.com',
|
||||
'www.socialspirit.com.br',
|
||||
]
|
||||
|
||||
|
||||
@classmethod
|
||||
def getSiteExampleURLs(cls):
|
||||
#Accepted formats
|
||||
#https://www.spiritfanfiction.com/historia/1234
|
||||
#https://www.spiritfanfiction.com/historia/story-name-1234
|
||||
return "https://"+cls.getSiteDomain()+"/historia/story-name-1234 https://"+cls.getSiteDomain()+"/historia/1234"
|
||||
|
||||
|
||||
@classmethod
|
||||
def getSiteURLPattern(self):
|
||||
#logger.debug(r"https?://(" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r")/historia/(?:[a-zA-Z0-9-]+-)?(?P<storyId>\d+)")
|
||||
return r"https?://(" + r"|".join([x.replace('.','\.') for x in self.getAcceptDomains()]) + r")/historia/(?:[a-zA-Z0-9-]+-)?(?P<storyId>\d+)"
|
||||
|
||||
|
||||
@classmethod
|
||||
def getSiteAbbrev(cls):
|
||||
return 'spirit'
|
||||
|
||||
|
||||
def getStoryId(self, url):
|
||||
|
||||
# get storyId from url--url validation guarantees query correct
|
||||
m = re.match(self.getSiteURLPattern(), url)
|
||||
if m:
|
||||
return m.group('storyId')
|
||||
else:
|
||||
raise exceptions.InvalidStoryURL(url, self.getSiteDomain(), self.getSiteExampleURLs())
|
||||
|
||||
|
||||
def extractChapterUrlsAndMetadata(self):
|
||||
|
||||
data = self.get_request(self.url)
|
||||
# use BeautifulSoup HTML parser to make everything easier to find.
|
||||
soup = self.make_soup(data)
|
||||
# Now go hunting for all the meta data and the chapter list.
|
||||
|
||||
# Title
|
||||
title = soup.find('h1', {'class':'tituloPrincipal'})
|
||||
self.story.setMetadata('title', stripHTML(title.find('strong')))
|
||||
|
||||
# Authors
|
||||
# Find authorid and URL
|
||||
authors = soup.findAll('span', {'class':'usuario'})
|
||||
|
||||
for author in authors:
|
||||
self.story.addToList('authorId', author.find('a')['href'].split('/')[-1])
|
||||
self.story.addToList('authorUrl', author.find('a')['href'])
|
||||
self.story.addToList('author', stripHTML(author.find('strong')))
|
||||
|
||||
# Cover image
|
||||
cover_img = soup.find('img', {'class':'imagemResponsiva'})
|
||||
if cover_img:
|
||||
self.setCoverImage(self.url, cover_img['src'])
|
||||
|
||||
newestChapter = None
|
||||
self.newestChapterNum = None # save for comparing during update.
|
||||
# Find the chapters:
|
||||
chapters = soup.findAll('table', {'class':'listagemCapitulos espacamentoTop'})
|
||||
for chapter in chapters:
|
||||
|
||||
for row in chapter.findAll('tr', {'class': 'listagem-textoBg1'}): # Find each row with chapter info
|
||||
a = row.find('a') # Chapter link
|
||||
|
||||
# Datetime
|
||||
date = a.find_next('time')['datetime']
|
||||
chapterDate = makeDate(date, self.dateformat).date()
|
||||
|
||||
chapter_title = stripHTML(a.find('strong'))
|
||||
|
||||
self.add_chapter(chapter_title, a.get('href'), {'date': chapterDate})
|
||||
|
||||
if newestChapter == None or chapterDate > newestChapter:
|
||||
newestChapter = chapterDate
|
||||
self.newestChapterNum = self.story.getMetadata('numChapters')
|
||||
|
||||
logger.debug('numChapters: (%s)', self.story.getMetadata('numChapters'))
|
||||
|
||||
# Summary
|
||||
div_element = soup.find('div', {'class':'clearfix'})
|
||||
summary = div_element.find('div', class_='texto')
|
||||
|
||||
strong_tag = summary.find('strong', text='Sinopse:')
|
||||
if strong_tag:
|
||||
strong_tag.decompose()
|
||||
|
||||
self.decode_emails(summary)
|
||||
for a_tag in summary.find_all('a', {'data-cfemail': True}):
|
||||
email_text = a_tag.string
|
||||
a_tag.replace_with(email_text)
|
||||
|
||||
full_text = unicode(summary)
|
||||
self.story.setMetadata('description', full_text)
|
||||
|
||||
|
||||
def parse_until_br(attribute, start_index, element_list):
|
||||
# Initialize counter
|
||||
next_index = start_index
|
||||
for element in element_list[start_index:]:
|
||||
next_index += 1
|
||||
if element.name == 'br':
|
||||
break
|
||||
elif element.name == 'strong':
|
||||
if attribute == 'status':
|
||||
if element.contents[0].text == 'Sim':
|
||||
self.story.setMetadata(attribute, 'Completed')
|
||||
elif element.contents[0].text == 'Não':
|
||||
self.story.setMetadata(attribute, 'In-Progress')
|
||||
elif attribute == 'characters':
|
||||
terms = re.findall(r"[^&]+", stripHTML(element))
|
||||
for term in terms:
|
||||
self.story.addToList(attribute, term)
|
||||
elif attribute == 'numWords':
|
||||
self.story.setMetadata(attribute, stripHTML(element).replace('.',''))
|
||||
else:
|
||||
self.story.setMetadata(attribute, stripHTML(element))
|
||||
elif element.name == 'a':
|
||||
if element.contents[0].name == 'strong':
|
||||
self.story.addToList(attribute, stripHTML(element.contents[0]))
|
||||
elif element.name == 'time':
|
||||
self.story.setMetadata(attribute, makeDate(element['datetime'], self.dateformat))
|
||||
return next_index
|
||||
|
||||
# Informações Gerais
|
||||
content_metadata = [
|
||||
('Iniciado', 'datePublished'),
|
||||
('Atualizada', 'dateUpdated'),
|
||||
('Idioma', 'language'),
|
||||
('Visualizações', 'hits'),
|
||||
('Favoritos', 'kudos'),
|
||||
('Comentários', 'comments'),
|
||||
('Listas de leitura', 'bookmarks'),
|
||||
('Palavras', 'numWords'),
|
||||
('Concluído', 'status'),
|
||||
('Categorias', 'category'),
|
||||
('Personagens', 'characters'),
|
||||
('Tags', 'freeformtags'),
|
||||
('Gêneros:', 'genre'),
|
||||
('Avisos:', 'warnings')
|
||||
]
|
||||
tag_mapping = dict(content_metadata)
|
||||
|
||||
information = div_element.find(lambda tag: tag.name == 'div' and
|
||||
tag.get('class') == ['texto', 'espacamentoTop'] and
|
||||
tag.get('id') != 'cphConteudo_cphConteudo_divDestaque')
|
||||
logger.debug('information: (%s)', information)
|
||||
info_contents = information.contents
|
||||
i = 0
|
||||
while i < len(info_contents):
|
||||
content = info_contents[i]
|
||||
stripped_tag = stripHTML(content)
|
||||
|
||||
if stripped_tag in tag_mapping:
|
||||
i = parse_until_br(tag_mapping[stripped_tag], i+1, info_contents)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
# Classificação, Gêneros e Avisos
|
||||
# Finding div element with class "clearfix baixo"
|
||||
div_element = soup.find('div', {'class': 'clearfix baixo'})
|
||||
|
||||
# Finding div element with class "classificacao"
|
||||
classificacao_element = div_element.find('div', class_='classificacao')
|
||||
|
||||
# Extracting last word from class name
|
||||
if classificacao_element and 'class' in classificacao_element.attrs:
|
||||
class_value = classificacao_element.attrs['class']
|
||||
self.story.setMetadata('rating',class_value[-1].split('-')[-1])
|
||||
|
||||
# Extracting text content "Gêneros" and "Avisos"
|
||||
contents = classificacao_element.find_next('div').contents
|
||||
i = 0
|
||||
while i < len(contents):
|
||||
content = contents[i]
|
||||
stripped_tag = stripHTML(content)
|
||||
|
||||
if stripped_tag in tag_mapping:
|
||||
i = parse_until_br(tag_mapping[stripped_tag], i+1, contents)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
## Normalize chapter URLs in case of title change
|
||||
def normalize_chapterurl(self,url):
|
||||
#https://www.spiritfanfiction.com/historia/story-name-1234/capitulo56
|
||||
url = re.sub(r"https?://("+self.getSiteDomain()+r"/historia/\d+/capitulo\d+)$",
|
||||
r"https://\1",url)
|
||||
return url
|
||||
|
||||
|
||||
def getChapterText(self, url):
|
||||
logger.debug('Getting chapter text from: %s' % url)
|
||||
|
||||
save_chapter_soup = self.make_soup("<div></div>")
|
||||
save_chapter = save_chapter_soup.find('div')
|
||||
|
||||
chapter_dl_soup = self.make_soup(self.get_request(url))
|
||||
if None == chapter_dl_soup:
|
||||
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
|
||||
div_chapter = chapter_dl_soup.find('div', {'class':'clearfix'})
|
||||
chapter_text = div_chapter.find('div', class_='texto-capitulo')
|
||||
|
||||
exclude_notes=self.getConfigList('exclude_notes')
|
||||
|
||||
|
||||
def append_tag(elem, tag, string=None, classes=None):
|
||||
'''bs4 requires tags be added separately.'''
|
||||
new_tag = save_chapter_soup.new_tag(tag)
|
||||
if string:
|
||||
new_tag.string=string
|
||||
if classes:
|
||||
new_tag['class']=[classes]
|
||||
elem.append(new_tag)
|
||||
return new_tag
|
||||
|
||||
|
||||
chapimg = chaphead = chapfoot = None
|
||||
# Chapter Image
|
||||
img_url = chapter_text.find('img', {'class':'imagemResponsiva'})
|
||||
if img_url:
|
||||
chapimg = chapter_dl_soup.new_tag('p', style="text-align: center")
|
||||
chapimg.insert(0, chapter_dl_soup.new_tag('img', src=img_url['src']))
|
||||
|
||||
for tag in chapter_text.find_all('h2'):
|
||||
if tag.string.startswith('Notas do Autor'):
|
||||
chaphead = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto texto-capitulo-notas'})))
|
||||
elif tag.string.startswith('Notas Finais'):
|
||||
chapfoot = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto texto-capitulo-notas'})))
|
||||
else:
|
||||
# Apparently, not all chapters have the "Capítulo" text anymore, but it's the only other h2 in there
|
||||
chaptext = self.make_soup(unicode(tag.find_next_sibling('div', {'class': 'texto'})))
|
||||
# Decode emails
|
||||
self.decode_emails(chaptext)
|
||||
if chapimg != None:
|
||||
if chaptext.div == None:
|
||||
append_tag(chaptext, 'div')
|
||||
chaptext.div.insert(0, chapimg)
|
||||
|
||||
head_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_head_notes")
|
||||
if 'chapterheadnotes' not in exclude_notes:
|
||||
if chaphead != None:
|
||||
append_tag(head_notes_div,'b',"Notas do Autor:")
|
||||
self.decode_emails(chaphead)
|
||||
head_notes_div.append(chaphead)
|
||||
append_tag(head_notes_div,'hr')
|
||||
|
||||
save_chapter.append(chaptext)
|
||||
|
||||
foot_notes_div = append_tag(save_chapter,'div',classes="fff_chapter_notes fff_foot_notes")
|
||||
## Can appear on every chapter
|
||||
if 'chapterfootnotes' not in exclude_notes:
|
||||
if chapfoot != None:
|
||||
append_tag(foot_notes_div,'hr')
|
||||
append_tag(foot_notes_div,'b',"Notas Finais:")
|
||||
self.decode_emails(chapfoot)
|
||||
foot_notes_div.append(chapfoot)
|
||||
|
||||
## remove empty head/food notes div(s)
|
||||
if not head_notes_div.find(True):
|
||||
head_notes_div.extract()
|
||||
if not foot_notes_div.find(True):
|
||||
foot_notes_div.extract()
|
||||
|
||||
return self.utf8FromSoup(url,save_chapter)
|
||||
|
||||
|
||||
def decode_emails(self, html_text):
|
||||
|
||||
def decode_email(encoded_email):
|
||||
email = ""
|
||||
r = int(encoded_email[:2], 16)
|
||||
for i in range(2, len(encoded_email), 2):
|
||||
char_code = int(encoded_email[i:i + 2], 16) ^ r
|
||||
email += chr(char_code)
|
||||
return email
|
||||
|
||||
|
||||
# Find all elements with class '__cf_email__'
|
||||
email_elements = html_text.find_all(class_='__cf_email__')
|
||||
for element in email_elements:
|
||||
# Get the data-cfemail attribute value
|
||||
encoded_email = element.get('data-cfemail')
|
||||
if encoded_email:
|
||||
# Decode the email address
|
||||
decoded_email = decode_email(encoded_email)
|
||||
# Replace the obfuscated email with the decoded email
|
||||
element.string = decoded_email
|
||||
return unicode(html_text)
|
||||
|
||||
|
|
@ -3831,3 +3831,18 @@ use_basic_cache:true
|
|||
## specific to wuxiaworld.xyz order and dedup chapters.
|
||||
dedup_order_chapter_list:false
|
||||
|
||||
[www.spiritfanfiction.com]
|
||||
extra_valid_entries: freeformtags,bookmarks,comments,hits,kudos
|
||||
|
||||
freeformtags_label:Freeform Tags
|
||||
bookmarks_label:Bookmarks
|
||||
comments_label:Comments
|
||||
hits_label:Hits
|
||||
kudos_label:Kudos
|
||||
|
||||
## All notes are included by default, but this allows
|
||||
## you to customize which you include. Copy this parameter to your
|
||||
## personal.ini and list the ones you don't want.
|
||||
#exclude_notes:chapterheadnotes,chapterfootnotes
|
||||
|
||||
website_encodings: utf8, Windows-1252, iso-8859-1
|
||||
Loading…
Reference in a new issue