adapter_ficbooknet: Collect numWords

Authored by dbhmw, 2025-10-29 16:49:16 +00:00; committed by Jim Miller
parent ff402c16ca
commit c5264c2147


@@ -16,16 +16,15 @@
#
from __future__ import absolute_import,unicode_literals
import datetime
# import datetime
import logging
import json
logger = logging.getLogger(__name__)
import re
from .. import translit
# from .. import translit
from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from .. import exceptions# as exceptions
# py2 vs py3 transition
@@ -87,8 +86,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
if 'Войти используя аккаунт на сайте' in d:
raise exceptions.FailedToLogin(url,params['login'])
return False
else:
return True
## Getting the chapter list and the meta data, plus 'is adult' checking.
@@ -109,10 +107,6 @@ class FicBookNetAdapter(BaseSiteAdapter):
try:
a = soup.find('section',{'class':'chapter-info'}).find('h1')
except AttributeError:
# Handle 404 in a nicer way when using nsapa proxy
if re.search(r'404 — Страница не найдена', soup.find('title').text):
raise exceptions.StoryDoesNotExist(url)
else:
raise exceptions.FailedToDownload("Error collecting meta: %s! Missing required element!" % url)
# kill '+' marks if present.
sup = a.find('sup')
@@ -145,7 +139,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
# Find the chapters:
pubdate = None
chapters = soup.find('ul', {'class' : 'list-of-fanfic-parts'})
if chapters != None:
if chapters is not None:
for chapdiv in chapters.find_all('li', {'class':'part'}):
chapter=chapdiv.find('a',href=re.compile(r'/readfic/'+self.story.getMetadata('storyId')+r"/\d+#part_content$"))
churl='https://'+self.host+chapter['href']
@@ -158,7 +152,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.add_chapter(chapter,churl,
{'date':chapterdate.strftime(self.getConfig("datechapter_format",self.getConfig("datePublished_format",self.dateformat)))})
if pubdate == None and chapterdate:
if pubdate is None and chapterdate:
pubdate = chapterdate
update = chapterdate
else:
@@ -175,16 +169,6 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.story.setMetadata('datePublished', pubdate)
self.story.setMetadata('language','Russian')
## after site change, I don't see word count anywhere.
# pr=soup.find('a', href=re.compile(r'/printfic/\w+'))
# pr='https://'+self.host+pr['href']
# pr = self.make_soup(self.get_request(pr))
# pr=pr.find_all('div', {'class' : 'part_text'})
# i=0
# for part in pr:
# i=i+len(stripHTML(part).split(' '))
# self.story.setMetadata('numWords', unicode(i))
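For contrast with the new header-based count below, the commented-out block above counted words by fetching the print view and summing whitespace-split tokens per part. A minimal standalone sketch of that counting step, using an assumed snippet of the print page (the HTML sample is hypothetical):

from bs4 import BeautifulSoup

# Assumed shape of the '/printfic/...' print view this block used to fetch.
html = ('<div class="part_text">Первая глава текста.</div>'
        '<div class="part_text">Вторая глава.</div>')
soup = BeautifulSoup(html, 'html.parser')
total = sum(len(part.get_text().split())
            for part in soup.find_all('div', {'class': 'part_text'}))
print(total)  # 5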
dlinfo = soup.select_one('header.d-flex.flex-column.gap-12.word-break')
series_label = dlinfo.select_one('div.description.word-break').find('strong', string='Серия:')
@@ -208,6 +192,9 @@ class FicBookNetAdapter(BaseSiteAdapter):
for genre in tags.find_all('a',href=re.compile(r'/tags/')):
self.story.addToList('genre',stripHTML(genre))
logger.debug("category: (%s)"%self.story.getMetadata('category'))
logger.debug("genre: (%s)"%self.story.getMetadata('genre'))
ratingdt = dlinfo.find('div',{'class':re.compile(r'badge-rating-.*')})
self.story.setMetadata('rating', stripHTML(ratingdt.find('span')))
@@ -269,15 +256,23 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.story.setMetadata('numCollections', value)
logger.debug("numCollections: (%s)"%self.story.getMetadata('numCollections'))
# Grab the amount of pages
# Grab the amount of pages and words
targetpages = soup.find('strong',string='Размер:').find_next('div')
if targetpages:
pages_raw = re.search(r'(.+)\s+(?:страницы|страниц)', targetpages.text, re.UNICODE)
pages = int(re.sub(r'[^\d]', '', pages_raw.group(1)))
targetpages_text = re.sub(r"(?<!\,)\s|\xa0", "", targetpages.text, flags=re.UNICODE | re.MULTILINE)
pages_raw = re.search(r'(\d+)(?:страницы|страниц)', targetpages_text, re.UNICODE)
pages = int(pages_raw.group(1))
if pages > 0:
self.story.setMetadata('pages', pages)
logger.debug("pages: (%s)"%self.story.getMetadata('pages'))
numWords_raw = re.search(r"(\d+)(?:слова|слов)", targetpages_text, re.UNICODE)
numWords = int(numWords_raw.group(1))
if numWords > 0:
self.story.setMetadata('numWords', numWords)
logger.debug("numWords: (%s)"%self.story.getMetadata('numWords'))
# Grab FBN Category
class_tag = soup.select_one('div[class^="badge-with-icon direction"]').find('span', {'class' : 'badge-text'}).text
if class_tag:
@@ -286,7 +281,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
# Find dedication.
ded = soup.find('div', {'class' : 'js-public-beta-dedication'})
if ded != None:
if ded:
ded['class'].append('part_text')
self.story.setMetadata('dedication',ded)
@@ -296,11 +291,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
comm['class'].append('part_text')
self.story.setMetadata('authorcomment',comm)
# When using nsapa proxy the required elements are not returned.
try:
follows = stats.find('fanfic-follow-button')[':follow-count']
except TypeError:
follows = stripHTML(stats.find('button', {'class': 'btn btn-with-description btn-primary jsVueComponent', 'type': 'button'}).span)
if int(follows) > 0:
self.story.setMetadata('follows', int(follows))
logger.debug("follows: (%s)"%self.story.getMetadata('follows'))
@@ -315,13 +306,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
self.story.extendList('awards', [str(award['user_text']) for award in award_list])
#logger.debug("awards (%s)"%self.story.getMetadata('awards'))
except (TypeError, KeyError):
awards_section = soup.find('section', {'class':'fanfic-author-actions__column mt-5 jsVueComponent'})
if awards_section is not None:
awards = awards_section.select('div:not([class])')
numAwards = int(len(awards))
naward = awards_section.find('span', {'class':'js-span-link'})
if naward is not None:
numAwards = numAwards + int(re.sub(r'[^\d]', '', naward.text))
logger.debug("Could not grab the awards")
if numAwards > 0:
self.story.setMetadata('numAwards', numAwards)
@@ -329,14 +314,7 @@ class FicBookNetAdapter(BaseSiteAdapter):
if get_cover:
cover = soup.find('fanfic-cover', {'class':"jsVueComponent"})
if cover is None:
# When using nsapa proxy the element is replaced by different one.
cover = soup.find('picture', {'class':"fanfic-hat-cover-picture"})
if cover is not None:
cover = re.sub('/fanfic-covers/(?:m_|d_)', '/fanfic-covers/', cover.img['src'])
logger.debug("Cover url (%s)"%cover)
self.setCoverImage(url,cover)
else:
self.setCoverImage(url,cover['src-original'])
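The re.sub in the proxy fallback above strips the thumbnail prefix ('m_' or 'd_') from the cover path so the full-size image is fetched. A sketch with a hypothetical cover URL:

import re

src = 'https://images.ficbook.net/fanfic-covers/m_abc123.jpg'  # hypothetical
full = re.sub('/fanfic-covers/(?:m_|d_)', '/fanfic-covers/', src)
print(full)  # https://images.ficbook.net/fanfic-covers/abc123.jpg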
# grab the text for an individual chapter.
@@ -347,17 +325,12 @@ class FicBookNetAdapter(BaseSiteAdapter):
soup = self.make_soup(self.get_request(url))
chapter = soup.find('div', {'id' : 'content'})
if chapter == None: ## still needed?
if chapter is None: ## still needed?
chapter = soup.find('div', {'class' : 'public_beta_disabled'})
if None == chapter:
if chapter is None:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
# Remove ads that show up when using NSAPA proxy.
if self.getConfig("use_nsapa_proxy",True):
for ads in chapter.find_all('div', {'class' : 'ads-in-text'}):
ads.extract()
exclude_notes=self.getConfigList('exclude_notes')
if 'headnotes' not in exclude_notes:
# Find the headnote