# FanFicFare/fanficfare/adapters/adapter_fanfictionnet.py
# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team, 2018 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import absolute_import
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
import re
# py2 vs py3 transition
from ..six import text_type as unicode
from ..six.moves.urllib.error import HTTPError
from ..chromagnon.cacheParse import ChromeCache
from .. import exceptions as exceptions
from ..htmlcleanup import stripHTML
from .base_adapter import BaseSiteAdapter, makeDate
ffnetgenres=["Adventure", "Angst", "Crime", "Drama", "Family", "Fantasy", "Friendship", "General",
"Horror", "Humor", "Hurt-Comfort", "Mystery", "Parody", "Poetry", "Romance", "Sci-Fi",
"Spiritual", "Supernatural", "Suspense", "Tragedy", "Western"]
class FanFictionNetSiteAdapter(BaseSiteAdapter):
def __init__(self, config, url):
BaseSiteAdapter.__init__(self, config, url)
self.story.setMetadata('siteabbrev','ffnet')
# get storyId from url--url validation guarantees second part is storyId
self.story.setMetadata('storyId',self.parsedUrl.path.split('/',)[2])
# normalized story URL.
self._setURL("https://"+self.getSiteDomain()\
+"/s/"+self.story.getMetadata('storyId')+"/1/")
# ffnet update emails have the latest chapter URL.
# Frequently, when they arrive, not all the servers have the
# latest chapter yet and going back to chapter 1 to pull the
# chapter list doesn't get the latest. So save and use the
# original URL given to pull chapter list & metadata.
        # Not used by the plugin because the URL gets normalized
        # first to eliminate duplicate story URLs.
self.origurl = url
if "https://m." in self.origurl:
            ## accept m. (mobile) URL, but use www.
self.origurl = self.origurl.replace("https://m.","https://www.")
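        ## Illustrative (story id & title are hypothetical): an
        ## update-email link such as
        ##   https://m.fanfiction.net/s/1234/7/Story_Title
        ## is kept as origurl
        ##   https://www.fanfiction.net/s/1234/7/Story_Title
        ## while self.url is normalized to
        ##   https://www.fanfiction.net/s/1234/1/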
self.chromagnon_cache = None
@staticmethod
def getSiteDomain():
return 'www.fanfiction.net'
@classmethod
def getAcceptDomains(cls):
return ['www.fanfiction.net','m.fanfiction.net']
@classmethod
def getSiteExampleURLs(cls):
return "https://www.fanfiction.net/s/1234/1/ https://www.fanfiction.net/s/1234/12/ http://www.fanfiction.net/s/1234/1/Story_Title http://m.fanfiction.net/s/1234/1/"
def getSiteURLPattern(self):
return r"https?://(www|m)?\.fanfiction\.net/s/\d+(/\d+)?(/|/[^/]+)?/?$"
def _postUrl(self, url,
parameters={},
headers={},
extrasleep=None,
usecache=True):
logger.debug("_postUrl")
raise NotImplementedError
def _fetchUrlRawOpened(self, url,
parameters=None,
extrasleep=None,
usecache=True,
referer=None):
logger.debug("_fetchUrlRawOpened")
raise NotImplementedError
def _fetchUrlOpened(self, url,
parameters=None,
usecache=True,
extrasleep=None,
referer=None):
logger.debug("_fetchUrlOpened")
raise NotImplementedError
def _fetchUrlRaw(self, url,
parameters=None,
extrasleep=None,
usecache=True,
referer=None):
## This should be the one called for images.
logger.debug("_fetchUrlRaw")
raise NotImplementedError
    def _fetchUrl(self,url,parameters=None,extrasleep=1.0,usecache=True):
        ## parameters, extrasleep and usecache are accepted for interface
        ## compatibility, but ignored--data comes from the local Chrome
        ## cache rather than the network.
if self.chromagnon_cache is None:
logger.debug("Start making self.chromagnon_cache")
try:
if not self.getConfig("chrome_cache_path"):
raise exceptions.FailedToDownload("FFnet Workaround: chrome_cache_path setting must be set.")
self.chromagnon_cache = ChromeCache(self.getConfig("chrome_cache_path"))
except PermissionError:
raise exceptions.FailedToDownload("Permission to Chrome Cache (%s) denied--Did you quit Chrome?" % self.getConfig("chrome_cache_path"))
logger.debug("Done making self.chromagnon_cache")
data = self.chromagnon_cache.get_cached_file(url)
if data is None:
## XXX Do something to collect list of failed URLs?
## Turn on continue on fail?
raise exceptions.FailedToDownload("URL not found in Chrome Cache: %s" % url)
logger.debug("%s:len(%s)"%(url,len(data)))
return self.configuration._decode(data)
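    ## A minimal sketch of the same cache-lookup flow outside the
    ## adapter, assuming only the ChromeCache API used above (the cache
    ## path and story URL are examples):
    ##
    ##   from fanficfare.chromagnon.cacheParse import ChromeCache
    ##   cache = ChromeCache("/home/user/.cache/google-chrome/Default/Cache")
    ##   data = cache.get_cached_file("https://www.fanfiction.net/s/1234/1/")
    ##   if data is not None:
    ##       html = data.decode("utf-8")  # the adapter uses configuration._decode()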
def use_pagecache(self):
'''
adapters that will work with the page cache need to implement
this and change it to True.
'''
return True
def doExtractChapterUrlsAndMetadata(self,get_cover=True):
        get_cover=False # ffnet cover downloads disabled--lazy cover images started returning 0-byte files (see note below).
# fetch the chapter. From that we will get almost all the
# metadata and chapter list
url = self.origurl
logger.debug("URL: "+url)
# raise exceptions.FailedToDownload("The site fanfiction.net is blocking downloads. Site is disabled in this version of FanFicFare.")
# use BeautifulSoup HTML parser to make everything easier to find.
try:
data = self._fetchUrl(url)
# logger.debug("\n===================\n%s\n===================\n"%data)
soup = self.make_soup(data)
# logger.debug("\n===================\n%s\n===================\n"%soup)
except HTTPError as e:
if e.code == 404:
raise exceptions.StoryDoesNotExist(url)
else:
raise e
if "Unable to locate story" in data or "Story Not Found" in data:
raise exceptions.StoryDoesNotExist(url)
# some times "Chapter not found...", sometimes "Chapter text
# not found..." or "Story does not have any chapters"
if "Please check to see you are not using an outdated url." in data:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! 'Chapter not found. Please check to see you are not using an outdated url.'" % url)
# if self.getConfig('check_next_chapter'):
# try:
# ## ffnet used to have a tendency to send out update
# ## notices in email before all their servers were
# ## showing the update on the first chapter. It
# ## generates another server request and doesn't seem
# ## to be needed lately, so now default it to off.
# try:
# chapcount = len(soup.find('select', { 'name' : 'chapter' } ).findAll('option'))
# # get chapter part of url.
# except:
# chapcount = 1
# tryurl = "https://%s/s/%s/%d/"%(self.getSiteDomain(),
# self.story.getMetadata('storyId'),
# chapcount+1)
# logger.debug('=Trying newer chapter: %s' % tryurl)
# newdata = self._fetchUrl(tryurl)
# if "not found. Please check to see you are not using an outdated url." not in newdata \
# and "This request takes too long to process, it is timed out by the server." not in newdata:
# logger.debug('=======Found newer chapter: %s' % tryurl)
# soup = self.make_soup(newdata)
# except HTTPError as e:
# if e.code == 503:
# raise e
# except Exception as e:
# logger.warning("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))
# pass
# Find authorid and URL from... author url.
a = soup.find('a', href=re.compile(r"^/u/\d+"))
self.story.setMetadata('authorId',a['href'].split('/')[2])
self.story.setMetadata('authorUrl','https://'+self.host+a['href'])
self.story.setMetadata('author',a.string)
## Pull some additional data from html.
## ffnet shows category two ways
## 1) class(Book, TV, Game,etc) >> category(Harry Potter, Sailor Moon, etc)
## 2) cat1_cat2_Crossover
## For 1, use the second link.
## For 2, fetch the crossover page and pull the two categories from there.
pre_links = soup.find('div',{'id':'pre_story_links'})
categories = pre_links.findAll('a',{'class':'xcontrast_txt'})
#print("xcontrast_txt a:%s"%categories)
if len(categories) > 1:
# Strangely, the ones with *two* links are the
# non-crossover categories. Each is in a category itself
# of Book, Movie, etc.
self.story.addToList('category',stripHTML(categories[1]))
elif 'Crossover' in categories[0]['href']:
# caturl = "https://%s%s"%(self.getSiteDomain(),categories[0]['href'])
# catsoup = self.make_soup(self._fetchUrl(caturl))
# found = False
# for a in catsoup.findAll('a',href=re.compile(r"^/crossovers/.+?/\d+/")):
# self.story.addToList('category',stripHTML(a))
# found = True
# if not found:
            # # Fall back. I ran across a story with a Crossover
            # # category link to a broken page once.
# # http://www.fanfiction.net/s/2622060/1/
# # Naruto + Harry Potter Crossover
# logger.info("Fall back category collection")
for c in stripHTML(categories[0]).replace(" Crossover","").split(' + '):
self.story.addToList('category',c)
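        ## Illustrative: a crossover link labeled
        ##   "Naruto + Harry Potter Crossover"
        ## yields category entries ['Naruto', 'Harry Potter'].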
a = soup.find('a', href=re.compile(r'https?://www\.fictionratings\.com/'))
rating = a.string
        if rating.startswith('Fiction '): # strip 'Fiction ' prefix for consistency with the past.
            rating = rating[len('Fiction '):]
self.story.setMetadata('rating',rating)
# after Rating, the same bit of text containing id:123456 contains
# Complete--if completed.
gui_table1i = soup.find('div',{'id':'content_wrapper_inner'})
        self.story.setMetadata('title', stripHTML(gui_table1i.find('b'))) # title appears to be the only (or at least first) bold tag in gui_table1i
summarydiv = gui_table1i.find('div',{'style':'margin-top:2px'})
if summarydiv:
self.setDescription(url,stripHTML(summarydiv))
grayspan = gui_table1i.find('span', {'class':'xgray xcontrast_txt'})
# for b in grayspan.findAll('button'):
# b.extract()
metatext = stripHTML(grayspan).replace('Hurt/Comfort','Hurt-Comfort')
#logger.debug("metatext:(%s)"%metatext)
if 'Status: Complete' in metatext:
self.story.setMetadata('status', 'Completed')
else:
self.story.setMetadata('status', 'In-Progress')
## Newer BS libraries are discarding whitespace after tags now. :-/
metalist = re.split(" ?- ",metatext)
#logger.debug("metalist:(%s)"%metalist)
# Rated: Fiction K - English - Words: 158,078 - Published: 02-04-11
# Rated: Fiction T - English - Adventure/Sci-Fi - Naruto U. - Chapters: 22 - Words: 114,414 - Reviews: 395 - Favs: 779 - Follows: 835 - Updated: 03-21-13 - Published: 04-28-12 - id: 8067258
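        # Illustrative (not executed): re.split(" ?- ",...) on the second
        # sample above yields
        #   ['Rated: Fiction T', 'English', 'Adventure/Sci-Fi', 'Naruto U.',
        #    'Chapters: 22', 'Words: 114,414', 'Reviews: 395', 'Favs: 779',
        #    'Follows: 835', 'Updated: 03-21-13', 'Published: 04-28-12',
        #    'id: 8067258']
        # which is consumed left to right below: rating dropped,
        # 'English' -> language, 'Adventure/Sci-Fi' -> genre, etc.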
# rating is obtained above more robustly.
if metalist[0].startswith('Rated:'):
metalist=metalist[1:]
# next is assumed to be language.
self.story.setMetadata('language',metalist[0])
metalist=metalist[1:]
# next might be genre.
genrelist = metalist[0].split('/') # Hurt/Comfort already changed above.
goodgenres=True
for g in genrelist:
#logger.debug("g:(%s)"%g)
if g.strip() not in ffnetgenres:
#logger.info("g not in ffnetgenres")
goodgenres=False
if goodgenres:
self.story.extendList('genre',genrelist)
metalist=metalist[1:]
# Updated: <span data-xutime='1368059198'>5/8</span> - Published: <span data-xutime='1278984264'>7/12/2010</span>
# Published: <span data-xutime='1384358726'>8m ago</span>
dates = soup.findAll('span',{'data-xutime':re.compile(r'^\d+$')})
if len(dates) > 1 :
            # updated gets set to the same as published upstream if not found.
self.story.setMetadata('dateUpdated',datetime.fromtimestamp(float(dates[0]['data-xutime'])))
self.story.setMetadata('datePublished',datetime.fromtimestamp(float(dates[-1]['data-xutime'])))
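        # Illustrative: each data-xutime value is a Unix epoch timestamp;
        # e.g. datetime.fromtimestamp(1368059198) is 2013-05-08/09
        # depending on local time zone, which the site renders as "5/8".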
# Meta key titles and the metadata they go into, if any.
metakeys = {
# These are already handled separately.
'Chapters':False,
'Status':False,
'id':False,
'Updated':False,
'Published':False,
'Reviews':'reviews',
'Favs':'favs',
'Follows':'follows',
'Words':'numWords',
}
chars_ships_list=[]
while len(metalist) > 0:
m = metalist.pop(0)
if ':' in m:
key = m.split(':')[0].strip()
if key in metakeys:
if metakeys[key]:
self.story.setMetadata(metakeys[key],m.split(':')[1].strip())
continue
# no ':' or not found in metakeys
chars_ships_list.append(m)
# all because sometimes chars can have ' - ' in them.
chars_ships_text = (' - ').join(chars_ships_list)
# print("chars_ships_text:%s"%chars_ships_text)
# with 'pairing' support, pairings are bracketed w/o comma after
# [Caspian X, Lucy Pevensie] Edmund Pevensie, Peter Pevensie
self.story.extendList('characters',chars_ships_text.replace('[','').replace(']',',').split(','))
l = chars_ships_text
while '[' in l:
self.story.addToList('ships',l[l.index('[')+1:l.index(']')].replace(', ','/'))
l = l[l.index(']')+1:]
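        # Illustrative: for the bracketed example above, this produces
        # (after whitespace cleanup)
        #   characters: ['Caspian X', 'Lucy Pevensie', 'Edmund Pevensie', 'Peter Pevensie']
        #   ships:      ['Caspian X/Lucy Pevensie']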
if get_cover:
# Try the larger image first.
cover_url = ""
try:
img = soup.select_one('img.lazy.cimage')
cover_url=img['data-original']
except:
img = soup.select_one('img.cimage:not(.lazy)')
if img:
cover_url=img['src']
            ## Nov 19, 2020: ffnet lazy cover images started returning
            ## 0-byte files.
# logger.debug("cover_url:%s"%cover_url)
# authimg_url = ""
# if cover_url and self.getConfig('include_images') and self.getConfig('skip_author_cover'):
# authsoup = self.make_soup(self._fetchUrl(self.story.getMetadata('authorUrl')))
# try:
# img = authsoup.select_one('img.lazy.cimage')
# authimg_url=img['data-original']
# except:
# img = authsoup.select_one('img.cimage')
# if img:
# authimg_url=img['src']
# logger.debug("authimg_url:%s"%authimg_url)
# ## ffnet uses different sizes on auth & story pages, but same id.
# ## Old URLs:
# ## //ffcdn2012t-fictionpressllc.netdna-ssl.com/image/1936929/150/
# ## //ffcdn2012t-fictionpressllc.netdna-ssl.com/image/1936929/180/
# ## After Dec 2020 ffnet changes:
# ## /image/6472517/180/
# ## /image/6472517/150/
# try:
# cover_id = cover_url.split('/')[-3]
# except:
# cover_id = None
# try:
# authimg_id = authimg_url.split('/')[-3]
# except:
# authimg_id = None
# ## don't use cover if it matches the auth image.
# if cover_id and authimg_id and cover_id == authimg_id:
# logger.debug("skip_author_cover: cover_url matches authimg_url: don't use")
# cover_url = None
# if cover_url:
# self.setCoverImage(url,cover_url)
# Find the chapter selector
select = soup.find('select', { 'name' : 'chapter' } )
if select is None:
# no selector found, so it's a one-chapter story.
self.add_chapter(self.story.getMetadata('title'),url)
else:
allOptions = select.findAll('option')
for o in allOptions:
url = u'https://%s/s/%s/%s/' % ( self.getSiteDomain(),
self.story.getMetadata('storyId'),
o['value'])
# just in case there's tags, like <i> in chapter titles.
title = u"%s" % o
title = re.sub(r'<[^>]+>','',title)
self.add_chapter(title,url)
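        ## Illustrative (storyId hypothetical): an option like
        ##   <option value="3">3. <i>Arrival</i></option>
        ## becomes chapter title "3. Arrival" with URL
        ##   https://www.fanfiction.net/s/1234/3/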
return
def getChapterText(self, url):
logger.debug('Getting chapter text from: %s' % url)
        ## ffnet (and, I assume, fpcom) tends to fail more if hit too
        ## fast. This is in addition to whatever the
        ## slow_down_sleep_time setting is.
data = self._fetchUrl(url,extrasleep=4.0)
if "Please email this error message in full to <a href='mailto:support@fanfiction.com'>support@fanfiction.com</a>" in data:
raise exceptions.FailedToDownload("Error downloading Chapter: %s! FanFiction.net Site Error!" % url)
soup = self.make_soup(data)
div = soup.find('div', {'id' : 'storytextp'})
        if div is None:
logger.debug('div id=storytextp not found. data:%s'%data)
raise exceptions.FailedToDownload("Error downloading Chapter: %s! Missing required element!" % url)
return self.utf8FromSoup(url,div)
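    ## Note: on current ffnet chapter pages the storytextp div is
    ## expected to wrap an inner <div id='storytext'> holding the
    ## chapter body; utf8FromSoup() handles cleanup of that markup.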
def getClass():
return FanFictionNetSiteAdapter