FanFicFare/fanficdownloader/adapters/base_adapter.py

# -*- coding: utf-8 -*-
# Copyright 2011 Fanficdownloader team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
import datetime
import time
import logging
import urllib
import urllib2 as u2
import urlparse as up
from functools import partial
from .. import BeautifulSoup as bs
from ..htmlcleanup import stripHTML
try:
    from google.appengine.api import apiproxy_stub_map

    def urlfetch_timeout_hook(service, call, request, response):
        if call != 'Fetch':
            return
        # Make the default deadline 10 seconds instead of 5.
        if not request.has_deadline():
            request.set_deadline(10.0)

    apiproxy_stub_map.apiproxy.GetPreCallHooks().Append(
        'urlfetch_timeout_hook', urlfetch_timeout_hook, 'urlfetch')
    logging.info("Hook to make default deadline 10.0 installed.")
except:
    pass
    #logging.info("Hook to make default deadline 10.0 NOT installed--not using appengine")

from ..story import Story
from ..gziphttp import GZipProcessor
from ..configurable import Configurable
from ..htmlcleanup import removeEntities, removeAllEntities, stripHTML
from ..exceptions import InvalidStoryURL
try:
    from .. import chardet as chardet
except ImportError:
    chardet = None

class BaseSiteAdapter(Configurable):

    @classmethod
    def matchesSite(cls,site):
        return site in cls.getAcceptDomains()

    @classmethod
    def getAcceptDomains(cls):
        return [cls.getSiteDomain()]
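
    # Illustrative sketch, not part of this class: an adapter whose site
    # answers on more than one hostname could override getAcceptDomains, e.g.
    #   @classmethod
    #   def getAcceptDomains(cls):
    #       return [cls.getSiteDomain(), 'www.'+cls.getSiteDomain()]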

    def validateURL(self):
        return re.match(self.getSiteURLPattern(), self.url)

    def __init__(self, config, url):
        self.config = config
        Configurable.__init__(self, config)
        self.setSectionOrder(self.getConfigSection())

        self.username = "NoneGiven" # if left empty, site doesn't return any message at all.
        self.password = ""
        self.is_adult=False

        self.opener = u2.build_opener(u2.HTTPCookieProcessor(),GZipProcessor())

        self.storyDone = False
        self.metadataDone = False
        self.story = Story()
        self.story.setMetadata('site',self.getConfigSection())
        self.story.setMetadata('dateCreated',datetime.datetime.now())
        self.chapterUrls = [] # tuples of (chapter title,chapter url)
        self.chapterFirst = None
        self.chapterLast = None
        self.oldchapters = None
        self.oldimgs = None
        self.oldcover = None # (data of existing cover html, data of existing cover image)
        self.calibrebookmark = None

        ## order of preference for decoding.
        self.decode = ["utf8",
                       "Windows-1252"] # 1252 is a superset of
                                       # iso-8859-1. Most sites that
                                       # claim to be iso-8859-1 (and
                                       # some that claim to be utf8)
                                       # are really windows-1252.

        self._setURL(url)
        if not self.validateURL():
            raise InvalidStoryURL(url,
                                  self.getSiteDomain(),
                                  self.getSiteExampleURLs())

    def _setURL(self,url):
        self.url = url
        self.parsedUrl = up.urlparse(url)
        self.host = self.parsedUrl.netloc
        self.path = self.parsedUrl.path
        self.story.setMetadata('storyUrl',self.url)

    ## website encoding(s)--in theory, each website reports the character
    ## encoding it uses for each page. In practice, some sites report it
    ## incorrectly. Each adapter has a default list, usually "utf8,
    ## Windows-1252" or "Windows-1252, utf8". The special value 'auto'
    ## will call chardet and use the encoding it reports if it has +90%
    ## confidence. 'auto' is not reliable.
    def _decode(self,data):
        if self.getConfig('website_encodings'):
            decode = self.getConfigList('website_encodings')
        else:
            decode = self.decode

        for code in decode:
            try:
                #print code
                if code == "auto":
                    if not chardet:
                        logging.info("chardet not available, skipping 'auto' encoding")
                        continue
                    detected = chardet.detect(data)
                    #print detected
                    if detected['confidence'] > 0.9:
                        code = detected['encoding']
                    else:
                        continue
                logging.debug("try code:"+code)
                return data.decode(code)
            except:
                logging.info("code failed:"+code)

        logging.info("Could not decode story, tried:%s Stripping non-ASCII."%decode)
        return "".join([x for x in data if ord(x) < 128])

    # Assumes application/x-www-form-urlencoded. parameters, headers are dict()s
    def _postUrl(self, url, parameters={}, headers={}):
        if self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))

        ## u2.Request assumes POST when data!=None. Also assumes data
        ## is application/x-www-form-urlencoded.
        if 'Content-type' not in headers:
            headers['Content-type']='application/x-www-form-urlencoded'
        if 'Accept' not in headers:
            headers['Accept']="text/html,*/*"
        req = u2.Request(url,
                         data=urllib.urlencode(parameters),
                         headers=headers)
        return self._decode(self.opener.open(req).read())
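
    # Illustrative call with hypothetical form field names (every site's
    # login form differs; this is not taken from a real adapter):
    #   self._postUrl(loginUrl, {'name': self.username, 'pass': self.password})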

    def _fetchUrlRaw(self, url, parameters=None):
        if parameters != None:
            return self.opener.open(url.replace(' ','%20'),urllib.urlencode(parameters)).read()
        else:
            return self.opener.open(url.replace(' ','%20')).read()

    # parameters is a dict()
    def _fetchUrl(self, url, parameters=None):
        if self.getConfig('slow_down_sleep_time'):
            time.sleep(float(self.getConfig('slow_down_sleep_time')))

        excpt=None
        for sleeptime in [0, 0.5, 4, 9]:
            time.sleep(sleeptime)
            try:
                return self._decode(self._fetchUrlRaw(url,parameters))
            except Exception, e:
                excpt=e
                logging.warn("Caught an exception reading URL: %s Exception %s."%(unicode(url),unicode(e)))

        logging.error("Giving up on %s" % url)
        logging.exception(excpt)
        raise(excpt)
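
    # Typical use from an adapter subclass (illustrative only):
    #   soup = bs.BeautifulSoup(self._fetchUrl(self.url))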

    # Limit chapters to download. Input starts at 1, list starts at 0
    def setChaptersRange(self,first=None,last=None):
        if first:
            self.chapterFirst=int(first)-1
        if last:
            self.chapterLast=int(last)-1
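
    # Worked example: setChaptersRange(3,10) sets chapterFirst=2 and
    # chapterLast=9, so getStory() fetches list indexes 2 through 9,
    # i.e. chapters 3 through 10 inclusive.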

    # Does the download the first time it's called.
    def getStory(self):
        if not self.storyDone:
            self.getStoryMetadataOnly()

            for index, (title,url) in enumerate(self.chapterUrls):
                if (self.chapterFirst!=None and index < self.chapterFirst) or \
                        (self.chapterLast!=None and index > self.chapterLast):
                    self.story.addChapter(removeEntities(title),
                                          None)
                else:
                    if self.oldchapters and index < len(self.oldchapters):
                        data = self.utf8FromSoup(None,
                                                 self.oldchapters[index],
                                                 partial(cachedfetch,self._fetchUrlRaw,self.oldimgs))
                    else:
                        data = self.getChapterText(url)
                    self.story.addChapter(removeEntities(title),
                                          removeEntities(data))
            self.storyDone = True

            # Images are included, but the story has no cover of its own:
            # add the default_cover_image as the cover.
            if self.getConfig('include_images') and \
                    not self.story.cover and \
                    self.getConfig('default_cover_image'):
                self.story.addImgUrl(self,
                                     None,
                                     #self.getConfig('default_cover_image'),
                                     self.story.formatFileName(self.getConfig('default_cover_image'),
                                                               self.getConfig('allow_unsafe_filename')),
                                     self._fetchUrlRaw,
                                     cover=True)

            # No new cover; keep the old cover, if there is one.
            if not self.story.cover and self.oldcover:
                self.story.oldcover = self.oldcover

            # Cheesy way to carry the calibre bookmark file forward across an update.
            if self.calibrebookmark:
                self.story.calibrebookmark = self.calibrebookmark

        return self.story

    def getStoryMetadataOnly(self):
        if not self.metadataDone:
            self.extractChapterUrlsAndMetadata()

            if not self.story.getMetadataRaw('dateUpdated'):
                self.story.setMetadata('dateUpdated',self.story.getMetadataRaw('datePublished'))

            self.metadataDone = True
        return self.story

    ###############################

    @staticmethod
    def getSiteDomain():
        "Needs to be overridden in each adapter class."
        return 'no such domain'

    @classmethod
    def getConfigSection(cls):
        "Only needs to be overridden if != site domain."
        return cls.getSiteDomain()

    ## URL pattern validation is done *after* picking an adapter based
    ## on domain instead of *as* the adapter selector so we can offer
    ## the user example(s) for that particular site.
    ## Override validateURL(self) instead if you need more control.
    def getSiteURLPattern(self):
        "Used to validate the URL. Should be overridden in each adapter class."
        return '^http://'+re.escape(self.getSiteDomain())
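
    # Illustrative override for a hypothetical site (not part of this class):
    #   def getSiteURLPattern(self):
    #       return r"http://"+re.escape(self.getSiteDomain())+r"/story\.php\?id=\d+"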

    def getSiteExampleURLs(self):
        """
        Needs to be overridden in each adapter class. It's the adapter
        writer's responsibility to make sure the example(s) pass the
        URL validation.
        """
        return 'no such example'

    def extractChapterUrlsAndMetadata(self):
        "Needs to be overridden in each adapter class. Populates self.story metadata and self.chapterUrls."
        pass

    def getChapterText(self, url):
        "Needs to be overridden in each adapter class."
        pass

    # Just for series, in case we choose to change how it's stored or represented later.
    def setSeries(self,name,num):
        if self.getConfig('collect_series'):
            self.story.setMetadata('series','%s [%s]'%(name, int(num)))
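
    # Example: setSeries("Example Series", 3) stores the metadata value
    # 'Example Series [3]' when collect_series is set.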

    def setDescription(self,url,svalue):
        #print("\n\nsvalue:\n%s\n"%svalue)
        if self.getConfig('keep_summary_html'):
            if isinstance(svalue,str) or isinstance(svalue,unicode):
                svalue = bs.BeautifulSoup(svalue)
            self.story.setMetadata('description',self.utf8FromSoup(url,svalue))
        else:
            self.story.setMetadata('description',stripHTML(svalue))
        #print("\n\ndescription:\n"+self.story.getMetadata('description')+"\n\n")

    # This gives us a unicode object, not just a string containing bytes.
    # (I gave soup a unicode string; you'd think it could give it back...)
    # Now also does a bunch of other common processing for us.
    def utf8FromSoup(self,url,soup,fetch=None):
        if not fetch:
            fetch=self._fetchUrlRaw

        acceptable_attributes = ['href','name']
        #print("include_images:"+self.getConfig('include_images'))
        if self.getConfig('include_images'):
            acceptable_attributes.extend(('src','alt','longdesc'))
            for img in soup.findAll('img'):
                # some pre-existing epubs have img tags that had src stripped off.
                if img.has_key('src'):
                    img['longdesc']=img['src']
                    img['src']=self.story.addImgUrl(self,url,img['src'],fetch,
                                                    coverexclusion=self.getConfig('cover_exclusion_regexp'))

        for attr in soup._getAttrMap().keys():
            if attr not in acceptable_attributes:
                del soup[attr] ## strip all attributes not in acceptable_attributes

        for t in soup.findAll(recursive=True):
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr] ## strip all attributes not in acceptable_attributes

            # <u> and <center> are not acceptable in strict XHTML, but we
            # already have CSS classes of the same names defined.
            if t.name in ('u',):
                t['class']=t.name
                t.name='span'
            if t.name in ('center',):
                t['class']=t.name
                t.name='div'
            # removes paired, but empty, tags.
            if t.string != None and len(t.string.strip()) == 0:
                t.extract()

        retval = soup.__str__('utf8').decode('utf-8')

        if self.getConfig('replace_hr'):
            # Replacing a self-closing tag with a container tag in the
            # soup is more difficult than it first appears. So cheat.
            retval = retval.replace("<hr />","<div class='center'>* * *</div>")

        if self.getConfig('nook_img_fix'):
            # If the <img> tag doesn't have a div or a p around it, the
            # nook gets confused and displays it on every page after
            # that, under the text, for the rest of the chapter.
            retval = re.sub(r"(?!<(div|p)>)\s*(?P<imgtag><img[^>]+>)\s*(?!</(div|p)>)",
                            "<div>\g<imgtag></div>",retval)

        # Don't want body tags in chapter html--writers add them.
        # This is primarily for epub updates.
        return re.sub(r"</?body>\r?\n?","",retval)


def cachedfetch(realfetch,cache,url):
    if url in cache:
        print("cache hit")
        return cache[url]
    else:
        return realfetch(url)
fullmon = {"January":"01", "February":"02", "March":"03", "April":"04", "May":"05",
"June":"06","July":"07", "August":"08", "September":"09", "October":"10",
"November":"11", "December":"12" }
def makeDate(string,format):
# Surprise! Abstracting this turned out to be more useful than
# just saving bytes.
# fudge english month names for people who's locale is set to
# non-english. All our current sites date in english, even if
# there's non-english content. -- ficbook.net now makes that a
# lie. It has to do something even more complicated to get
# Russian month names correct everywhere.
do_abbrev = "%b" in format
if "%B" in format or do_abbrev:
format = format.replace("%B","%m").replace("%b","%m")
for (name,num) in fullmon.items():
if do_abbrev:
name = name[:3] # first three for abbrev
if name in string:
string = string.replace(name,num)
break
return datetime.datetime.strptime(string,format)
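
# Worked example (English month name, no abbreviation in the format):
#   makeDate("January 2, 2011", "%B %d, %Y") => datetime.datetime(2011, 1, 2, 0, 0)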