FanFicFare/twilighted.py

import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime

from adapter import *
import twipassword

class Twilighted(FanfictionSiteAdapter):
    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.password=twipassword.password
        self.login='sigizmund'
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 01, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 01, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append ('fanfiction')
        self.subjects.append ('Twilight')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)

        logging.debug("Created Twilighted: url=%s" % (self.url))


    def requiresLogin(self, url = None):
        # potionsandsnitches.net doesn't require login.
        if self.host == 'potionsandsnitches.net':
          return False
        else:
          return True

    def performLogin(self, url = None):
        data = {}

        data['penname'] = self.login
        data['password'] = self.password
        data['cookiecheck'] = '1'
        data['submit'] = 'Submit'

        urlvals = u.urlencode(data)
        loginUrl = 'http://' + self.host + self._getLoginScript()
        logging.debug("Will now login to URL %s" % loginUrl)

        req = self.opener.open(loginUrl, urlvals)

        d = req.read().decode('utf-8')

        if self.reqLoginData(d) :
          return False
        else:
          return True


    def setLogin(self, login):
        self.login = login

    def setPassword(self, password):
        self.password = password

    def _addSubject(self, subject):
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def extractIndividualUrls(self):
        data = self.opener.open(self.url).read()

        if self.reqLoginData(data):
          self.performLogin()
          data = self.opener.open(self.url).read()
          if self.reqLoginData(data):
            return None

        soup = bs.BeautifulStoneSoup(data)

        title = soup.find('title').string
        self.storyName = title.split(' by ')[0].strip()
        self.authorName = title.split(' by ')[1].strip()
        self.outputName = self.storyName.replace(" ", "_")

        select = soup.find('select', { 'name' : 'chapter' } )

        result = []
        if select is None:
    	   # no chapters found, try url by itself.
    	   result.append((self.url,self.storyName))
        else:
    	   allOptions = select.findAll('option')
    	   for o in allOptions:
    	     url = self.url + "&chapter=%s" % o['value']
    	     title = o.string
    	     result.append((url,title))

        url = self.url + "&index=1"
        data = self.opener.open(url).read()
        lines = data.split('\n')
        soup = bs.BeautifulStoneSoup(data)
        metas = soup.findAll('meta')
        for meta in metas:
            if 'name' in meta._getAttrMap() and meta['name'].find('description') != -1:
                #logging.debug('Meta: %s' % meta)
                if 'content' in meta._getAttrMap():
                    s1 = bs.BeautifulStoneSoup(meta['content'])
                    ps = s1.findAll('p')
                    if len(ps) > 0:
                        self.storyDescription = ps[0]
                        logging.debug('self.storyDescription=%s' % (self.storyDescription))
                    else:
                        divs = meta.findAll('div')
                        #logging.debug('Divs: %s' % divs)

                        for div in divs:
                            #logging.debug('Div: %s' % div)
                            if 'id' in div._getAttrMap() and div['id'].find('pagetitle') != -1:
                                #logging.debug('Div PAGETITLE: %s' % div)
                                allA = div.findAll('a')
                                for a in allA:
                                    if 'href' in a._getAttrMap():
                                        if a['href'].find('viewstory.php?sid=') != -1:
                                            str1 = a.string
                                            (vs, self.storyId) = a['href'].split('=')
                                            logging.debug('self.storyId=%s self.storyName=%s' % (self.storyId, self.storyName))
                                            self.outputName = self.outputName + "-tw_" + self.storyId
                                            logging.debug('self.outputName=%s' % self.outputName)
                                        if a['href'].find('viewuser.php?uid=') != -1:
                                            str1 = a.string
                                            (vs, self.authorId) = a['href'].split('=')
                                            logging.debug('self.authorId=%s self.authorName=%s' % (self.authorId, self.authorName))
                                            self.authorURL = 'http://'+self.host+'/viewuser.php?uid='+self.authorId
                                            logging.debug('self.authorURL=%s' % self.authorURL)
                            if 'class' in div._getAttrMap() and div['class'].find('content') !=   -1:
                                #logging.debug('Div CONTENT: %s' % div)
                                brs = div.findAll('br')
                                for br in brs:
                                    buf = unicode(br).encode('utf-8')
                                    strs = re.split ('<[^>]+>', buf)
                                    #logging.debug('BUF: %s' % strs)
                                    ii = 2
                                    stlen = len(strs)
                                    while stlen > ii+1:
                                        if len(strs[ii]) == 0:
                                            ii = ii+1
                                            continue
                                        if strs[ii] == 'Categories:':
                                            ii = ii+1
                                            while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
                                                if strs[ii] != ' ' and strs[ii] != ', ':
                                                    if self.category == 'Category':
                                                        self.category = strs[ii].strip(' ')
                                                    self._addSubject(strs[ii].strip(' '))
                                                ii = ii+1
                                            logging.debug('self.subjects=%s' % self.subjects)
                                        if strs[ii] == 'Characters: ':
                                            ii = ii+1
                                            while stlen > ii and len(strs[ii]) != 0 and strs[ii].find(':') == -1:
                                                if strs[ii] != ' ' and strs[ii] != ', ':
                                                    self._addCharacter(strs[ii].strip(' '))
                                                ii = ii+1
                                            logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                                        elif strs[ii] == 'Completed:':
                                            if strs[ii+1].strip(' ') == "No":
                                                self.storyStatus = 'In-Progress'
                                            else:
                                                self.storyStatus = 'Completed'
                                            ii = ii+2
                                            logging.debug('self.storyStatus=%s' % self.storyStatus)
                                        elif strs[ii] == 'Rated:':
                                            self.storyRating = strs[ii+1].strip(' ')
                                            ii = ii+2
                                            logging.debug('self.storyRating=%s' % self.storyRating)
                                        elif strs[ii] == 'Series:':
                                            self.storySeries = strs[ii+1].strip(' ')
                                            if self.storySeries == 'None':
                                                self.storySeries = ''
                                            ii = ii+2
                                            logging.debug('self.storySeries=%s' % self.storySeries)
                                        elif strs[ii] == 'Chapters: ':
                                            self.numChapters = strs[ii+1].strip(' ')
                                            ii = ii+2
                                            logging.debug('self.numChapters=%s' % self.numChapters)
                                        elif strs[ii] == 'Word count:':
                                            self.numWords = strs[ii+1].strip(' ')
                                            ii = ii+2
                                            logging.debug('self.numWords=%s' % self.numWords)
                                        elif strs[ii] == ' Published: ':
                                            self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
                                            ii = ii+2
                                            logging.debug('self.storyPublished=%s' % self.storyPublished)
                                        elif strs[ii] == 'Updated:':
                                            self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(strs[ii+1].strip(' '), "%B %d, %Y")))
                                            ii = ii+2
                                            logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                                        else:
                                            logging.debug('Skipped Label \"%s\" Value \"%s\"' % (strs[ii], strs[ii+1]))
                                            ii = ii+2

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)

        return result

    def getStoryName(self):
        return self.storyName

    def getOutputName(self):
        return self.outputName

    def getAuthorName(self):
        return self.authorName

    def getText(self, url):
        if url.find('http://') == -1:
          url = 'http://' + self.host + '/' + url

        logging.debug('Getting data from: %s' % url)

        data = self.opener.open(url).read()

        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)

        div = soup.find('div', {'id' : 'story'})

        if None == div:
          return '<html/>'

        return div.__str__('utf8')

    def _getLoginScript(self):
        return '/user.php?action=login'

    def reqLoginData(self, data):
        if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
          return True
        else:
          return False

    def getHost(self):
        logging.debug('self.host=%s' % self.host)
        return self.host

    def getStoryURL(self):
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getAuthorURL(self):
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getUUID(self):
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getStoryDescription(self):
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def getSubjects(self):
        logging.debug('self.subjects=%s' % self.authorName)
        return self.subjects

    def getPublisher(self):
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getAuthorId(self):
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getStoryId(self):
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getCategory(self):
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getStoryCharacters(self):
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries

class Twilighted_UnitTests(unittest.TestCase):
  def setUp(self):
    logging.basicConfig(level=logging.DEBUG)
    pass

  def testLoginWorks(self):
    url = 'http://www.twilighted.net/viewstory.php?sid=10004'
    self.assertTrue(Twilighted(url).performLogin())

  def testGetUrlsWorks(self):
    url = 'http://www.twilighted.net/viewstory.php?sid=10004'
    self.assertEquals(32, len(Twilighted(url).extractIndividualUrls()))

if __name__ == '__main__':
  unittest.main()