# FanFicFare/potionsNsnitches.py
# -*- coding: utf-8 -*-
# Copied from twilighted.py because the site is almost the same; of course,
# now that we're scraping more detail about the story, there are differences
# in how the headers are displayed.
import os
import re
import sys
import shutil
import os.path
import urllib as u
import logging
import pprint as pp
import unittest
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import time
import datetime
from adapter import *
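
# PotionsNSnitches is a FanfictionSiteAdapter for potionsandsnitches.net, an
# eFiction-style archive very close to Twilighted (hence the copied code).
# Methods such as performLogin(), addCharacter() and addSubject() used below
# are assumed to be provided by the FanfictionSiteAdapter base class in
# adapter.py.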
class PotionsNSnitches(FanfictionSiteAdapter):
    def __init__(self, url):
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path
        self.opener = u2.build_opener(u2.HTTPCookieProcessor())
        self.password = ''
        self.login = 'sigizmund'
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.authorURL = ''
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 01, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 01, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.subjects.append('Harry Potter')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        self.outputName = ''
        self.outputStorySep = '-pns_'

        # Pull the story id (and note whether a specific chapter was requested)
        # out of the query string, then normalise self.url to the bare
        # viewstory.php?sid=NNN form.
        self.chapurl = False
        ss = self.url.split('?')
        if ss is not None and len(ss) > 1:
            sss = ss[1].replace('&amp;', '&').split('&')
            if sss is not None and len(sss) > 0:
                ssss = sss[0].split('=')
                if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
                    self.storyId = ssss[1]
                if len(sss) > 1:
                    ssss = sss[1].split('=')
                    if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
                        self.chapurl = True

        self.url = 'http://' + self.host + '/' + self.path + '?sid=' + self.storyId
        logging.debug('self.url=%s' % self.url)
        logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
    def _getLoginScript(self):
        return '/user.php?action=login'

    def reqLoginData(self, data):
        if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
            return True
        else:
            return False
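
    # The helpers below walk the token list produced by splitting a metadata
    # table cell on HTML tags (re.split('<[^>]+>', ...)).  They consume values
    # until they hit the next 'Label:' token, treat commas as separators, and
    # return an index from which the caller's parsing loop can resume.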
    def _fillCharacters(self, strlist, idx, maxlen):
        ii = idx
        while ii < maxlen:
            chara = strlist[ii].strip()
            if len(chara) > 0:
                if chara.find(':') != -1:
                    return (ii-1)
                elif chara.find(',') == -1:
                    self.addCharacter(chara)
            ii = ii + 1
        return (ii)

    def _buildGenre(self, strlist, idx, maxlen):
        self.genre = ''
        ii = idx
        while ii < maxlen:
            genre = strlist[ii].strip()
            if len(genre) > 0:
                if genre.find(':') != -1:
                    return (ii-1)
                elif genre.find(',') != -1:
                    genre = ', '
                else:
                    self.addSubject(genre)
                self.genre = self.genre + genre
            ii = ii + 1
        return (ii)

    def _buildCategory(self, strlist, idx, maxlen):
        self.category = ''
        ii = idx
        while ii < maxlen:
            cat = strlist[ii].strip()
            if len(cat) > 0:
                if cat.find(':') != -1:
                    return (ii-1)
                elif cat.find(',') != -1:
                    cat = ', '
                else:
                    self.addSubject(cat)
                self.category = self.category + cat
            ii = ii + 1
        return (ii)
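
    # extractIndividualUrls() drives the scrape: it fetches chapter 1 (logging
    # in first if the archive demands it), builds the (chapter-url, title) list
    # from the chapter <select> drop-down, then re-fetches the story with
    # &index=1 and mines the 'pagetitle', 'output' and 'content' divs for the
    # title, author, summary and the Rated/Chapters/Characters/... fields.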
    def extractIndividualUrls(self):
        url = self.url + '&chapter=1'
        data = self.opener.open(url).read()
        if self.reqLoginData(data):
            self.performLogin()
            data = self.opener.open(url).read()
            if self.reqLoginData(data):
                return None

        soup = bs.BeautifulStoneSoup(data)

        self.storyName = ''
        self.authorName = ''
        self.storyId = '0'

        title = soup.find('title').string
        if title is not None and len(title) > 0:
            logging.debug('Title: %s' % title)
            ss = title.split(' by ')
            if ss is not None and len(ss) > 1:
                self.storyName = ss[0].strip()
                self.authorName = ss[1].strip()

        logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
        logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))

        select = soup.find('select', { 'name' : 'chapter' } )

        result = []
        if select is None:
            # no chapters found, try url by itself.
            chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
            if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
                result.append((url, chaptitle.string))
            else:
                result.append((url, self.storyName))
        else:
            allOptions = select.findAll('option')
            for o in allOptions:
                url = self.url + "&chapter=%s" % o['value']
                title = o.string
                result.append((url, title))

        url = self.url + "&index=1"
        data = self.opener.open(url).read()

        lines = data.split('\n')

        soup = bs.BeautifulStoneSoup(data)

        pgt = soup.find('div', {'id' : 'pagetitle'})
        #logging.debug('pagetitle: %s' % pgt)
        pgtAs = pgt.findAll('a')
        #logging.debug('pgtAs: %s' % pgtAs)
        for a in pgtAs:
            if a['href'].find('viewstory.php') != -1:
                (u1, self.storyId) = a['href'].split('=')
                self.storyName = a.string
                logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
            elif a['href'].find('viewuser.php') != -1:
                self.authorName = a.string
                self.authorURL = 'http://' + self.host + '/' + a['href']
                (u1, self.authorId) = a['href'].split('=')
                logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
        output = soup.find('div', {'id' : 'output'})
        #logging.debug('output: %s' % str(output))
        if output is not None and len(str(output)) > 1:
            s2 = re.split('<[^>]+>', str(output))
            #logging.debug('s2=%s' % s2)
            ii = 0
            ll = len(s2)
            while ii < ll:
                if s2[ii] == 'Summary:' and ii+1 < ll:
                    self.storyDescription = s2[ii+1].strip()
                    logging.debug('self.storyDescription: %s' % self.storyDescription)
                    break
                ii = ii+1

        cnt = soup.find('div', {'class' : 'content'})
        #logging.debug('content: %s' % cnt)
        cnttd = cnt.findAll('td')
        #logging.debug('cnttd: %s' % cnttd)
        for td in cnttd:
            #logging.debug('td: %s' % str(td))
            ss = str(td).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
            if len(ss) > 1:
                s2 = re.split('<[^>]+>', ss)
                #logging.debug('s2=%s' % s2)
                ii = 0
                ll = len(s2)
                while ii < ll-1:
                    if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
                        skey = s2[ii].strip()
                        ii = ii+1
                        if skey == 'Rated:':
                            self.storyRating = s2[ii].strip()
                            logging.debug('self.storyRating=%s' % self.storyRating)
                            ii = ii + 1
                        elif skey == 'Chapters:':
                            self.numChapters = s2[ii].strip()
                            logging.debug('self.numChapters=%s' % self.numChapters)
                            ii = ii + 1
                        elif skey == 'Characters:':
                            ii = self._fillCharacters(s2, ii, ll)
                            logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                            ii = ii + 1
                        elif skey == 'Genres:':
                            ii = self._buildGenre(s2, ii, ll)
                            logging.debug('self.genre=%s' % self.genre)
                            logging.debug('self.subjects=%s' % self.subjects)
                        elif skey == 'Categories:':
                            ii = self._buildCategory(s2, ii, ll)
                            logging.debug('self.category=%s' % self.category)
                            logging.debug('self.subjects=%s' % self.subjects)
                        elif skey == 'Completed:':
                            if s2[ii].strip(' ') == "No":
                                self.storyStatus = 'In-Progress'
                            else:
                                self.storyStatus = 'Completed'
                            ii = ii + 1
                        elif skey == 'Word count:':
                            self.numWords = s2[ii].strip()
                            if self.numWords is None or len(self.numWords) == 0:
                                self.numWords = '0'
                            logging.debug('self.numWords=%s' % self.numWords)
                            ii = ii + 1
                        elif skey == 'Takes Place:':
                            ii = ii + 1
                        elif skey == 'Awards:':
                            ii = ii + 1
                        elif skey == 'Series:':
                            ii = ii + 1
                        elif skey == 'Read:':
                            ii = ii + 1
                        elif skey == 'Warnings:':
                            ii = ii + 1
                    else:
                        ii = ii + 1
        tls = soup.findAll('div', {'style' : 'text-align: center;'})
        for tl in tls:
            #logging.debug('tl: %s' % tl)
            ss = str(tl).replace('\n','').replace('\r','').replace('&nbsp;', ' ')
            if ss.find('Published:') != -1:
                s2 = re.split('<[^>]+>', ss)
                #logging.debug('s2: %s' % s2)
                ii = 0
                ll = len(s2)
                while ii < ll-1:
                    if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
                        skey = s2[ii].strip()
                        #logging.debug('skey: %s' % skey)
                        ii = ii+1
                        if skey == 'Published:':
                            self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
                            logging.debug('self.storyPublished=%s' % self.storyPublished)
                            ii = ii + 1
                        elif skey == 'Updated:':
                            self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
                            logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                            ii = ii + 1
                    else:
                        ii = ii + 1

        if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
            logging.error('self.storyName is empty!! Exiting!')
            sys.exit(1)

        return result
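
    # getText() accepts either an absolute chapter URL or a path relative to
    # the archive host, and returns the inner HTML of <div id="story">, or a
    # bare '<html/>' placeholder when that div cannot be found.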
    def getText(self, url):
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        logging.debug('Getting data from: %s' % url)
        data = self.opener.open(url).read()

        # need to do this, because for some reason the <br /> tag in the story causes problems
        data = data.replace('<br />', ' SOMETHING_BR ')

        soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)

        div = soup.find('div', {'id' : 'story'})
        if div is None:
            return '<html/>'

        # put the <br /> tags back in..
        text = div.__str__('utf8').replace(' SOMETHING_BR ', '<br />')
        return text
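
# Minimal usage sketch (the adapter is normally driven by the project's
# downloader front-end rather than called directly; the sid below is the
# example story from the test case):
#
#   adapter = PotionsNSnitches('http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230')
#   chapters = adapter.extractIndividualUrls()        # [(chapter_url, chapter_title), ...]
#   html_parts = [adapter.getText(u) for (u, t) in chapters]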

class PotionsNSnitches_UnitTests(unittest.TestCase):
    def setUp(self):
        logging.basicConfig(level=logging.DEBUG)

    def testLoginWorks(self):
        pass

    def testGetUrlsWorks(self):
        url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
        # The adapter under test is PotionsNSnitches, not the Twilighted class
        # this file was copied from.
        self.assertEquals(32, len(PotionsNSnitches(url).extractIndividualUrls()))


if __name__ == '__main__':
    unittest.main()