mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-13 12:26:02 +01:00
Fix a couple of problems with fpcom.py. Rearrange the Genre and Category for some of the adapters.
334 lines
13 KiB
Python
334 lines
13 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copied from the twilighted.py because site is almost the same..
|
|
# of course, now that we're trying to scrape more detail about the
|
|
# story, there were differences in how headers are displayed
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import shutil
|
|
import os.path
|
|
import urllib as u
|
|
import logging
|
|
import pprint as pp
|
|
import unittest
|
|
import urllib2 as u2
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
import time
|
|
import datetime
|
|
|
|
from adapter import *
|
|
|
|
class PotionsNSnitches(FanfictionSiteAdapter):
|
|
def __init__(self, url):
|
|
self.url = url
|
|
parsedUrl = up.urlparse(url)
|
|
self.host = parsedUrl.netloc
|
|
self.path = parsedUrl.path
|
|
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
|
|
self.password = ''
|
|
self.login='sigizmund'
|
|
self.storyDescription = 'Fanfiction Story'
|
|
self.authorId = '0'
|
|
self.authorURL = ''
|
|
self.storyId = '0'
|
|
self.storyPublished = datetime.date(1970, 01, 31)
|
|
self.storyCreated = datetime.datetime.now()
|
|
self.storyUpdated = datetime.date(1970, 01, 31)
|
|
self.languageId = 'en-UK'
|
|
self.language = 'English'
|
|
self.subjects = []
|
|
self.subjects.append ('fanfiction')
|
|
self.subjects.append ('Harry Potter')
|
|
self.publisher = self.host
|
|
self.numChapters = 0
|
|
self.numWords = 0
|
|
self.genre = 'FanFiction'
|
|
self.category = 'Category'
|
|
self.storyStatus = 'In-Progress'
|
|
self.storyRating = 'PG'
|
|
self.storyUserRating = '0'
|
|
self.storyCharacters = []
|
|
self.storySeries = ''
|
|
self.outputName = ''
|
|
self.outputStorySep = '-pns_'
|
|
|
|
self.chapurl = False
|
|
ss=self.url.split('?')
|
|
if ss is not None and len(ss) > 1:
|
|
sss = ss[1].replace('&','&').split('&')
|
|
if sss is not None and len(sss) > 0:
|
|
ssss = sss[0].split('=')
|
|
if ssss is not None and len(ssss) > 1 and ssss[0] == 'sid':
|
|
self.storyId = ssss[1]
|
|
if len(sss) > 1:
|
|
ssss = sss[1].split('=')
|
|
if ssss is not None and len(ssss) > 1 and ssss[0] == 'chapter':
|
|
self.chapurl = True
|
|
|
|
self.url = 'http://' + self.host + '/' + self.path + '?sid=' + self.storyId
|
|
logging.debug('self.url=%s' % self.url)
|
|
|
|
logging.debug("Created PotionsNSnitches: url=%s" % (self.url))
|
|
|
|
|
|
def _getLoginScript(self):
|
|
return '/user.php?action=login'
|
|
|
|
def reqLoginData(self, data):
|
|
if data.find('Registered Users Only. Please click OK to login or register.') != -1 or data.find('There is no such account on our website') != -1:
|
|
return True
|
|
else:
|
|
return False
|
|
|
|
def _fillCharacters(self, strlist, idx, maxlen):
|
|
ii = idx
|
|
while ii < maxlen:
|
|
chara = strlist[ii].strip()
|
|
if len(chara) > 0:
|
|
if chara.find(':') != -1:
|
|
return (ii-1)
|
|
elif chara.find(',') == -1:
|
|
self.addCharacter (chara)
|
|
ii = ii + 1
|
|
return (ii)
|
|
|
|
def _buildGenre(self, strlist, idx, maxlen):
|
|
self.genre = ''
|
|
ii = idx
|
|
while ii < maxlen:
|
|
genre = strlist[ii].strip()
|
|
if len(genre) > 0:
|
|
if genre.find(':') != -1:
|
|
return (ii-1)
|
|
elif genre.find(',') != -1:
|
|
genre = ', '
|
|
else:
|
|
self.addSubject (genre)
|
|
self.genre = self.genre + genre
|
|
ii = ii + 1
|
|
return (ii)
|
|
|
|
def _buildCategory(self, strlist, idx, maxlen):
|
|
self.category = ''
|
|
ii = idx
|
|
while ii < maxlen:
|
|
cat = strlist[ii].strip()
|
|
if len(cat) > 0:
|
|
if cat.find(':') != -1:
|
|
return (ii-1)
|
|
elif cat.find(',') != -1:
|
|
cat = ', '
|
|
else:
|
|
self.addSubject (cat)
|
|
self.category = self.category + cat
|
|
ii = ii + 1
|
|
return (ii)
|
|
|
|
def extractIndividualUrls(self):
|
|
url = self.url + '&chapter=1'
|
|
data = self.opener.open(url).read()
|
|
|
|
if self.reqLoginData(data):
|
|
self.performLogin()
|
|
data = self.opener.open(url).read()
|
|
if self.reqLoginData(data):
|
|
return None
|
|
|
|
soup = bs.BeautifulStoneSoup(data)
|
|
|
|
self.storyName = ''
|
|
self.authorName = ''
|
|
self.storyId = '0'
|
|
title = soup.find('title').string
|
|
if title is not None and len(title) > 0:
|
|
logging.debug('Title: %s' % title)
|
|
ss = title.split(' by ')
|
|
if ss is not None and len(ss) > 1:
|
|
self.storyName = ss[0].strip()
|
|
self.authorName = ss[1].strip()
|
|
|
|
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
|
logging.debug('self.authorId=%s, self.authorName=%s' % (self.authorId, self.authorName))
|
|
|
|
select = soup.find('select', { 'name' : 'chapter' } )
|
|
|
|
result = []
|
|
if select is None:
|
|
# no chapters found, try url by itself.
|
|
chaptitle = soup.find('div', { 'id' : 'chaptertitle' } )
|
|
if chaptitle is not None and chaptitle.string is not None and len(chaptitle.string) > 0:
|
|
result.append((url,chaptitle.string))
|
|
else:
|
|
result.append((url,self.storyName))
|
|
else:
|
|
allOptions = select.findAll('option')
|
|
for o in allOptions:
|
|
url = self.url + "&chapter=%s" % o['value']
|
|
title = o.string
|
|
result.append((url,title))
|
|
|
|
url = self.url + "&index=1"
|
|
data = self.opener.open(url).read()
|
|
lines = data.split('\n')
|
|
soup = bs.BeautifulStoneSoup(data)
|
|
pgt = soup.find('div', {'id' : 'pagetitle'})
|
|
#logging.debug('pagetitle: %s' % pgt)
|
|
pgtAs = pgt.findAll('a')
|
|
#logging.debug('pgtAs: %s' % pgtAs)
|
|
for a in pgtAs:
|
|
if a['href'].find('viewstory.php') != -1:
|
|
(u1, self.storyId) = a['href'].split('=')
|
|
self.storyName = a.string
|
|
logging.debug('self.storyId=%s, self.storyName=%s' % (self.storyId, self.storyName))
|
|
elif a['href'].find('viewuser.php') != -1:
|
|
self.authorName = a.string
|
|
self.authorURL = 'http://' + self.host + '/' + a['href']
|
|
(u1, self.authorId) = a['href'].split('=')
|
|
logging.debug('self.authorName=%s, self.authorId=%s' % (self.authorName, self.authorId))
|
|
|
|
output = soup.find('div', {'id' : 'output'})
|
|
#logging.debug('output: %s' % str(output))
|
|
if output is not None and len(str(output)) > 1:
|
|
s2 = re.split ('<[^>]+>', str(output))
|
|
#logging.debug('s2=%s' % s2)
|
|
ii = 0
|
|
ll = len(s2)
|
|
while ii < ll:
|
|
if s2[ii] == 'Summary:' and ii+1 < ll:
|
|
self.storyDescription = s2[ii+1].strip()
|
|
logging.debug('self.storyDescription: %s' % self.storyDescription)
|
|
break;
|
|
ii = ii+1
|
|
|
|
cnt = soup.find('div', {'class' : 'content'})
|
|
#logging.debug('content: %s' % cnt)
|
|
cnttd = cnt.findAll('td')
|
|
#logging.debug('cnttd: %s' % cnttd)
|
|
for td in cnttd:
|
|
#logging.debug('td: %s' % str(td))
|
|
ss = str(td).replace('\n','').replace('\r','').replace(' ', ' ')
|
|
if len(ss) > 1:
|
|
s2 = re.split ('<[^>]+>', ss)
|
|
#logging.debug('s2=%s' % s2)
|
|
ii = 0
|
|
ll = len(s2)
|
|
while ii < ll-1:
|
|
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
|
skey = s2[ii].strip()
|
|
ii = ii+1
|
|
if skey == 'Rated:':
|
|
self.storyRating = s2[ii].strip()
|
|
logging.debug('self.storyRating=%s' % self.storyRating)
|
|
ii = ii + 1
|
|
elif skey == 'Chapters:':
|
|
self.numChapters = s2[ii].strip()
|
|
logging.debug('self.numChapters=%s' % self.numChapters)
|
|
ii = ii + 1
|
|
elif skey == 'Characters:':
|
|
ii = self._fillCharacters(s2, ii, ll)
|
|
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
|
ii = ii + 1
|
|
elif skey == 'Genres:':
|
|
ii = self._buildGenre(s2, ii, ll)
|
|
logging.debug('self.genre=%s' % self.genre)
|
|
logging.debug('self.subjects=%s' % self.subjects)
|
|
elif skey == 'Categories:':
|
|
ii = self._buildCategory(s2, ii, ll)
|
|
logging.debug('self.category=%s' % self.category)
|
|
logging.debug('self.subjects=%s' % self.subjects)
|
|
elif skey == 'Completed:':
|
|
if s2[ii].strip(' ') == "No":
|
|
self.storyStatus = 'In-Progress'
|
|
else:
|
|
self.storyStatus = 'Completed'
|
|
ii = ii + 1
|
|
elif skey == 'Word count:':
|
|
self.numWords = s2[ii].strip()
|
|
if self.numWords is None or len(self.numWords) == 0:
|
|
self.numWords = '0'
|
|
logging.debug('self.numWords=%s' % self.numWords)
|
|
ii = ii + 1
|
|
elif skey == 'Takes Place:':
|
|
ii = ii + 1
|
|
elif skey == 'Awards:':
|
|
ii = ii + 1
|
|
elif skey == 'Series:':
|
|
ii = ii + 1
|
|
elif skey == 'Read:':
|
|
ii = ii + 1
|
|
elif skey == 'Warnings:':
|
|
ii = ii + 1
|
|
else:
|
|
ii = ii + 1
|
|
|
|
tls = soup.findAll('div', {'style' : 'text-align: center;'})
|
|
for tl in tls:
|
|
#logging.debug('tl: %s' % tl)
|
|
ss = str(tl).replace('\n','').replace('\r','').replace(' ', ' ')
|
|
if ss.find('Published:') != -1:
|
|
s2 = re.split ('<[^>]+>', ss)
|
|
#logging.debug('s2: %s' % s2)
|
|
ii = 0
|
|
ll = len(s2)
|
|
while ii < ll-1:
|
|
if s2[ii] is not None and len(s2[ii]) > 0 and s2[ii].find(':') != -1:
|
|
skey = s2[ii].strip()
|
|
#logging.debug('skey: %s' % skey)
|
|
ii = ii+1
|
|
if skey == 'Published:':
|
|
self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
|
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
|
ii = ii + 1
|
|
elif skey == 'Updated:':
|
|
self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s2[ii].strip(' '), "%b %d %Y")))
|
|
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
|
ii = ii + 1
|
|
else:
|
|
ii = ii + 1
|
|
|
|
if (self.storyName is None or len(self.storyName) == 0) and self.storyId == '0':
|
|
logging.error('self.storyName is empty!! Exitting!')
|
|
exit(1)
|
|
|
|
return result
|
|
|
|
def getText(self, url):
|
|
if url.find('http://') == -1:
|
|
url = 'http://' + self.host + '/' + url
|
|
|
|
logging.debug('Getting data from: %s' % url)
|
|
|
|
data = self.opener.open(url).read()
|
|
|
|
# need to do this, because for some reason the <br /> tag in the story causes problems
|
|
data = data.replace('<br />', ' SOMETHING_BR ')
|
|
soup = bs.BeautifulStoneSoup(data, convertEntities=bs.BeautifulStoneSoup.HTML_ENTITIES)
|
|
|
|
div = soup.find('div', {'id' : 'story'})
|
|
|
|
if None == div:
|
|
return '<html/>'
|
|
|
|
# put the <br /> tags back in..
|
|
text = div.__str__('utf8').replace(' SOMETHING_BR ','<br />')
|
|
return text
|
|
|
|
|
|
class PotionsNSnitches_UnitTests(unittest.TestCase):
|
|
def setUp(self):
|
|
logging.basicConfig(level=logging.DEBUG)
|
|
pass
|
|
|
|
def testLoginWorks(self):
|
|
pass
|
|
|
|
def testGetUrlsWorks(self):
|
|
url = 'http://potionsandsnitches.net/fanfiction/viewstory.php?sid=2230'
|
|
self.assertEquals(32, len(Twilighted(url).extractIndividualUrls()))
|
|
|
|
if __name__ == '__main__':
|
|
unittest.main()
|