FanFicFare/ficwad.py
wsuetholz 33d1fabd11 Changeset Tag Branch User Date Summary
53:c93e07566456			wsuetholz	2010-11-10 10:11	Ficwad wants you to log in in order to view some stories. The old ficwad.py got around that by starting with the first chapter instead of the story index page. Since I needed the story index page, I had changed it to switch to that page and then scrape the chapter information from there, which doesn't work if the chapters are blocked. While it still won't work if you pass in the URL for the story index page, I now switch back to the page that you passed in originally when looking for the chapters to download. The one problem I have with this is that I change self.url to the story index page, which should probably remain so that we have a consistent self.url even if the user starts with chapter 9 this time instead of chapter 1.
2010-11-10 10:18:46 -06:00

367 lines
11 KiB
Python

# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
import time
import datetime
from adapter import *
class FicWad(FanfictionSiteAdapter):
    """Site adapter that scrapes story metadata and chapters from ficwad.com.

    Construct it with a story or chapter URL, call extractIndividualUrls()
    to populate the metadata attributes and obtain the chapter list, then
    call getText() once per chapter URL.  The get*() accessors only return
    meaningful values after extractIndividualUrls() has run, because that
    is where the attributes they read are created.
    """

    def __init__(self, url):
        """Remember the URL to scrape and derive the host name from it."""
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url=None):
        """Always report that no login is needed (the url is ignored)."""
        return False

    def performLogin(self, url=None):
        """Do nothing; this adapter never logs in."""
        pass

    def setLogin(self, login):
        """Store the login name on the adapter."""
        self.login = login

    def setPassword(self, password):
        """Store the password on the adapter."""
        self.password = password

    def _addSubject(self, subject):
        """Append subject to self.subjects unless already present.

        The comparison is case-insensitive.  Returns True when the subject
        was added and False when an equal entry already existed.
        """
        subj = subject.upper()
        for s in self.subjects:
            if s.upper() == subj:
                return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append character to self.storyCharacters unless already present.

        The comparison is case-insensitive.  Returns True when the
        character was added and False when an equal entry already existed.
        """
        chara = character.upper()
        for c in self.storyCharacters:
            if c.upper() == chara:
                return False
        self.storyCharacters.append(character)
        return True

    def _parseDate(self, text):
        """Convert a 'YYYY/MM/DD' string (spaces trimmed) to a datetime."""
        parsed = time.strptime(text.strip(' '), "%Y/%m/%d")
        return datetime.datetime.fromtimestamp(time.mktime(parsed))

    def extractIndividualUrls(self):
        """Scrape the story metadata and return the list of chapters.

        Downloads self.url.  When the last breadcrumb link of that page
        points at '/story/<id>', the page is treated as a chapter page:
        self.url is switched to the story title page for the metadata, and
        the originally supplied page is downloaded again afterwards to
        look for the chapter list.

        Returns a list of (url, title) tuples, one per chapter.  If no
        chapter list or chapter drop-down is found, the list holds the
        single entry (self.url, self.storyName).

        Raises whatever urllib2 raises on download errors, and
        AttributeError / ValueError / IndexError when the page does not
        have the expected structure.
        """
        # Defaults, used for every value the page does not supply.
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        oldurl = ''

        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        story = soup.find('div', {'id': 'story'})
        crumbtrail = story.find('h3')  # the only h3 ficwad uses.
        allAhrefs = crumbtrail.findAll('a')
        # last of crumbtrail
        storyinfo = allAhrefs[-1]
        (u0, u1, storyid) = storyinfo['href'].split('/')
        if u1 == "story":
            # This page does not have the correct information on it.. Need to get the Story Title Page
            logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
            oldurl = self.url
            self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
            data = u2.urlopen(self.url).read()
            soup = bs.BeautifulStoneSoup(data)
            story = soup.find('div', {'id': 'story'})

        # save chapter name from header in case of one-shot.
        storyinfo = story.find('h4').find('a')
        (u0, u1, self.storyId) = storyinfo['href'].split('/')
        self.storyName = storyinfo.string.strip()
        self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
        logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))

        author = soup.find('span', {'class': 'author'})
        self.authorName = str(author.a.string)
        (u0, u1, self.authorId) = author.a['href'].split('/')
        self.authorURL = 'http://' + self.host + author.a['href']
        logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))

        description = soup.find('blockquote', {'class': 'summary'})
        if description is not None:
            self.storyDescription = str(description.p.string)
        logging.debug('self.storyDescription=%s' % self.storyDescription)

        meta = soup.find('p', {'class': 'meta'})
        if meta is not None:
            # The meta paragraph is a ' - ' separated list of 'Key: value'
            # items plus a bare '<n> words' item.
            s = str(meta).replace('\n', ' ').replace('\t', '').split(' - ')
            logging.debug('meta.s=%s' % s)
            for ss in s:
                # Drop the non-breaking-space entity only.  Plain spaces
                # must survive: the ', ' splits below and the <a href=...>
                # markup in the Category value depend on them.
                # NOTE(review): SOURCE showed ' ' here, which contradicts
                # the rest of this parser; presumably '&nbsp;' was decoded
                # when the file was rendered - confirm against the repo.
                s1 = ss.replace('&nbsp;', '').split(':')
                #logging.debug('meta.s.s1=%s' % s1)
                if len(s1) > 1:
                    # Strip a leading tag from the key, if there is one.
                    s2 = re.split('<[^>]+>', s1[0])
                    #logging.debug('meta.s.s1.s2=%s' % s2)
                    if len(s2) > 1:
                        s1[0] = s2[1]
                    skey = s1[0].strip()
                    #logging.debug('Checking = %s' % skey)
                    if skey == 'Category':
                        soup1 = bs.BeautifulStoneSoup(s1[1])
                        allAs = soup1.findAll('a')
                        for a in allAs:
                            # Only the first link replaces the default.
                            if self.category == 'Category':
                                self.category = str(a.string)
                                logging.debug('self.category=%s' % self.category)
                            self._addSubject(self.category)
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif skey == 'Rating':
                        self.storyRating = s1[1]
                        logging.debug('self.storyRating=%s' % self.storyRating)
                    elif skey == 'Genres':
                        self.genre = s1[1]
                        logging.debug('self.genre=%s' % self.genre)
                        s2 = s1[1].split(', ')
                        for ss2 in s2:
                            self._addSubject(ss2)
                        logging.debug('self.subjects=%s' % self.subjects)
                    elif skey == 'Characters':
                        s2 = s1[1].split(', ')
                        for ss2 in s2:
                            self._addCharacter(ss2)
                        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                    elif skey == 'Chapters':
                        # Kept as the scraped string, not converted to int.
                        self.numChapters = s1[1]
                        logging.debug('self.numChapters=%s' % self.numChapters)
                    elif skey == 'Warnings':
                        logging.debug('Warnings=%s' % s1[1])
                    elif skey == 'Published':
                        self.storyPublished = self._parseDate(s1[1])
                        logging.debug('self.storyPublished=%s' % self.storyPublished)
                    elif skey == 'Updated':
                        self.storyUpdated = self._parseDate(s1[1])
                        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                else:
                    # No colon: look for the '<n> words' item by splitting
                    # on 'w' and checking that the remainder is 'ords'.
                    s3 = re.split('<[^>]+>', s1[0])
                    #logging.debug('meta.s.s1.s3=%s' % s3)
                    if len(s3) > 1:
                        s1[0] = s3[0]
                    s4 = s1[0].split('w')
                    #logging.debug('meta.s.s1.s4=%s' % s4)
                    if len(s4) > 1 and s4[1] == 'ords':
                        self.numWords = s4[0]
                        logging.debug('self.numWords=%s' % self.numWords)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        result = []
        ii = 1  # 1-based number of the next chapter; 1 means none found yet

        if oldurl:
            # Look for chapters on the page the caller originally supplied.
            data = u2.urlopen(oldurl).read()
            soup = bs.BeautifulStoneSoup(data)

        storylist = soup.find('ul', {'id': 'storylist'})
        if storylist is not None:
            for h4 in storylist.findAll('h4'):
                chapterinfo = h4.find('a')
                #logging.debug('Chapter1=%s' % chapterinfo)
                url = 'http://' + self.host + chapterinfo['href']
                title = chapterinfo.string.strip()
                #logging.debug('Chapter=%s, %s' % (url, title))
                # ficwad includes 'Story Index' in the dropdown of chapters,
                # but it's not a real chapter.
                if title != "Story Index":
                    logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
                    result.append((url, title))
                    ii = ii + 1
                else:
                    logging.debug('Skipping Story Index. URL %s' % url)

        if ii == 1:
            # Nothing found in the story list: fall back to the chapter
            # drop-down, or to a single-chapter story.
            select = soup.find('select', {'name': 'goto'})
            if select is None:
                result.append((self.url, self.storyName))
                logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
            else:
                allOptions = select.findAll('option')
                for o in allOptions:
                    url = 'http://' + self.host + o['value']
                    title = o.string
                    # ficwad includes 'Story Index' in the dropdown of chapters,
                    # but it's not a real chapter.
                    if title != "Story Index":
                        logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
                        result.append((url, title))
                        ii = ii + 1
                    else:
                        logging.debug('Skipping Story Index. URL %s' % url)

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)
        return result

    def getStoryName(self):
        """Return the story title scraped by extractIndividualUrls()."""
        return self.storyName

    def getOutputName(self):
        """Return the output name built from the story title and id."""
        return self.outputName

    def getAuthorName(self):
        """Return the author name scraped by extractIndividualUrls()."""
        return self.authorName

    def getText(self, url):
        """Download one chapter and return its 'storytext' div as UTF-8 markup.

        A url that does not contain 'http://' is taken as a path on
        self.host.  When the page has no div with id 'storytext', the
        error is logged and the process exits with status 20 (SystemExit).
        """
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url
        data = u2.urlopen(url).read()
        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id': 'storytext'})
        if div is None:
            logging.error("Error downloading Chapter: %s" % url)
            # sys.exit instead of the site-provided exit() builtin, which
            # is not guaranteed to exist (e.g. under python -S).
            sys.exit(20)
        return div.__str__('utf8')

    def getStoryURL(self):
        """Return self.url (the title page URL once a chapter URL was resolved)."""
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getAuthorURL(self):
        """Return the absolute URL of the author's page."""
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the urn:uuid string built from host, author id and story id."""
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getAuthorId(self):
        """Return the author id taken from the author link."""
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id taken from the story link."""
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getStoryDescription(self):
        """Return the story summary, or the default description."""
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the publication date, or the 1970 default."""
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Reset the creation time to now and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the last-updated date, or the 1970 default."""
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        """Return the language identifier."""
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects collected for the story."""
        # Log the subjects themselves (the author name was logged before).
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher, which is the site host name."""
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the chapter count (0, or the string scraped from the page)."""
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count (0, or the string scraped from the page)."""
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the story category, or the default 'Category'."""
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        """Return the genre text, or the default 'FanFiction'."""
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status."""
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating, or the default 'PG'."""
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return url unchanged."""
        return url

    def getPasswordLine(self):
        """Return the fixed placeholder string 'opaopapassword'."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return the fixed placeholder string 'opaopaloginscript'."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return (login, other): two dicts of login form field values."""
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        """Return the list of characters collected for the story."""
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name (empty string by default)."""
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries
if __name__ == '__main__':
    # Manual smoke test: list the chapters of one story, then print the
    # text of the first chapter.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() takes a chapter URL; each entry of urls is (url, title).
    # Passing the downloaded page contents here, as before, was a bug.
    if urls:
        print(fw.getText(urls[0][0]))