FanFicFare/ficwad.py
wsuetholz b2ea9c3e73 Changed the title page generation to allow for easily adding and removing entries from the table of information shown.
Modified ffnet.py to use the mangled story title if given instead of the name portion of the URL.  As part of this, the Name and chapter number are now optional in the URL passed in.
2010-11-10 11:18:48 -06:00

371 lines
11 KiB
Python

# -*- coding: utf-8 -*-
import os
import re
import sys
import shutil
import os.path
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import logging
import time
import datetime
from adapter import *
class FicWad(FanfictionSiteAdapter):
    """Adapter that scrapes story metadata and chapter text from ficwad.

    Typical use: construct with a story or chapter URL, call
    ``extractIndividualUrls()`` to populate the metadata attributes and get
    the chapter list, then call ``getText()`` for each chapter URL.  Most
    ``get*`` accessors read attributes that only exist after
    ``extractIndividualUrls()`` has run.
    """

    def __init__(self, url):
        """Remember *url* and derive the host name from it."""
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url = None):
        """Always return False; this adapter never asks for a login."""
        return False

    def performLogin(self, url = None):
        """Do nothing (see ``requiresLogin``)."""
        pass

    def setLogin(self, login):
        """Store the login name on the instance."""
        self.login = login

    def setPassword(self, password):
        """Store the password on the instance."""
        self.password = password

    def _addSubject(self, subject):
        """Append *subject* to ``self.subjects`` unless already present.

        The comparison is case-insensitive.  Returns True when the subject
        was added and False when a duplicate was found.
        """
        wanted = subject.upper()
        if any(existing.upper() == wanted for existing in self.subjects):
            return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append *character* to ``self.storyCharacters`` unless present.

        The comparison is case-insensitive.  Returns True when the character
        was added and False when a duplicate was found.
        """
        wanted = character.upper()
        if any(existing.upper() == wanted for existing in self.storyCharacters):
            return False
        self.storyCharacters.append(character)
        return True

    def _fetchSoup(self, url):
        """Download *url* and return it parsed by ``bs.BeautifulStoneSoup``."""
        response = u2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        return bs.BeautifulStoneSoup(data)

    def _parseDate(self, text):
        """Convert a ``YYYY/MM/DD`` string into a ``datetime.datetime``."""
        parsed = time.strptime(text.strip(' '), "%Y/%m/%d")
        return datetime.datetime.fromtimestamp(time.mktime(parsed))

    def _resetMetadata(self):
        """Set every metadata attribute to its placeholder default."""
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''

    def _parseMeta(self, meta):
        """Fill metadata attributes from the ``<p class="meta">`` element.

        The element's markup is split on ``' - '`` into fields.  A field of
        the form ``Key:value`` is dispatched on ``Key``; a field without a
        colon is checked for a trailing ``words`` to pick up the word count.
        Values are stored as the strings found in the page.
        """
        fields = str(meta).replace('\n',' ').replace('\t','').split(' - ')
        logging.debug('meta.s=%s' % fields)
        for field in fields:
            # NOTE(review): the literal below is a plain space in this copy of
            # the file, so every space is dropped and the later split(', ')
            # calls can never split.  It may originally have been an HTML
            # entity flattened by a web viewer -- confirm against upstream.
            parts = field.replace(' ','').split(':')
            if len(parts) > 1:
                # Drop a leading tag (e.g. the opening <p ...>) from the key.
                pieces = re.split('<[^>]+>', parts[0])
                if len(pieces) > 1:
                    parts[0] = pieces[1]
                skey = parts[0].strip()
                value = parts[1]
                if skey == 'Category':
                    for a in bs.BeautifulStoneSoup(value).findAll('a'):
                        # Only the first link replaces the placeholder.
                        if self.category == 'Category':
                            self.category = str(a.string)
                            logging.debug('self.category=%s' % self.category)
                        self._addSubject(self.category)
                    logging.debug('self.subjects=%s' % self.subjects)
                elif skey == 'Rating':
                    self.storyRating = value
                    logging.debug('self.storyRating=%s' % self.storyRating)
                elif skey == 'Genres':
                    self.genre = value
                    logging.debug('self.genre=%s' % self.genre)
                    for genre in value.split(', '):
                        self._addSubject(genre)
                    logging.debug('self.subjects=%s' % self.subjects)
                elif skey == 'Characters':
                    for character in value.split(', '):
                        self._addCharacter(character)
                    logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                elif skey == 'Chapters':
                    self.numChapters = value
                    logging.debug('self.numChapters=%s' % self.numChapters)
                elif skey == 'Warnings':
                    logging.debug('Warnings=%s' % value)
                elif skey == 'Published':
                    self.storyPublished = self._parseDate(value)
                    logging.debug('self.storyPublished=%s' % self.storyPublished)
                elif skey == 'Updated':
                    self.storyUpdated = self._parseDate(value)
                    logging.debug('self.storyUpdated=%s' % self.storyUpdated)
            else:
                # No colon: look for "<count>words", keeping only the text
                # before any tag.
                pieces = re.split('<[^>]+>', parts[0])
                if len(pieces) > 1:
                    parts[0] = pieces[0]
                around_w = parts[0].split('w')
                if len(around_w) > 1 and around_w[1] == 'ords':
                    self.numWords = around_w[0]
                    logging.debug('self.numWords=%s' % self.numWords)

    def _addChapter(self, result, url, title):
        """Append ``(url, title)`` to *result* unless it is the story index."""
        # ficwad includes 'Story Index' in the dropdown of chapters,
        # but it's not a real chapter.
        if title != "Story Index":
            logging.debug('Chapter[%s]=%s, %s' % (len(result) + 1, url, title))
            result.append((url,title))
        else:
            logging.debug('Skipping Story Index. URL %s' % url)

    def _listChapters(self, soup):
        """Return the ``(url, title)`` chapter list found in *soup*.

        The ``<ul id="storylist">`` headings are tried first; when they yield
        nothing the ``goto`` dropdown is used, and when that is missing too
        the story is treated as a one-shot located at ``self.url``.
        """
        result = []

        storylist = soup.find('ul', {'id' : 'storylist'})
        if storylist is not None:
            for h4 in storylist.findAll('h4'):
                chapterinfo = h4.find('a')
                url = 'http://' + self.host + chapterinfo['href']
                self._addChapter(result, url, chapterinfo.string.strip())

        if not result:
            select = soup.find('select', { 'name' : 'goto' } )
            if select is None:
                logging_index = len(result) + 1
                result.append((self.url,self.storyName))
                logging.debug('Chapter[%s]=%s %s' % (logging_index, self.url, self.storyName))
            else:
                for option in select.findAll('option'):
                    url = 'http://' + self.host + option['value']
                    self._addChapter(result, url, option.string)

        return result

    def extractIndividualUrls(self):
        """Scrape the story page and return its chapters.

        Resets all metadata attributes to defaults, downloads ``self.url``
        (switching to the story title page when the URL turns out to be a
        chapter URL), fills in name, author, description and the ``meta``
        fields, and sets ``self.uuid``.

        Returns a list of ``(url, title)`` tuples, one per chapter.
        """
        self._resetMetadata()
        oldurl = ''

        soup = self._fetchSoup(self.url)
        story = soup.find('div', {'id' : 'story'})
        crumbtrail = story.find('h3') # the only h3 ficwad uses.
        # last of crumbtrail
        storyinfo = crumbtrail.findAll('a')[-1]
        (u0, u1, storyid) = storyinfo['href'].split('/')
        if u1 == "story":
            # This page does not have the correct information on it.. Need to get the Story Title Page
            logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
            oldurl = self.url
            self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
            soup = self._fetchSoup(self.url)
            story = soup.find('div', {'id' : 'story'})

        # save chapter name from header in case of one-shot.
        storyinfo = story.find('h4').find('a')
        (u0, u1, self.storyId) = storyinfo['href'].split('/')
        self.storyName = storyinfo.string.strip()
        self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
        logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))

        author = soup.find('span', {'class' : 'author'})
        self.authorName = str(author.a.string)
        (u0, u1, self.authorId) = author.a['href'].split('/')
        self.authorURL = 'http://' + self.host + author.a['href']
        logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))

        description = soup.find('blockquote', {'class' : 'summary'})
        if description is not None:
            self.storyDescription = str(description.p.string)
        logging.debug('self.storyDescription=%s' % self.storyDescription)

        meta = soup.find('p', {'class' : 'meta'})
        if meta is not None:
            self._parseMeta(meta)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        if oldurl:
            # The caller supplied a chapter URL: read the chapter list from
            # that original page rather than from the title page.
            soup = self._fetchSoup(oldurl)
        result = self._listChapters(soup)

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)

        return result

    def getHost(self):
        """Return the host name taken from the story URL."""
        logging.debug('self.host=%s' % self.host)
        return self.host

    def getStoryName(self):
        """Return the story title."""
        return self.storyName

    def getOutputName(self):
        """Return the output name built from the title and story id."""
        return self.outputName

    def getAuthorName(self):
        """Return the author's name."""
        return self.authorName

    def getText(self, url):
        """Download a chapter and return its ``<div id="storytext">`` markup.

        A *url* that does not contain ``http://`` is treated as a path on
        ``self.host``.  When the page has no story text an error is logged
        and the process exits with status 20 (``SystemExit``).
        """
        if 'http://' not in url:
            url = 'http://' + self.host + '/' + url

        soup = self._fetchSoup(url)
        div = soup.find('div', {'id' : 'storytext'})
        if div is None:
            logging.error("Error downloading Chapter: %s" % url)
            # sys.exit raises the same SystemExit(20) as the site-provided
            # exit() builtin, but is available in every interpreter setup.
            sys.exit(20)
        return div.__str__('utf8')

    def getStoryURL(self):
        """Return the story URL (the title page after extraction)."""
        logging.debug('self.url=%s' % self.url)
        return self.url

    def getAuthorURL(self):
        """Return the absolute URL of the author's page."""
        logging.debug('self.authorURL=%s' % self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the ``urn:uuid:`` identifier built from host, author and story ids."""
        logging.debug('self.uuid=%s' % self.uuid)
        return self.uuid

    def getAuthorId(self):
        """Return the author id."""
        logging.debug('self.authorId=%s' % self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id."""
        logging.debug('self.storyId=%s' % self.storyId)
        return self.storyId

    def getStoryDescription(self):
        """Return the story summary."""
        logging.debug('self.storyDescription=%s' % self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the publication date."""
        logging.debug('self.storyPublished=%s' % self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Set ``self.storyCreated`` to the current time and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s' % self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the last-updated date."""
        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s' % self.language)
        return self.language

    def getLanguageId(self):
        """Return the language code."""
        logging.debug('self.languageId=%s' % self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects collected for the story."""
        logging.debug('self.subjects=%s' % self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher (set to the host during extraction)."""
        logging.debug('self.publisher=%s' % self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the chapter count as found in the page metadata."""
        logging.debug('self.numChapters=%s' % self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count as found in the page metadata."""
        logging.debug('self.numWords=%s' % self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the story category."""
        logging.debug('self.category=%s' % self.category)
        return self.category

    def getGenre(self):
        """Return the genre string."""
        logging.debug('self.genre=%s' % self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status."""
        logging.debug('self.storyStatus=%s' % self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating."""
        logging.debug('self.storyRating=%s' % self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s' % self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return *url* unchanged."""
        return url

    def getPasswordLine(self):
        """Return the fixed placeholder string for the password line."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return the fixed placeholder string for the login script."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return a ``(login, other)`` pair of dicts of login form values."""
        login = dict(login = 'name', password = 'pass')
        other = dict(submit = 'Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        """Return the list of characters collected for the story."""
        logging.debug('self.storyCharacters=%s' % self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name (empty string by default)."""
        logging.debug('self.storySeries=%s' % self.storySeries)
        return self.storySeries
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story and print the
    # text of the first one.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() expects a chapter URL (not downloaded page contents), so pass
    # the URL half of the first (url, title) entry.
    if urls:
        print(fw.getText(urls[0][0]))