mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-29 20:24:55 +01:00
53:c93e07566456 wsuetholz 2010-11-10 10:11 Ficwad wants you to login in order to view some stories. The old ficwad.py got around that by starting with the first chapter instead of the story index page. Since I needed the story index page, I had changed it to switch to that page and then scrape the chapter information from there, which doesn't work if the chapters are blocked. While it still won't work if you pass in the URL for the story index page, I now switch back to the page that you passed in originally when looking for the chapters to download. The one problem I have with this is that I change self.url to the story index page, which should probably remain so that we have a consistent self.url even if the user starts with chapter 9 this time instead of chapter 1.
367 lines
11 KiB
Python
367 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import shutil
|
|
import os.path
|
|
import urllib as u
|
|
import pprint as pp
|
|
import urllib2 as u2
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
import logging
|
|
import time
|
|
import datetime
|
|
|
|
from adapter import *
|
|
|
|
class FicWad(FanfictionSiteAdapter):
|
|
def __init__(self, url):
|
|
self.url = url
|
|
self.host = up.urlparse(url).netloc
|
|
|
|
def requiresLogin(self, url = None):
|
|
return False
|
|
|
|
def performLogin(self, url = None):
|
|
pass
|
|
|
|
def setLogin(self, login):
|
|
self.login = login
|
|
|
|
def setPassword(self, password):
|
|
self.password = password
|
|
|
|
def _addSubject(self, subject):
|
|
subj = subject.upper()
|
|
for s in self.subjects:
|
|
if s.upper() == subj:
|
|
return False
|
|
self.subjects.append(subject)
|
|
return True
|
|
|
|
def _addCharacter(self, character):
|
|
chara = character.upper()
|
|
for c in self.storyCharacters:
|
|
if c.upper() == chara:
|
|
return False
|
|
self.storyCharacters.append(character)
|
|
return True
|
|
|
|
    def extractIndividualUrls(self):
        """Scrape story metadata from self.url and return the chapter list.

        Returns a list of (chapter_url, chapter_title) tuples.  As a side
        effect, populates the story* attributes that the get*() accessors
        report.  self.url may be rewritten to the story index page when the
        caller passed a chapter URL (the original page is re-fetched later
        for chapter discovery -- some story indexes are login-blocked).
        """
        # Defaults for every metadata field, used when the page omits one.
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.storyId = '0'
        self.storyPublished = datetime.date(1970, 01, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 01, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append ('fanfiction')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''
        # Remembers the URL the caller originally passed when self.url gets
        # switched over to the story index page below.
        oldurl = ''

        data = u2.urlopen(self.url).read()
        soup = bs.BeautifulStoneSoup(data)

        # The crumbtrail's last link tells us what kind of page this is.
        story = soup.find('div', {'id' : 'story'})
        crumbtrail = story.find('h3') # the only h3 ficwad uses.
        allAhrefs = crumbtrail.findAll('a')
        # last of crumbtrail
        storyinfo = allAhrefs[-1]
        (u0, u1, storyid) = storyinfo['href'].split('/')
        if u1 == "story":
            # This page does not have the correct information on it.. Need to get the Story Title Page
            logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.' % (self.url, self.host, u1, storyid))
            oldurl = self.url
            self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
            data = u2.urlopen(self.url).read()
            soup = bs.BeautifulStoneSoup(data)

            story = soup.find('div', {'id' : 'story'})
            crumbtrail = story.find('h3') # the only h3 ficwad uses.
            allAhrefs = crumbtrail.findAll('a')

        # save chapter name from header in case of one-shot.
        storyinfo = story.find('h4').find('a')
        (u0, u1, self.storyId) = storyinfo['href'].split('/')
        self.storyName = storyinfo.string.strip()
        self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId

        logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s' % (self.storyName, self.storyId, self.outputName))

        # Author name/id/profile URL come from the author link's href.
        author = soup.find('span', {'class' : 'author'})
        self.authorName = str(author.a.string)
        (u0, u1,self.authorId) = author.a['href'].split('/')
        self.authorURL = 'http://' + self.host + author.a['href']
        logging.debug('self.authorName=%s self.authorId=%s' % (self.authorName, self.authorId))

        description = soup.find('blockquote', {'class' : 'summary'})
        if description is not None:
            self.storyDescription = str(description.p.string)
            logging.debug('self.storyDescription=%s' % self.storyDescription)

        # The 'meta' paragraph is a ' - '-separated list of 'Key: value'
        # fields (Category, Rating, Genres, ...) plus a bare word count.
        meta = soup.find('p', {'class' : 'meta'})
        if meta is not None:
            s = str(meta).replace('\n',' ').replace('\t','').split(' - ')
            logging.debug('meta.s=%s' % s)
            for ss in s:
                s1 = ss.replace(' ','').split(':')
                #logging.debug('meta.s.s1=%s' % s1)
                if len(s1) > 1:
                    # Strip any HTML tags wrapped around the key text.
                    s2 = re.split ('<[^>]+>', s1[0])
                    #logging.debug('meta.s.s1.s2=%s' % s2)
                    if len(s2) > 1:
                        s1[0] = s2[1]
                    skey = s1[0].strip()
                    #logging.debug('Checking = %s' % skey)
                    if skey == 'Category':
                        # The value is HTML with one <a> per category link.
                        soup1 = bs.BeautifulStoneSoup(s1[1])
                        allAs = soup1.findAll('a')
                        for a in allAs:
                            if self.category == 'Category':
                                # First link wins; 'Category' is the unset default.
                                self.category = str(a.string)
                                logging.debug('self.category=%s' % self.category)
                            self._addSubject(self.category)
                            logging.debug('self.subjects=%s' % self.subjects)
                    elif skey == 'Rating':
                        self.storyRating = s1[1]
                        logging.debug('self.storyRating=%s' % self.storyRating)
                    elif skey == 'Genres':
                        self.genre = s1[1]
                        logging.debug('self.genre=%s' % self.genre)
                        # Each genre also becomes a subject keyword.
                        s2 = s1[1].split(', ')
                        for ss2 in s2:
                            self._addSubject(ss2)
                            logging.debug('self.subjects=%s' % self.subjects)
                    elif skey == 'Characters':
                        s2 = s1[1].split(', ')
                        for ss2 in s2:
                            self._addCharacter(ss2)
                            logging.debug('self.storyCharacters=%s' % self.storyCharacters)
                    elif skey == 'Chapters':
                        # NOTE(review): stored as the scraped string, not an int.
                        self.numChapters = s1[1]
                        logging.debug('self.numChapters=%s' % self.numChapters)
                    elif skey == 'Warnings':
                        logging.debug('Warnings=%s' % s1[1])
                    elif skey == 'Published':
                        self.storyPublished = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
                        logging.debug('self.storyPublished=%s' % self.storyPublished)
                    elif skey == 'Updated':
                        self.storyUpdated = datetime.datetime.fromtimestamp(time.mktime(time.strptime(s1[1].strip(' '), "%Y/%m/%d")))
                        logging.debug('self.storyUpdated=%s' % self.storyUpdated)
                else:
                    # No ':' in this field -- presumably the 'NNN words' count.
                    s3 = re.split ('<[^>]+>', s1[0])
                    #logging.debug('meta.s.s1.s3=%s' % s3)
                    if len(s3) > 1:
                        s1[0] = s3[0]
                    s4 = s1[0].split('w')
                    #logging.debug('meta.s.s1.s4=%s' % s4)
                    if len(s4) > 1 and s4[1] == 'ords':
                        self.numWords = s4[0]
                        logging.debug('self.numWords=%s' % self.numWords)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        result = []
        ii = 1

        # Chapter discovery uses the page the caller originally passed in:
        # story index pages list chapters in ul#storylist, chapter pages
        # carry a 'goto' dropdown instead.
        if oldurl is not None and len(oldurl) > 0:
            data = u2.urlopen(oldurl).read()
            soup = bs.BeautifulStoneSoup(data)

        storylist = soup.find('ul', {'id' : 'storylist'})
        if storylist is not None:
            allH4s = storylist.findAll('h4')
            #logging.debug('allH4s=%s' % allH4s)

            if allH4s is not None:
                for h4 in allH4s:
                    chapterinfo = h4.find('a')
                    #logging.debug('Chapter1=%s' % chapterinfo)
                    url = 'http://' + self.host + chapterinfo['href']
                    title = chapterinfo.string.strip()
                    #logging.debug('Chapter=%s, %s' % (url, title))
                    # ficwad includes 'Story Index' in the dropdown of chapters,
                    # but it's not a real chapter.
                    if title != "Story Index":
                        logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
                        result.append((url,title))
                        ii = ii+1
                    else:
                        logging.debug('Skipping Story Index. URL %s' % url)

        if ii == 1:
            # Nothing found above: either a one-shot or a chapter page.
            select = soup.find('select', { 'name' : 'goto' } )

            if select is None:
                # One-shot story: the page itself is the only chapter.
                result.append((self.url,self.storyName))
                logging.debug('Chapter[%s]=%s %s' % (ii, self.url, self.storyName))
            else:
                # Chapter page: the 'goto' dropdown lists every chapter.
                allOptions = select.findAll('option')
                for o in allOptions:
                    url = 'http://' + self.host + o['value']
                    title = o.string
                    # ficwad includes 'Story Index' in the dropdown of chapters,
                    # but it's not a real chapter.
                    if title != "Story Index":
                        logging.debug('Chapter[%s]=%s, %s' % (ii, url, title))
                        result.append((url,title))
                        ii = ii+1
                    else:
                        logging.debug('Skipping Story Index. URL %s' % url)

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s' % self.uuid)

        return result
|
|
|
|
def getStoryName(self):
|
|
return self.storyName
|
|
|
|
def getOutputName(self):
|
|
return self.outputName
|
|
|
|
def getAuthorName(self):
|
|
return self.authorName
|
|
|
|
    def getText(self, url):
        """Fetch one chapter (absolute URL or site-relative path) and return
        the contents of its div#storytext as an HTML string."""
        # Accept relative hrefs as scraped from the chapter list.
        if url.find('http://') == -1:
            url = 'http://' + self.host + '/' + url

        data = u2.urlopen(url).read()

        soup = bs.BeautifulStoneSoup(data)
        div = soup.find('div', {'id' : 'storytext'})
        if None == div:
            logging.error("Error downloading Chapter: %s" % url)
            exit(20)
            # NOTE(review): unreachable -- exit(20) terminates the process
            # first; presumably '<html/>' was meant as a soft fallback.
            return '<html/>'
        # BeautifulSoup's __str__ accepts an output-encoding argument.
        return div.__str__('utf8')
|
|
|
|
def getStoryURL(self):
|
|
logging.debug('self.url=%s' % self.url)
|
|
return self.url
|
|
|
|
def getAuthorURL(self):
|
|
logging.debug('self.authorURL=%s' % self.authorURL)
|
|
return self.authorURL
|
|
|
|
def getUUID(self):
|
|
logging.debug('self.uuid=%s' % self.uuid)
|
|
return self.uuid
|
|
|
|
def getAuthorId(self):
|
|
logging.debug('self.authorId=%s' % self.authorId)
|
|
return self.authorId
|
|
|
|
def getStoryId(self):
|
|
logging.debug('self.storyId=%s' % self.storyId)
|
|
return self.storyId
|
|
|
|
def getStoryDescription(self):
|
|
logging.debug('self.storyDescription=%s' % self.storyDescription)
|
|
return self.storyDescription
|
|
|
|
def getStoryPublished(self):
|
|
logging.debug('self.storyPublished=%s' % self.storyPublished)
|
|
return self.storyPublished
|
|
|
|
def getStoryCreated(self):
|
|
self.storyCreated = datetime.datetime.now()
|
|
logging.debug('self.storyCreated=%s' % self.storyCreated)
|
|
return self.storyCreated
|
|
|
|
def getStoryUpdated(self):
|
|
logging.debug('self.storyUpdated=%s' % self.storyUpdated)
|
|
return self.storyUpdated
|
|
|
|
def getLanguage(self):
|
|
logging.debug('self.language=%s' % self.language)
|
|
return self.language
|
|
|
|
def getLanguageId(self):
|
|
logging.debug('self.languageId=%s' % self.languageId)
|
|
return self.languageId
|
|
|
|
def getSubjects(self):
|
|
logging.debug('self.subjects=%s' % self.authorName)
|
|
return self.subjects
|
|
|
|
def getPublisher(self):
|
|
logging.debug('self.publisher=%s' % self.publisher)
|
|
return self.publisher
|
|
|
|
def getNumChapters(self):
|
|
logging.debug('self.numChapters=%s' % self.numChapters)
|
|
return self.numChapters
|
|
|
|
def getNumWords(self):
|
|
logging.debug('self.numWords=%s' % self.numWords)
|
|
return self.numWords
|
|
|
|
def getCategory(self):
|
|
logging.debug('self.category=%s' % self.category)
|
|
return self.category
|
|
|
|
def getGenre(self):
|
|
logging.debug('self.genre=%s' % self.genre)
|
|
return self.genre
|
|
|
|
def getStoryStatus(self):
|
|
logging.debug('self.storyStatus=%s' % self.storyStatus)
|
|
return self.storyStatus
|
|
|
|
def getStoryRating(self):
|
|
logging.debug('self.storyRating=%s' % self.storyRating)
|
|
return self.storyRating
|
|
|
|
def getStoryUserRating(self):
|
|
logging.debug('self.storyUserRating=%s' % self.storyUserRating)
|
|
return self.storyUserRating
|
|
|
|
def getPrintableUrl(self, url):
|
|
return url
|
|
|
|
def getPasswordLine(self):
|
|
return 'opaopapassword'
|
|
|
|
def getLoginScript(self):
|
|
return 'opaopaloginscript'
|
|
|
|
def getLoginPasswordOthers(self):
|
|
login = dict(login = 'name', password = 'pass')
|
|
other = dict(submit = 'Log In', remember='yes')
|
|
return (login, other)
|
|
|
|
def getStoryCharacters(self):
|
|
logging.debug('self.storyCharacters=%s' % self.storyCharacters)
|
|
return self.storyCharacters
|
|
|
|
def getStorySeries(self):
|
|
logging.debug('self.storySeries=%s' % self.storySeries)
|
|
return self.storySeries
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Smoke test: scrape a known story, print its chapter list, then the
    # text of the first chapter.
    #
    # Bug fix: the old code fetched the page bytes itself and passed them
    # to fw.getText(), which expects a chapter URL, not HTML content (it
    # calls urlopen on its argument).  It also left unused 'data'/'host'
    # locals behind.  We now feed getText() the first extracted chapter URL.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        print(fw.getText(urls[0][0]))
|