mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-04-27 17:38:19 +02:00
Modified ffnet.py to use the mangled story title if given instead of the name portion of the URL. As part of this, the Name and chapter number are now optional in the URL passed in.
371 lines
11 KiB
Python
371 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import shutil
|
|
import os.path
|
|
import urllib as u
|
|
import pprint as pp
|
|
import urllib2 as u2
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
import logging
|
|
import time
|
|
import datetime
|
|
|
|
from adapter import *
|
|
|
|
class FicWad(FanfictionSiteAdapter):
    """Site adapter that scrapes story metadata and chapter text from FicWad.

    The constructor only stores ``url`` and ``host``; every other metadata
    attribute returned by the getters is filled in by
    extractIndividualUrls(), which therefore has to be called first.
    """

    def __init__(self, url):
        """Remember the story (or chapter) URL and derive the host from it."""
        self.url = url
        self.host = up.urlparse(url).netloc

    def requiresLogin(self, url=None):
        """Return False: this adapter never asks for a login."""
        return False

    def performLogin(self, url=None):
        """Do nothing; present only to satisfy the adapter interface."""
        pass

    def setLogin(self, login):
        """Store the login name on the instance."""
        self.login = login

    def setPassword(self, password):
        """Store the password on the instance."""
        self.password = password

    def _addSubject(self, subject):
        """Append *subject* to self.subjects unless already present.

        The comparison is case-insensitive. Returns True when the subject
        was appended and False when it was a duplicate.
        """
        wanted = subject.upper()
        if any(known.upper() == wanted for known in self.subjects):
            return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append *character* to self.storyCharacters unless already present.

        The comparison is case-insensitive. Returns True when the character
        was appended and False when it was a duplicate.
        """
        wanted = character.upper()
        if any(known.upper() == wanted for known in self.storyCharacters):
            return False
        self.storyCharacters.append(character)
        return True

    def _fetchSoup(self, url):
        """Download *url* and return the page parsed by BeautifulStoneSoup."""
        response = u2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Close explicitly so the connection is not held until GC.
            response.close()
        return bs.BeautifulStoneSoup(data)

    def _resetMetadata(self):
        """Set every metadata attribute to its placeholder default."""
        self.storyDescription = 'Fanfiction Story'
        self.authorId = '0'
        self.storyId = '0'
        # Plain integers (no leading zero) so the literals are valid on both
        # Python 2 and Python 3.
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = ['fanfiction']
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'PG'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''

    def _parseDate(self, text):
        """Turn a 'YYYY/MM/DD' string into a datetime.datetime.

        Parsed directly rather than through a time.mktime() round trip,
        which depends on the local time zone and the platform's time_t range.
        """
        return datetime.datetime.strptime(text.strip(' '), "%Y/%m/%d")

    def _parseMeta(self, meta):
        """Fill metadata attributes from the ``<p class="meta">`` element.

        The element's markup is cut into ' - ' separated fields. Fields that
        contain a colon are handled as 'Key: value' pairs; a field without
        a colon is checked for the '<count>words' pattern.
        """
        fields = str(meta).replace('\n', ' ').replace('\t', '').split(' - ')
        logging.debug('meta.s=%s', fields)
        for field in fields:
            # Drop non-breaking-space markers only (the HTML entity or its
            # UTF-8 bytes). Ordinary spaces have to survive: the ', ' list
            # splits and the <a href=...> parsing below depend on them.
            cleaned = field.replace('&nbsp;', '').replace('\xc2\xa0', '')
            parts = cleaned.split(':')

            if len(parts) <= 1:
                # No colon: only the '<count>words' field is of interest.
                # Keep the text in front of the first tag, if there is one.
                text = re.split('<[^>]+>', parts[0])[0].strip()
                pieces = text.split('w')
                if len(pieces) > 1 and pieces[1] == 'ords':
                    self.numWords = pieces[0].strip()
                    logging.debug('self.numWords=%s', self.numWords)
                continue

            # The first field still carries the opening <p ...> tag; the key
            # is whatever follows the first tag.
            untagged = re.split('<[^>]+>', parts[0])
            key = (untagged[1] if len(untagged) > 1 else parts[0]).strip()
            value = parts[1].strip()

            if key == 'Category':
                for link in bs.BeautifulStoneSoup(value).findAll('a'):
                    # Only the first link replaces the placeholder category.
                    if self.category == 'Category':
                        self.category = str(link.string)
                        logging.debug('self.category=%s', self.category)
                    self._addSubject(self.category)
                logging.debug('self.subjects=%s', self.subjects)
            elif key == 'Rating':
                self.storyRating = value
                logging.debug('self.storyRating=%s', self.storyRating)
            elif key == 'Genres':
                self.genre = value
                logging.debug('self.genre=%s', self.genre)
                for genre in value.split(', '):
                    self._addSubject(genre)
                logging.debug('self.subjects=%s', self.subjects)
            elif key == 'Characters':
                for character in value.split(', '):
                    self._addCharacter(character)
                logging.debug('self.storyCharacters=%s', self.storyCharacters)
            elif key == 'Chapters':
                # Kept as the scraped string, as before.
                self.numChapters = value
                logging.debug('self.numChapters=%s', self.numChapters)
            elif key == 'Warnings':
                logging.debug('Warnings=%s', value)
            elif key == 'Published':
                self.storyPublished = self._parseDate(value)
                logging.debug('self.storyPublished=%s', self.storyPublished)
            elif key == 'Updated':
                self.storyUpdated = self._parseDate(value)
                logging.debug('self.storyUpdated=%s', self.storyUpdated)

    def _appendChapter(self, chapters, url, title):
        """Add (url, title) to *chapters* unless it is the 'Story Index' entry."""
        # ficwad includes 'Story Index' in the dropdown of chapters,
        # but it's not a real chapter.
        if title == "Story Index":
            logging.debug('Skipping Story Index. URL %s', url)
            return
        logging.debug('Chapter[%s]=%s, %s', len(chapters) + 1, url, title)
        chapters.append((url, title))

    def _collectChapters(self, soup):
        """Build the list of (url, title) chapter tuples from a parsed page.

        The ``ul#storylist`` headings are tried first. When they yield no
        chapter the ``select[name=goto]`` drop-down is used, and when that is
        missing as well the story is treated as one chapter at self.url.
        """
        chapters = []

        storylist = soup.find('ul', {'id': 'storylist'})
        if storylist is not None:
            for heading in storylist.findAll('h4'):
                link = heading.find('a')
                self._appendChapter(chapters,
                                    'http://' + self.host + link['href'],
                                    link.string.strip())
        if chapters:
            return chapters

        select = soup.find('select', {'name': 'goto'})
        if select is None:
            chapters.append((self.url, self.storyName))
            logging.debug('Chapter[%s]=%s %s', 1, self.url, self.storyName)
            return chapters

        for option in select.findAll('option'):
            self._appendChapter(chapters,
                                'http://' + self.host + option['value'],
                                option.string)
        return chapters

    def extractIndividualUrls(self):
        """Scrape the story's metadata and return its chapter list.

        Resets all metadata to defaults, downloads self.url and, when that
        page turns out to be a chapter page, switches self.url to the story
        title page and reads the metadata from there. The chapter list is
        then taken from the page originally requested.

        Returns a list of (url, title) tuples, one per chapter.
        """
        self._resetMetadata()
        oldurl = ''

        soup = self._fetchSoup(self.url)
        story = soup.find('div', {'id': 'story'})
        crumbtrail = story.find('h3')  # the only h3 ficwad uses.
        # The last breadcrumb link identifies what kind of page this is.
        storyinfo = crumbtrail.findAll('a')[-1]
        (u0, u1, storyid) = storyinfo['href'].split('/')
        if u1 == "story":
            # This page does not have the correct information on it.
            # Need to get the Story Title Page.
            logging.debug('URL %s is a chapter URL. Getting Title Page http://%s/%s/%s.',
                          self.url, self.host, u1, storyid)
            oldurl = self.url
            self.url = 'http://' + self.host + '/' + u1 + '/' + storyid
            soup = self._fetchSoup(self.url)
            story = soup.find('div', {'id': 'story'})

        # save chapter name from header in case of one-shot.
        storyinfo = story.find('h4').find('a')
        (u0, u1, self.storyId) = storyinfo['href'].split('/')
        self.storyName = storyinfo.string.strip()
        self.outputName = self.storyName.replace(" ", "_") + '-fw_' + self.storyId
        logging.debug('self.storyName=%s, self.storyId=%s, self.outputName=%s',
                      self.storyName, self.storyId, self.outputName)

        author = soup.find('span', {'class': 'author'})
        self.authorName = str(author.a.string)
        (u0, u1, self.authorId) = author.a['href'].split('/')
        self.authorURL = 'http://' + self.host + author.a['href']
        logging.debug('self.authorName=%s self.authorId=%s',
                      self.authorName, self.authorId)

        description = soup.find('blockquote', {'class': 'summary'})
        if description is not None:
            self.storyDescription = str(description.p.string)
            logging.debug('self.storyDescription=%s', self.storyDescription)

        meta = soup.find('p', {'class': 'meta'})
        if meta is not None:
            self._parseMeta(meta)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        if oldurl:
            # Go back to the page originally requested for the chapter list.
            soup = self._fetchSoup(oldurl)
        result = self._collectChapters(soup)

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s', self.uuid)

        return result

    def getHost(self):
        """Return the host name taken from the URL."""
        logging.debug('self.host=%s', self.host)
        return self.host

    def getStoryName(self):
        """Return the story title."""
        return self.storyName

    def getOutputName(self):
        """Return the output name built from the title and story id."""
        return self.outputName

    def getAuthorName(self):
        """Return the author's name."""
        return self.authorName

    def getText(self, url):
        """Download a chapter and return its ``div#storytext`` as UTF-8 markup.

        A *url* without an http/https scheme is treated as a path on
        self.host. When the page has no story text the error is logged and
        the process exits with status 20.
        """
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + self.host + '/' + url.lstrip('/')

        soup = self._fetchSoup(url)
        div = soup.find('div', {'id': 'storytext'})
        if div is None:
            logging.error("Error downloading Chapter: %s", url)
            # sys.exit rather than the bare exit() helper, which only exists
            # when the site module has been loaded.
            sys.exit(20)
        return div.__str__('utf8')

    def getStoryURL(self):
        """Return the story URL (the title page once extraction has run)."""
        logging.debug('self.url=%s', self.url)
        return self.url

    def getAuthorURL(self):
        """Return the absolute URL of the author's page."""
        logging.debug('self.authorURL=%s', self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the urn:uuid identifier built from host, author and story ids."""
        logging.debug('self.uuid=%s', self.uuid)
        return self.uuid

    def getAuthorId(self):
        """Return the author id."""
        logging.debug('self.authorId=%s', self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id."""
        logging.debug('self.storyId=%s', self.storyId)
        return self.storyId

    def getStoryDescription(self):
        """Return the story summary."""
        logging.debug('self.storyDescription=%s', self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the publication date."""
        logging.debug('self.storyPublished=%s', self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Reset storyCreated to the current time and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s', self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the last-updated date."""
        logging.debug('self.storyUpdated=%s', self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s', self.language)
        return self.language

    def getLanguageId(self):
        """Return the language identifier."""
        logging.debug('self.languageId=%s', self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects collected for the story."""
        # Log the subjects themselves (the author name used to be logged here).
        logging.debug('self.subjects=%s', self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher, which is the site host."""
        logging.debug('self.publisher=%s', self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the chapter count (0 by default, else the scraped string)."""
        logging.debug('self.numChapters=%s', self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count (0 by default, else the scraped string)."""
        logging.debug('self.numWords=%s', self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the story category."""
        logging.debug('self.category=%s', self.category)
        return self.category

    def getGenre(self):
        """Return the genre string."""
        logging.debug('self.genre=%s', self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status."""
        logging.debug('self.storyStatus=%s', self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating."""
        logging.debug('self.storyRating=%s', self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s', self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return *url* unchanged."""
        return url

    def getPasswordLine(self):
        """Return the fixed placeholder string for the password line."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return the fixed placeholder string for the login script."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return the (login, other) pair of form-field dictionaries."""
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        """Return the list of characters collected for the story."""
        logging.debug('self.storyCharacters=%s', self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name (empty string by default)."""
        logging.debug('self.storySeries=%s', self.storySeries)
        return self.storySeries
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story and print the
    # text of the first one.
    url = 'http://www.ficwad.com/story/14536'
    fw = FicWad(url)
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    # getText() takes a chapter URL; it used to be handed the downloaded
    # page contents, which it would have tried to open as a URL.
    if urls:
        print(fw.getText(urls[0][0]))
|