mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-07 17:33:05 +01:00
Modified ffnet.py to use the mangled story title if given instead of the name portion of the URL. As part of this, the Name and chapter number are now optional in the URL passed in.
381 lines
12 KiB
Python
381 lines
12 KiB
Python
import os
|
|
import re
|
|
import sys
|
|
import shutil
|
|
import logging
|
|
import os.path
|
|
import urllib as u
|
|
import pprint as pp
|
|
import urllib2 as u2
|
|
import cookielib as cl
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
import time as time
|
|
import datetime
|
|
from adapter import *
|
|
|
|
|
|
class FictionAlley(FanfictionSiteAdapter):
    """Site adapter that downloads stories from fictionalley.org.

    Typical use: construct with a story URL, call extractIndividualUrls()
    to load the story metadata and obtain the chapter list, then call
    getText() once per chapter URL.
    """

    # Placeholder element name substituted for FictionAlley's HTML comment
    # markers so that a marked region can be looked up as a single element.
    # Yes, it's an evil kludge, but using something other than div prevents
    # soup from pairing our element with poor html inside the story text.
    _MARKER_TAG = 'crazytagstringnobodywouldstumbleonaccidently'

    def __init__(self, url):
        """Parse the story URL, build the HTTP opener and set metadata defaults.

        url -- address of the story (or chapter) page.  When its path looks
               like /authors/<author>/<story>.html the author id, author URL
               and story id are taken from it; otherwise they stay empty.
        """
        self.url = url
        parsedUrl = up.urlparse(url)
        self.host = parsedUrl.netloc
        self.path = parsedUrl.path

        logging.debug('self.host=%s', self.host)
        logging.debug('self.path=%s', self.path)

        cookieproc = u2.HTTPCookieProcessor()

        # FictionAlley wants a cookie to prove you're old enough to read R+
        # rated stuff.  It is a session cookie (no expiry): with a fixed
        # expiry a short time after construction, the jar would stop sending
        # it part-way through a long download.
        cookie = cl.Cookie(version=0, name='fauser', value='wizard',
                           port=None, port_specified=False,
                           domain='www.fictionalley.org',
                           domain_specified=False, domain_initial_dot=False,
                           path='/authors', path_specified=True,
                           secure=False,
                           expires=None,
                           discard=True,
                           comment=None,
                           comment_url=None,
                           rest={'HttpOnly': None},
                           rfc2109=False)
        cookieproc.cookiejar.set_cookie(cookie)
        self.opener = u2.build_opener(cookieproc)

        # Filled in by extractIndividualUrls(); defaulted here so the getters
        # are safe to call before extraction has run.
        self.storyName = ''
        self.authorName = ''
        self.outputName = ''
        self.uuid = ''

        self.storyDescription = 'Fanfiction Story'
        self.authorId = ''
        self.authorURL = ''
        self.storyId = ''
        pathParts = self.path.split('/')
        if len(pathParts) > 2 and pathParts[1] == 'authors':
            self.authorId = pathParts[2]
            self.authorURL = 'http://' + self.host + '/authors/' + self.authorId
            if len(pathParts) > 3:
                self.storyId = pathParts[3].replace('.html', '')

        # Placeholder dates, replaced when the chapter list carries
        # 'Posted:' timestamps.
        self.storyPublished = datetime.date(1970, 1, 31)
        self.storyCreated = datetime.datetime.now()
        self.storyUpdated = datetime.date(1970, 1, 31)
        self.languageId = 'en-UK'
        self.language = 'English'
        self.subjects = []
        self.subjects.append('fanfiction')
        self.publisher = self.host
        self.numChapters = 0
        self.numWords = 0
        self.genre = 'FanFiction'
        self.category = 'Category'
        self.storyStatus = 'In-Progress'
        self.storyRating = 'K'
        self.storyUserRating = '0'
        self.storyCharacters = []
        self.storySeries = ''

    def requiresLogin(self, url=None):
        """Return False: this adapter never asks for a login."""
        return False

    def performLogin(self, url=None):
        """Do nothing; no login is performed for this site."""
        pass

    def setLogin(self, login):
        """Store the login name on the adapter."""
        self.login = login

    def setPassword(self, password):
        """Store the password on the adapter."""
        self.password = password

    def _addSubject(self, subject):
        """Append subject unless it is already present (ignoring case).

        Returns True when the subject was added, False when it was a duplicate.
        """
        wanted = subject.upper()
        if any(existing.upper() == wanted for existing in self.subjects):
            return False
        self.subjects.append(subject)
        return True

    def _addCharacter(self, character):
        """Append character unless it is already present (ignoring case).

        Returns True when the character was added, False when it was a duplicate.
        """
        wanted = character.upper()
        if any(existing.upper() == wanted for existing in self.storyCharacters):
            return False
        self.storyCharacters.append(character)
        return True

    def _wrapRegion(self, data, startMarker, endMarker, regionId):
        """Return data with a pair of marker strings turned into one element.

        startMarker becomes the opening placeholder tag carrying id=regionId
        and endMarker becomes the matching closing tag.
        """
        openTag = '<%s id="%s">' % (self._MARKER_TAG, regionId)
        closeTag = '</%s>' % self._MARKER_TAG
        return data.replace(startMarker, openTag).replace(endMarker, closeTag)

    def _findRegion(self, soup, regionId):
        """Return the placeholder element with the given id, or None."""
        return soup.find(self._MARKER_TAG, {'id': regionId})

    def _soupWithHeaders(self, data):
        """Parse page data with its header region wrapped as 'storyheaders'."""
        wrapped = self._wrapRegion(data, '<!-- headerstart -->',
                                   '<!-- headerend -->', 'storyheaders')
        return bs.BeautifulStoneSoup(wrapped)

    def _processChapterHeaders(self, div):
        """Read rating, genre, characters and summary from a header region.

        div -- the parsed 'storyheaders' element.  Each <br> inside it that
               has more than two children is treated as a "key, value" pair:
               child 1 is the key (markup stripped) and child 2 the value.
        """
        for br in div.findAll('br'):
            if len(br.contents) <= 2:
                continue
            # Drop any tags wrapped around the key text, e.g. bold markup.
            keystr = re.sub("<[^>]+>", "", str(br.contents[1]))
            valstr = br.contents[2].strip(' ')

            if keystr == 'Rating:':
                self.storyRating = valstr
                logging.debug('self.storyRating=%s', self.storyRating)
            elif keystr == 'Genre:':
                self.genre = valstr
                logging.debug('self.genre=%s', self.genre)
                for genre in valstr.split(', '):
                    self._addSubject(genre)
                logging.debug('self.subjects=%s', self.subjects)
            elif keystr == 'Main Character(s):':
                for character in valstr.split(', '):
                    self._addCharacter(character)
                logging.debug('self.storyCharacters=%s', self.storyCharacters)
            elif keystr == 'Summary:':
                self.storyDescription = valstr
                logging.debug('self.storyDescription=%s', self.storyDescription)

    def _extractSingleChapter(self, soup):
        """Handle a page without a chapter list; return its one-entry list.

        Be aware that this means that the user has entered the
        {STORY}01.html.  We will not have valid Published and Updated dates.
        User should enter the {STORY}.html instead.  We should force that
        instead of this.
        """
        breadcrumbs = soup.find('div', {'class': 'breadcrumbs'})
        self.authorName = breadcrumbs.a.string.replace("'s Fics", "")
        self.numChapters = self.numChapters + 1
        headers = self._findRegion(soup, 'storyheaders')
        if headers is not None:
            self._processChapterHeaders(headers)
        return [(self.url, self.storyName)]

    def _extractIndexSummary(self, soup):
        """Read author name, rating and description from a story index page."""
        author = soup.find('h1', {'class': 'title'})
        self.authorName = author.a.string

        parts = soup.find('div', {'class': 'summary'}).contents
        if len(parts) > 1:
            rating = parts[0].split(': ')
            if len(rating) > 1 and rating[0] == 'Rating':
                self.storyRating = rating[1]
                logging.debug('self.storyRating=%s', self.storyRating)
            self.storyDescription = str(parts[1]).replace("<br>", "").replace("</br>", "").replace('\n', '')
            logging.debug('self.storyDescription=%s', self.storyDescription)

    def _processPostedDate(self, text):
        """Update published/updated dates from a 'Posted: <date> <time>' note.

        text -- the text following a chapter link.  Text that does not start
                with 'Posted:' is ignored.
        """
        words = text.replace('\n', '').replace('(', '').split(' ')
        if len(words) > 2 and words[0] == 'Posted:':
            datestr = words[1] + ' ' + words[2]
            # Parse straight into a datetime; no round trip through a local
            # timestamp, so the result does not depend on the machine's DST.
            posted = datetime.datetime.strptime(datestr.strip(' '), "%Y-%m-%d %H:%M:%S")
            if self.numChapters == 1:
                self.storyPublished = posted
            # Overwritten for every dated chapter, so the last one wins.
            self.storyUpdated = posted
            logging.debug('self.storyPublished=%s, self.storyUpdated=%s',
                          self.storyPublished, self.storyUpdated)

    def _extractChapterList(self, links):
        """Walk the <li> entries of an index page; return (url, title) pairs.

        The first chapter page is fetched once so its header region can be
        read for rating, genre, characters and summary.
        """
        result = []
        for li in links:
            anchor = li.find('a', {'class': 'chapterlink'})
            if anchor is None:
                logging.debug('li chapterlink not found! li=%s', li)
                continue

            # fictionalley uses full URLs in chapter list.
            url = anchor['href']
            result.append((url, anchor.string))

            if self.numChapters == 0:
                firstChapter = self._soupWithHeaders(self.opener.open(url).read())
                headers = self._findRegion(firstChapter, 'storyheaders')
                if headers is not None:
                    self._processChapterHeaders(headers)

            self.numChapters = self.numChapters + 1
            if len(li.contents) > 1:
                self._processPostedDate(li.contents[1])
        return result

    def extractIndividualUrls(self):
        """Fetch the story page, load its metadata and list its chapters.

        Sets storyName, outputName, authorName, numChapters and uuid, plus
        whatever rating, description, genre, subjects, characters and dates
        the pages provide, and prints a 'Story "..." by ...' line.

        Returns a list of (url, title) tuples, one per chapter.
        """
        data = self.opener.open(self.url).read()

        # There is some useful information in the headers of the first
        # chapter page.
        soup = self._soupWithHeaders(data)

        # Get title from <title>, remove before '-'.
        title = soup.find('title').string
        self.storyName = "-".join(title.split('-')[1:]).strip().replace(" (Story Text)", "")
        self.outputName = self.storyName.replace(" ", "_") + '-fa_' + self.storyId

        links = soup.findAll('li')

        # If it is decided that we really do care about number of words..
        # It's only available on the author's page (self.authorURL).

        self.numChapters = 0
        if len(links) == 0:
            result = self._extractSingleChapter(soup)
        else:
            self._extractIndexSummary(soup)
            result = self._extractChapterList(links)

        print('Story "%s" by %s' % (self.storyName, self.authorName))

        self.uuid = 'urn:uuid:' + self.host + '-u.' + self.authorId + '-s.' + self.storyId
        logging.debug('self.uuid=%s', self.uuid)

        return result

    def getHost(self):
        """Return the host part of the story URL."""
        logging.debug('self.host=%s', self.host)
        return self.host

    def getStoryName(self):
        """Return the story title."""
        return self.storyName

    def getAuthorName(self):
        """Return the author's display name."""
        return self.authorName

    def getOutputName(self):
        """Return the name built from the story title and story id."""
        return self.outputName

    def getText(self, url):
        """Download one chapter and return its story text as UTF-8 markup.

        url -- full chapter URL (fictionalley uses full URLs in chapter list).

        The text between the '<!-- headerend -->' and '<!-- footerstart -->'
        markers is wrapped in a placeholder element so it can be found after
        parsing.  If the parsed page holds more than one <html> element the
        second one is returned; otherwise the wrapped region is returned
        with the placeholder renamed to div.

        Raises SystemExit(20) when the story text region cannot be found.
        """
        data = self.opener.open(url).read()
        wrapped = self._wrapRegion(data, '<!-- headerend -->',
                                   '<!-- footerstart -->', 'storytext')
        soup = bs.BeautifulStoneSoup(wrapped)

        div = self._findRegion(soup, 'storytext')
        if div is None:
            logging.error("Error downloading Chapter: %s", url)
            # Same exit status as before, but through sys.exit(): the bare
            # exit() name is installed by the site module and is not
            # guaranteed to exist.
            sys.exit(20)

        html = soup.findAll('html')
        if len(html) > 1:
            return html[1].__str__('utf8')
        return div.__str__('utf8').replace(self._MARKER_TAG, 'div')

    def getStoryURL(self):
        """Return the URL the adapter was constructed with."""
        logging.debug('self.url=%s', self.url)
        return self.url

    def getAuthorURL(self):
        """Return the author's page URL ('' if not derivable from the URL)."""
        logging.debug('self.authorURL=%s', self.authorURL)
        return self.authorURL

    def getUUID(self):
        """Return the urn:uuid identifier built from host, author and story."""
        logging.debug('self.uuid=%s', self.uuid)
        return self.uuid

    def getAuthorId(self):
        """Return the author id taken from the URL path."""
        logging.debug('self.authorId=%s', self.authorId)
        return self.authorId

    def getStoryId(self):
        """Return the story id taken from the URL path."""
        logging.debug('self.storyId=%s', self.storyId)
        return self.storyId

    def getStoryDescription(self):
        """Return the story summary."""
        logging.debug('self.storyDescription=%s', self.storyDescription)
        return self.storyDescription

    def getStoryPublished(self):
        """Return the date of the first chapter (placeholder if unknown)."""
        logging.debug('self.storyPublished=%s', self.storyPublished)
        return self.storyPublished

    def getStoryCreated(self):
        """Reset the creation time to now and return it."""
        self.storyCreated = datetime.datetime.now()
        logging.debug('self.storyCreated=%s', self.storyCreated)
        return self.storyCreated

    def getStoryUpdated(self):
        """Return the date of the last dated chapter (placeholder if unknown)."""
        logging.debug('self.storyUpdated=%s', self.storyUpdated)
        return self.storyUpdated

    def getLanguage(self):
        """Return the language name."""
        logging.debug('self.language=%s', self.language)
        return self.language

    def getLanguageId(self):
        """Return the language identifier."""
        logging.debug('self.languageId=%s', self.languageId)
        return self.languageId

    def getSubjects(self):
        """Return the list of subjects collected for the story."""
        logging.debug('self.subjects=%s', self.subjects)
        return self.subjects

    def getPublisher(self):
        """Return the publisher, which is the story's host name."""
        logging.debug('self.publisher=%s', self.publisher)
        return self.publisher

    def getNumChapters(self):
        """Return the number of chapters counted during extraction."""
        logging.debug('self.numChapters=%s', self.numChapters)
        return self.numChapters

    def getNumWords(self):
        """Return the word count (never updated from its initial 0 here)."""
        logging.debug('self.numWords=%s', self.numWords)
        return self.numWords

    def getCategory(self):
        """Return the story category."""
        logging.debug('self.category=%s', self.category)
        return self.category

    def getGenre(self):
        """Return the story genre."""
        logging.debug('self.genre=%s', self.genre)
        return self.genre

    def getStoryStatus(self):
        """Return the story status."""
        logging.debug('self.storyStatus=%s', self.storyStatus)
        return self.storyStatus

    def getStoryRating(self):
        """Return the story rating."""
        logging.debug('self.storyRating=%s', self.storyRating)
        return self.storyRating

    def getStoryUserRating(self):
        """Return the user rating."""
        logging.debug('self.storyUserRating=%s', self.storyUserRating)
        return self.storyUserRating

    def getPrintableUrl(self, url):
        """Return url unchanged."""
        return url

    def getPasswordLine(self):
        """Return the fixed placeholder string 'opaopapassword'."""
        return 'opaopapassword'

    def getLoginScript(self):
        """Return the fixed placeholder string 'opaopaloginscript'."""
        return 'opaopaloginscript'

    def getLoginPasswordOthers(self):
        """Return a (login, other) pair of dicts of login form values."""
        login = dict(login='name', password='pass')
        other = dict(submit='Log In', remember='yes')
        return (login, other)

    def getStoryCharacters(self):
        """Return the list of main characters collected for the story."""
        logging.debug('self.storyCharacters=%s', self.storyCharacters)
        return self.storyCharacters

    def getStorySeries(self):
        """Return the series name."""
        logging.debug('self.storySeries=%s', self.storySeries)
        return self.storySeries
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: list the chapters of a sample story and print the
    # text of the first one.  Requires network access to fictionalley.org.
    url = 'http://www.fictionalley.org/authors/drt/DA.html'
    fw = FictionAlley(url)
    # The adapter fetches its own pages; extractIndividualUrls() takes no
    # arguments and returns (chapter_url, title) tuples.
    urls = fw.extractIndividualUrls()
    pp.pprint(urls)
    if urls:
        # getText() expects a chapter URL, not page data.
        print(fw.getText(urls[0][0]))
|