FFA changed, architecture changed, not quite ready yet

sigizmund committed 2009-12-17 11:26:55 +00:00
parent a9748061f0
commit 6ef95c634d
5 changed files with 174 additions and 151 deletions

@@ -83,7 +83,7 @@ TOC_END = '''</navMap>
</ncx>
'''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>

@@ -28,7 +28,10 @@ class FanficLoader:
urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
i = 0
for u,n in urls:
print('Downloading chapter %d/%d' % (i, len(urls)))
i = i+1
text = self.adapter.getText(u)
self.writer.writeChapter(n, text)
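The hunk above implies a small writer contract: a constructor taking (books directory, story name, author name) and a writeChapter(title, text) method. A minimal stub satisfying that contract, as a hedged illustration only (the commit's actual writer is EPubFanficWriter):

# Hypothetical writer stub for illustration; PlainTextWriter is not part of this commit.
import os

class PlainTextWriter:
    def __init__(self, directory, storyName, authorName):
        # Mirrors the (directory, story, author) constructor used by FanficLoader above.
        self.f = open(os.path.join(directory, storyName + '.txt'), 'w')
        self.f.write('%s by %s\n\n' % (storyName, authorName))

    def writeChapter(self, title, text):
        # One chapter per call, matching writer.writeChapter(n, text) in the loop above.
        self.f.write('== %s ==\n%s\n\n' % (title, text))

    def finalise(self):
        self.f.close()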

ffa.py (259 lines changed)

@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
@@ -6,182 +8,183 @@ import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import login_password
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from ficwad import *
class FFA:
storyName = None
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
def __init__(self):
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
logging.debug("Created FFA: url=%s" % (self.url))
def getPasswordLine(self):
return '<input type="password" name="pass"'
def _getLoginScript(self):
return self.path
def getLoginScript(self):
return '/scripts/login.php'
def requiresLogin(self, url = None):
resp = self.opener.open(self.url)
data = resp.read()
if data.find('<legend>Please login to continue</legend>') != -1:
return True
else:
return False
def getLoginPasswordOthers(self):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def performLogin(self, url = None):
if url == None:
url = self.url
def getPrintableUrl(self, url):
return url + '?print=yes'
data = {}
def _findIndex(self, lines, what, start):
for i in range(start, len(lines)):
if lines[i].find(what) != -1:
return i
return -1
data['username'] = self.login
data['password'] = self.password
data['submit'] = 'Submit'
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
if self.requiresLogin():
return False
else:
return True
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
soup = bs.BeautifulStoneSoup(data)
self.author = soup.find('a', {'href' : '/contact/'}).string
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
selector = soup.find('select', {'class' : 'tinput'})
options = selector.findAll('option')
urls = []
for o in options:
title = o.string
url = o['value']
urls.append((url,title))
return urls
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = self.opener.open(url).read()
def extractIndividualUrls(self, data, host, first, fetch = False):
lines = data.split('\n')
optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
emit = False
authorLines = filter(lambda x : x.find('pemail') != -1, lines)
for al in authorLines:
m = self.grabAuthor.match(al)
if m != None:
self.authorName = m.group(1)
break
post = ''
optionsLines = optionLines[:len(optionLines)/2]
storyName = first.split("/")[1]
result = []
urls = []
for line in optionLines:
m = self.grabUrl.match(line)
u = m.group(2)
if u.find('" selected="selected') != -1:
u = u.replace('" selected="selected', '')
if u in urls:
for l in lines:
if l.find('</div></form>') != -1:
logging.debug('emit = True')
emit = True
continue
elif l.find('<form action="#">') != -1:
logging.debug('emit = False')
if emit:
break
else:
urls.append(u)
emit = False
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))
if emit:
post = post + l + '\n'
self.soup = bs.BeautifulSoup(data)
titles = self.soup.findAll(name = 'title', recursive=True)
if len(titles) > 0:
title = titles[0]
print(title)
(website, rest) = title.string.split('::')
story_chapter = rest.split("-")
return post
story = story_chapter[0].strip()
self.storyName = story
def setLogin(self, login):
self.login = login
return result
def setPassword(self, password):
self.password = password
def getStoryName(self):
return self.storyName
def getAuthorName(self):
return self.authorName
return self.author
def getText(self, data, fetch = False):
lines = data.split('\n')
begin = self._findIndex(lines, '</select>', 0)+1
if begin == 0:
begin = self._findIndex(lines, '<div><p>', 24)
def getPrintableUrl(self, url):
return url
if begin == 0:
print('BAD start')
pp.pprint(lines)
sys.exit(1)
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
return "\n".join(lines[begin:end])
class FFA_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
class Downloader:
login = None
password = None
url = None
host = None
first = None
opener = None
def testRequiresLoginNeg(self):
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
self.assertFalse(f.requiresLogin())
writer = None
def testRequiresLogin(self):
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.assertTrue(f.requiresLogin())
def __init__(self, url, login, password):
self.login = login
self.password = password
self.url = url
def testPerformLogin(self):
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.infoProvider = FicWad() #FFA()
if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
parse = up.urlparse(url)
self.host = parse.scheme + '://' + parse.netloc
self.first = parse.path;
self.assertTrue(f.performLogin(None))
self.loginUrl = self.host + self.infoProvider.getLoginScript()
def testExtractURLsAuthorStoryName(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
f.extractIndividualUrls()
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.assertEquals('Draco664', f.getAuthorName())
self.assertEquals('Apprentice Potter', f.getStoryName())
def testExtractUrls(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
urls = f.extractIndividualUrls()
self.assertEquals(25, len(urls))
def _loginRequired(self):
print('is login required?')
resp = self.opener.open(self.url)
data = resp.read()
if data.find(self.infoProvider.getPasswordLine()) != -1:
print('yep')
return True
else:
print('nada')
return False
self.assertEquals('Grievances', urls[2][1])
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
def _login(self):
(login, data) = self.infoProvider.getLoginPasswordOthers()
def testGetText(self):
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data[login['login']] = self.login
data[login['password']] = self.password
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
urlvals = u.urlencode(data)
req = self.opener.open(self.loginUrl, urlvals)
def testGetTextLogin(self):
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
f = FFA(url)
if req.read().find(self.infoProvider.getPasswordLine()) != -1:
return False
else:
return True
if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
def _getContent(self, url):
print("<!-- Opening %s -->" % url)
return self.opener.open(url).read()
def download(self):
first = self._getContent(self.host + self.first)
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
for u,n in urls:
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
self.writer.writeChapter(n, text)
self.writer.finalise()
if f.requiresLogin():
f.performLogin()
data = f.getText(url)
seek = 'So Hokage-sama” I said, “this is how we came'
self.assertTrue(data.find(seek) != -1)
if __name__ == '__main__':
f = Downloader(sys.argv[1], 'sigizmund', '***************')
if f._loginRequired():
f._login()
f.download()
unittest.main()
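Read together, the new architecture in ffa.py is: Downloader holds the URL and credentials, delegates site-specific parsing to an info provider (FicWad() here, with FFA() left in a comment), and hands chapters to EPubFanficWriter. A sketch of the resulting control flow, assuming the module is run directly; this restates the __main__ block above rather than adding behaviour:

# Hypothetical driver mirroring the __main__ block above; credentials are placeholders.
import sys

def run(url, login, password):
    d = Downloader(url, login, password)
    if d._loginRequired():      # probe the story page for a password field
        d._login()              # POST the credentials to the provider's login script
    d.download()                # fetch every chapter URL, write it, finalise the EPUB

if __name__ == '__main__':
    run(sys.argv[1], 'user', 'secret')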

@@ -59,7 +59,6 @@ class FicWad:
return self.authorName
def getText(self, url):
print(type(url))
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url

@@ -1,11 +1,15 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import codecs
import shutil
import base64
import os.path
import zipfile
import urllib as u
import pprint as pp
import urllib2 as u2
@@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
from constants import *
import zipdir
class FanficWriter:
def __init__(self):
pass
@@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS')
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS
print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
def _removeEntities(self, text):
for e in entities:
@@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
def writeChapter(self, title, text):
fileName = base64.b64encode(title) + ".xhtml"
filePath = self.directory + "/OEBPS/" + fileName
f = open(filePath, 'w')
text = self._removeEntities(text)
@@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
# cleanup(self.soup )
text = self.soup.prettify()
print(text)
print >> f, XHTML_START % (title, title)
print >> f, text
f.write(text)
print >> f, XHTML_END
self.chapters.append((title, fileName))
def finalise(self):
print("Finalising...")
### writing table of contents -- ncx file
tocFilePath = self.directory + "/OEBPS/toc.ncx"
toc = open(tocFilePath, 'w')
print >> toc, TOC_START % self.storyTitle
print("Printing toc and refs")
### writing content -- opf file
opfFilePath = self.directory + "/OEBPS/content.opf"
opf = open(opfFilePath, 'w')
@@ -127,6 +138,8 @@ class EPubFanficWriter(FanficWriter):
i = i + 1
print('Toc and refs printed, proceeding to ref-ids....')
print >> toc, TOC_END
print >> opf, CONTENT_END_MANIFEST
@@ -134,3 +147,8 @@ class EPubFanficWriter(FanficWriter):
print >> opf, CONTENT_ITEMREF % chapterId
print >> opf, CONTENT_END
print('Finished')
filename = self.directory + '.epub'
zipdir.toZip(filename, self.directory)
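zipdir.toZip itself is not shown in this commit. As a hedged sketch only (not the project's actual module), such a helper typically walks the staging directory and zips it, writing the mimetype entry first and uncompressed, as the EPUB container format expects:

# Hypothetical zipdir.toZip; the real module is outside this diff.
import os
import zipfile

def toZip(zipFileName, directory):
    zf = zipfile.ZipFile(zipFileName, 'w', zipfile.ZIP_DEFLATED)
    mimetype = os.path.join(directory, 'mimetype')
    if os.path.exists(mimetype):
        # EPUB readers expect 'mimetype' as the first, uncompressed entry.
        zf.write(mimetype, 'mimetype', zipfile.ZIP_STORED)
    for root, dirs, files in os.walk(directory):
        for name in files:
            full = os.path.join(root, name)
            arcname = os.path.relpath(full, directory).replace(os.sep, '/')
            if arcname != 'mimetype':
                zf.write(full, arcname)
    zf.close()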