FFA changed, architecture changed, not quite ready yet

This commit is contained in:
sigizmund 2009-12-17 11:26:55 +00:00
parent a9748061f0
commit 6ef95c634d
5 changed files with 174 additions and 151 deletions

View file

@ -83,7 +83,7 @@ TOC_END = '''</navMap>
</ncx> </ncx>
''' '''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?> XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head> <head>

View file

@ -28,7 +28,10 @@ class FanficLoader:
urls = self.adapter.extractIndividualUrls() urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName()) self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
i = 0
for u,n in urls: for u,n in urls:
print('Downloading chapter %d/%d' % (i, len(urls)))
i = i+1
text = self.adapter.getText(u) text = self.adapter.getText(u)
self.writer.writeChapter(n, text) self.writer.writeChapter(n, text)

289
ffa.py
View file

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
import re import re
import sys import sys
@ -6,182 +8,183 @@ import uuid
import shutil import shutil
import base64 import base64
import os.path import os.path
import logging
import unittest
import urllib as u import urllib as u
import pprint as pp import pprint as pp
import urllib2 as u2 import urllib2 as u2
import login_password
import urlparse as up import urlparse as up
import BeautifulSoup as bs import BeautifulSoup as bs
import htmlentitydefs as hdefs import htmlentitydefs as hdefs
from constants import * from constants import *
from ficwad import *
class FFA: class FFA:
storyName = None def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
def __init__(self): logging.debug("Created FFA: url=%s" % (self.url))
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
def getPasswordLine(self): def _getLoginScript(self):
return '<input type="password" name="pass"' return self.path
def requiresLogin(self, url = None):
resp = self.opener.open(self.url)
data = resp.read()
if data.find('<legend>Please login to continue</legend>') != -1:
return True
else:
return False
def performLogin(self, url = None):
if url == None:
url = self.url
def getLoginScript(self): data = {}
return '/scripts/login.php'
def getLoginPasswordOthers(self): data['username'] = self.login
login = dict(login = 'name', password = 'pass') data['password'] = self.password
other = dict(submit = 'Log In', remember='yes') data['submit'] = 'Submit'
return (login, other)
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
if self.requiresLogin():
return False
else:
return True
def getPrintableUrl(self, url): def extractIndividualUrls(self):
return url + '?print=yes' data = self.opener.open(self.url).read()
soup = bs.BeautifulStoneSoup(data)
def _findIndex(self, lines, what, start):
for i in range(start, len(lines)): self.author = soup.find('a', {'href' : '/contact/'}).string
if lines[i].find(what) != -1: self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
return i
return -1 logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
def extractIndividualUrls(self, data, host, first, fetch = False): selector = soup.find('select', {'class' : 'tinput'})
options = selector.findAll('option')
urls = []
for o in options:
title = o.string
url = o['value']
urls.append((url,title))
return urls
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = self.opener.open(url).read()
lines = data.split('\n') lines = data.split('\n')
optionLines = filter(lambda x : x.find('<option value="') != -1, lines) emit = False
authorLines = filter(lambda x : x.find('pemail') != -1, lines) post = ''
for al in authorLines:
m = self.grabAuthor.match(al)
if m != None:
self.authorName = m.group(1)
break
optionsLines = optionLines[:len(optionLines)/2] for l in lines:
if l.find('</div></form>') != -1:
storyName = first.split("/")[1] logging.debug('emit = True')
emit = True
result = []
urls = []
for line in optionLines:
m = self.grabUrl.match(line)
u = m.group(2)
if u.find('" selected="selected') != -1:
u = u.replace('" selected="selected', '')
if u in urls:
continue continue
else: elif l.find('<form action="#">') != -1:
urls.append(u) logging.debug('emit = False')
if emit:
break
else:
emit = False
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3))) if emit:
post = post + l + '\n'
self.soup = bs.BeautifulSoup(data) return post
titles = self.soup.findAll(name = 'title', recursive=True)
if len(titles) > 0: def setLogin(self, login):
title = titles[0] self.login = login
print(title)
(website, rest) = title.string.split('::') def setPassword(self, password):
story_chapter = rest.split("-") self.password = password
story = story_chapter[0].strip()
self.storyName = story
return result
def getStoryName(self): def getStoryName(self):
return self.storyName return self.storyName
def getAuthorName(self): def getAuthorName(self):
return self.authorName return self.author
def getText(self, data, fetch = False):
lines = data.split('\n')
begin = self._findIndex(lines, '</select>', 0)+1
if begin == 0:
begiun = self._findIndex(lines, '<div><p>', 24)
if begin == 0:
print('BAD start')
pp.pprint(lines)
sys.abort()
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
return "\n".join(lines[begin:end])
class Downloader: def getPrintableUrl(self, url):
login = None return url
password = None
url = None
host = None
first = None
opener = None
writer = None
def __init__(self, url, login, password):
self.login = login
self.password = password
self.url = url
self.infoProvider = FicWad() #FFA() class FFA_UnitTests(unittest.TestCase):
def setUp(self):
parse = up.urlparse(url) logging.basicConfig(level=logging.DEBUG)
self.host = parse.scheme + '://' + parse.netloc pass
self.first = parse.path;
self.loginUrl = self.host + self.infoProvider.getLoginScript()
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
def _loginRequired(self): def testRequiresLoginNeg(self):
print('is login required?') f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
resp = self.opener.open(self.url) self.assertFalse(f.requiresLogin())
data = resp.read()
if data.find(self.infoProvider.getPasswordLine()) != -1:
print('yep')
return True
else:
print('nada')
return False
def _login(self):
(login, data) = self.infoProvider.getLoginPasswordOthers()
data[login['login']] = self.login
data[login['password']] = self.password
urlvals = u.urlencode(data)
req = self.opener.open(self.loginUrl, urlvals)
if req.read().find(self.infoProvider.getPasswordLine()) != -1:
return False
else:
return True
def _getContent(self, url): def testRequiresLogin(self):
print("<!-- Opening %s -->" % url) f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
return self.opener.open(url).read() self.assertTrue(f.requiresLogin())
def download(self): def testPerformLogin(self):
first = self._getContent(self.host + self.first) f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName()) if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
for u,n in urls: self.assertTrue(f.performLogin(None))
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
self.writer.writeChapter(n, text)
self.writer.finalise() def testExtractURLsAuthorStoryName(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
f.extractIndividualUrls()
self.assertEquals('Draco664', f.getAuthorName())
self.assertEquals('Apprentice Potter', f.getStoryName())
def testExtractUrls(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
urls = f.extractIndividualUrls()
self.assertEquals(25, len(urls))
self.assertEquals('Grievances', urls[2][1])
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
def testGetText(self):
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
def testGetTextLogin(self):
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
f = FFA(url)
if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
if f.requiresLogin():
f.performLogin()
data = f.getText(url)
seek = 'So Hokage-sama” I said, “this is how we came'
self.assertTrue(data.find(seek) != -1)
if __name__ == '__main__': if __name__ == '__main__':
f = Downloader(sys.argv[1], 'sigizmund', '***************') unittest.main()
if f._loginRequired():
f._login()
f.download()

View file

@ -59,7 +59,6 @@ class FicWad:
return self.authorName return self.authorName
def getText(self, url): def getText(self, url):
print(type(url))
if url.find('http://') == -1: if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url url = 'http://' + self.host + '/' + url

View file

@ -1,11 +1,15 @@
# -*- coding: utf-8 -*-
import os import os
import re import re
import sys import sys
import cgi import cgi
import uuid import uuid
import codecs
import shutil import shutil
import base64 import base64
import os.path import os.path
import zipfile
import urllib as u import urllib as u
import pprint as pp import pprint as pp
import urllib2 as u2 import urllib2 as u2
@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
from constants import * from constants import *
import zipdir
class FanficWriter: class FanficWriter:
def __init__(self): def __init__(self):
pass pass
@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
os.mkdir(self.directory + '/META-INF') os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS') os.mkdir(self.directory + '/OEBPS')
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
def _removeEntities(self, text): def _removeEntities(self, text):
for e in entities: for e in entities:
@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
def writeChapter(self, title, text): def writeChapter(self, title, text):
fileName = base64.b64encode(title) + ".xhtml" fileName = base64.b64encode(title) + ".xhtml"
filePath = self.directory + "/OEBPS/" + fileName filePath = self.directory + "/OEBPS/" + fileName
f = open(filePath, 'w') f = open(filePath, 'w')
text = self._removeEntities(text) text = self._removeEntities(text)
@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
# cleanup(self.soup ) # cleanup(self.soup )
text = self.soup.prettify() text = self.soup.prettify()
print(text)
print >> f, XHTML_START % (title, title) print >> f, XHTML_START % (title, title)
print >> f, text f.write(text)
print >> f, XHTML_END print >> f, XHTML_END
self.chapters.append((title, fileName)) self.chapters.append((title, fileName))
def finalise(self): def finalise(self):
print("Finalising...")
### writing table of contents -- ncx file ### writing table of contents -- ncx file
tocFilePath = self.directory + "/OEBPS/toc.ncx" tocFilePath = self.directory + "/OEBPS/toc.ncx"
toc = open(tocFilePath, 'w') toc = open(tocFilePath, 'w')
print >> toc, TOC_START % self.storyTitle print >> toc, TOC_START % self.storyTitle
print("Printing toc and refs")
### writing content -- opf file ### writing content -- opf file
opfFilePath = self.directory + "/OEBPS/content.opf" opfFilePath = self.directory + "/OEBPS/content.opf"
opf = open(opfFilePath, 'w') opf = open(opfFilePath, 'w')
@ -126,6 +137,8 @@ class EPubFanficWriter(FanficWriter):
ids.append(chapterId) ids.append(chapterId)
i = i + 1 i = i + 1
print('Toc and refs printed, proceesing to ref-ids....')
print >> toc, TOC_END print >> toc, TOC_END
print >> opf, CONTENT_END_MANIFEST print >> opf, CONTENT_END_MANIFEST
@ -133,4 +146,9 @@ class EPubFanficWriter(FanficWriter):
for chapterId in ids: for chapterId in ids:
print >> opf, CONTENT_ITEMREF % chapterId print >> opf, CONTENT_ITEMREF % chapterId
print >> opf, CONTENT_END print >> opf, CONTENT_END
print('Finished')
filename = self.directory + '.epub'
zipdir.toZip(filename, self.directory)