FFA changed, architecture changed, not quite ready yet

This commit is contained in:
sigizmund 2009-12-17 11:26:55 +00:00
parent a9748061f0
commit 6ef95c634d
5 changed files with 174 additions and 151 deletions

View file

@ -83,7 +83,7 @@ TOC_END = '''</navMap>
</ncx> </ncx>
''' '''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?> XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"> <html xmlns="http://www.w3.org/1999/xhtml">
<head> <head>

View file

@ -28,7 +28,10 @@ class FanficLoader:
urls = self.adapter.extractIndividualUrls() urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName()) self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
i = 0
for u,n in urls: for u,n in urls:
print('Downloading chapter %d/%d' % (i, len(urls)))
i = i+1
text = self.adapter.getText(u) text = self.adapter.getText(u)
self.writer.writeChapter(n, text) self.writer.writeChapter(n, text)

261
ffa.py
View file

@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os import os
import re import re
import sys import sys
@ -6,182 +8,183 @@ import uuid
import shutil import shutil
import base64 import base64
import os.path import os.path
import logging
import unittest
import urllib as u import urllib as u
import pprint as pp import pprint as pp
import urllib2 as u2 import urllib2 as u2
import login_password
import urlparse as up import urlparse as up
import BeautifulSoup as bs import BeautifulSoup as bs
import htmlentitydefs as hdefs import htmlentitydefs as hdefs
from constants import * from constants import *
from ficwad import *
class FFA: class FFA:
storyName = None def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
def __init__(self): logging.debug("Created FFA: url=%s" % (self.url))
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
def getPasswordLine(self): def _getLoginScript(self):
return '<input type="password" name="pass"' return self.path
def getLoginScript(self): def requiresLogin(self, url = None):
return '/scripts/login.php' resp = self.opener.open(self.url)
data = resp.read()
if data.find('<legend>Please login to continue</legend>') != -1:
return True
else:
return False
def getLoginPasswordOthers(self): def performLogin(self, url = None):
login = dict(login = 'name', password = 'pass') if url == None:
other = dict(submit = 'Log In', remember='yes') url = self.url
return (login, other)
def getPrintableUrl(self, url): data = {}
return url + '?print=yes'
def _findIndex(self, lines, what, start): data['username'] = self.login
for i in range(start, len(lines)): data['password'] = self.password
if lines[i].find(what) != -1: data['submit'] = 'Submit'
return i
return -1 urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
if self.requiresLogin():
return False
else:
return True
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
soup = bs.BeautifulStoneSoup(data)
self.author = soup.find('a', {'href' : '/contact/'}).string
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
selector = soup.find('select', {'class' : 'tinput'})
options = selector.findAll('option')
urls = []
for o in options:
title = o.string
url = o['value']
urls.append((url,title))
return urls
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = self.opener.open(url).read()
def extractIndividualUrls(self, data, host, first, fetch = False):
lines = data.split('\n') lines = data.split('\n')
optionLines = filter(lambda x : x.find('<option value="') != -1, lines) emit = False
authorLines = filter(lambda x : x.find('pemail') != -1, lines) post = ''
for al in authorLines:
m = self.grabAuthor.match(al)
if m != None:
self.authorName = m.group(1)
break
for l in lines:
optionsLines = optionLines[:len(optionLines)/2] if l.find('</div></form>') != -1:
logging.debug('emit = True')
storyName = first.split("/")[1] emit = True
result = []
urls = []
for line in optionLines:
m = self.grabUrl.match(line)
u = m.group(2)
if u.find('" selected="selected') != -1:
u = u.replace('" selected="selected', '')
if u in urls:
continue continue
else: elif l.find('<form action="#">') != -1:
urls.append(u) logging.debug('emit = False')
if emit:
break
else:
emit = False
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3))) if emit:
post = post + l + '\n'
self.soup = bs.BeautifulSoup(data) return post
titles = self.soup.findAll(name = 'title', recursive=True)
if len(titles) > 0:
title = titles[0]
print(title)
(website, rest) = title.string.split('::')
story_chapter = rest.split("-")
story = story_chapter[0].strip() def setLogin(self, login):
self.storyName = story self.login = login
return result def setPassword(self, password):
self.password = password
def getStoryName(self): def getStoryName(self):
return self.storyName return self.storyName
def getAuthorName(self): def getAuthorName(self):
return self.authorName return self.author
def getText(self, data, fetch = False): def getPrintableUrl(self, url):
lines = data.split('\n') return url
begin = self._findIndex(lines, '</select>', 0)+1
if begin == 0:
begiun = self._findIndex(lines, '<div><p>', 24)
if begin == 0: class FFA_UnitTests(unittest.TestCase):
print('BAD start') def setUp(self):
pp.pprint(lines) logging.basicConfig(level=logging.DEBUG)
sys.abort() pass
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
return "\n".join(lines[begin:end])
class Downloader: def testRequiresLoginNeg(self):
login = None f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
password = None self.assertFalse(f.requiresLogin())
url = None
host = None
first = None
opener = None
writer = None def testRequiresLogin(self):
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.assertTrue(f.requiresLogin())
def __init__(self, url, login, password): def testPerformLogin(self):
self.login = login f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.password = password
self.url = url
self.infoProvider = FicWad() #FFA() if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
parse = up.urlparse(url) self.assertTrue(f.performLogin(None))
self.host = parse.scheme + '://' + parse.netloc
self.first = parse.path;
self.loginUrl = self.host + self.infoProvider.getLoginScript() def testExtractURLsAuthorStoryName(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
f.extractIndividualUrls()
self.opener = u2.build_opener(u2.HTTPCookieProcessor()) self.assertEquals('Draco664', f.getAuthorName())
self.assertEquals('Apprentice Potter', f.getStoryName())
def testExtractUrls(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
urls = f.extractIndividualUrls()
self.assertEquals(25, len(urls))
def _loginRequired(self): self.assertEquals('Grievances', urls[2][1])
print('is login required?') self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
resp = self.opener.open(self.url)
data = resp.read()
if data.find(self.infoProvider.getPasswordLine()) != -1:
print('yep')
return True
else:
print('nada')
return False
def _login(self): def testGetText(self):
(login, data) = self.infoProvider.getLoginPasswordOthers() f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data[login['login']] = self.login self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
data[login['password']] = self.password
urlvals = u.urlencode(data) def testGetTextLogin(self):
req = self.opener.open(self.loginUrl, urlvals) url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
f = FFA(url)
if req.read().find(self.infoProvider.getPasswordLine()) != -1: if login_password != None:
return False f.setLogin(login_password.login)
else: f.setPassword(login_password.password)
return True
def _getContent(self, url): if f.requiresLogin():
print("<!-- Opening %s -->" % url) f.performLogin()
return self.opener.open(url).read()
def download(self):
first = self._getContent(self.host + self.first)
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
for u,n in urls:
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
self.writer.writeChapter(n, text)
self.writer.finalise()
data = f.getText(url)
seek = 'So Hokage-sama” I said, “this is how we came'
self.assertTrue(data.find(seek) != -1)
if __name__ == '__main__': if __name__ == '__main__':
f = Downloader(sys.argv[1], 'sigizmund', '***************') unittest.main()
if f._loginRequired():
f._login()
f.download()

View file

@ -59,7 +59,6 @@ class FicWad:
return self.authorName return self.authorName
def getText(self, url): def getText(self, url):
print(type(url))
if url.find('http://') == -1: if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url url = 'http://' + self.host + '/' + url

View file

@ -1,11 +1,15 @@
# -*- coding: utf-8 -*-
import os import os
import re import re
import sys import sys
import cgi import cgi
import uuid import uuid
import codecs
import shutil import shutil
import base64 import base64
import os.path import os.path
import zipfile
import urllib as u import urllib as u
import pprint as pp import pprint as pp
import urllib2 as u2 import urllib2 as u2
@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
from constants import * from constants import *
import zipdir
class FanficWriter: class FanficWriter:
def __init__(self): def __init__(self):
pass pass
@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
os.mkdir(self.directory + '/META-INF') os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS') os.mkdir(self.directory + '/OEBPS')
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
def _removeEntities(self, text): def _removeEntities(self, text):
for e in entities: for e in entities:
@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
def writeChapter(self, title, text): def writeChapter(self, title, text):
fileName = base64.b64encode(title) + ".xhtml" fileName = base64.b64encode(title) + ".xhtml"
filePath = self.directory + "/OEBPS/" + fileName filePath = self.directory + "/OEBPS/" + fileName
f = open(filePath, 'w') f = open(filePath, 'w')
text = self._removeEntities(text) text = self._removeEntities(text)
@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
# cleanup(self.soup ) # cleanup(self.soup )
text = self.soup.prettify() text = self.soup.prettify()
print(text)
print >> f, XHTML_START % (title, title) print >> f, XHTML_START % (title, title)
print >> f, text f.write(text)
print >> f, XHTML_END print >> f, XHTML_END
self.chapters.append((title, fileName)) self.chapters.append((title, fileName))
def finalise(self): def finalise(self):
print("Finalising...")
### writing table of contents -- ncx file ### writing table of contents -- ncx file
tocFilePath = self.directory + "/OEBPS/toc.ncx" tocFilePath = self.directory + "/OEBPS/toc.ncx"
toc = open(tocFilePath, 'w') toc = open(tocFilePath, 'w')
print >> toc, TOC_START % self.storyTitle print >> toc, TOC_START % self.storyTitle
print("Printing toc and refs")
### writing content -- opf file ### writing content -- opf file
opfFilePath = self.directory + "/OEBPS/content.opf" opfFilePath = self.directory + "/OEBPS/content.opf"
opf = open(opfFilePath, 'w') opf = open(opfFilePath, 'w')
@ -127,6 +138,8 @@ class EPubFanficWriter(FanficWriter):
i = i + 1 i = i + 1
print('Toc and refs printed, proceesing to ref-ids....')
print >> toc, TOC_END print >> toc, TOC_END
print >> opf, CONTENT_END_MANIFEST print >> opf, CONTENT_END_MANIFEST
@ -134,3 +147,8 @@ class EPubFanficWriter(FanficWriter):
print >> opf, CONTENT_ITEMREF % chapterId print >> opf, CONTENT_ITEMREF % chapterId
print >> opf, CONTENT_END print >> opf, CONTENT_END
print('Finished')
filename = self.directory + '.epub'
zipdir.toZip(filename, self.directory)