FFA changed, architecture changed, not quite ready yet

sigizmund committed 2009-12-17 11:26:55 +00:00
parent a9748061f0
commit 6ef95c634d
5 changed files with 174 additions and 151 deletions

@@ -83,7 +83,7 @@ TOC_END = '''</navMap>
</ncx>
'''
XHTML_START = '''<?xml version="1.0" encoding="iso-8859-1"?>
XHTML_START = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>

@@ -28,7 +28,10 @@ class FanficLoader:
urls = self.adapter.extractIndividualUrls()
self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName())
i = 0
for u,n in urls:
print('Downloading chapter %d/%d' % (i, len(urls)))
i = i+1
text = self.adapter.getText(u)
self.writer.writeChapter(n, text)
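The hunk above implies a small writer contract: a constructor taking (books directory, story name, author name) and a writeChapter(title, text) method. A minimal stub satisfying that contract, as a hedged illustration only (the commit's actual writer is EPubFanficWriter):

# Hypothetical writer stub for illustration; PlainTextWriter is not part of this commit.
import os

class PlainTextWriter:
    def __init__(self, directory, storyName, authorName):
        # Mirrors the (directory, story, author) constructor used by FanficLoader above.
        self.f = open(os.path.join(directory, storyName + '.txt'), 'w')
        self.f.write('%s by %s\n\n' % (storyName, authorName))

    def writeChapter(self, title, text):
        # One chapter per call, matching writer.writeChapter(n, text) in the loop above.
        self.f.write('== %s ==\n%s\n\n' % (title, text))

    def finalise(self):
        self.f.close()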

ffa.py (259 lines changed)

@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
@@ -6,182 +8,183 @@ import uuid
import shutil
import base64
import os.path
import logging
import unittest
import urllib as u
import pprint as pp
import urllib2 as u2
import login_password
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
from constants import *
from ficwad import *
class FFA:
storyName = None
def __init__(self, url):
self.url = url
parsedUrl = up.urlparse(url)
self.host = parsedUrl.netloc
self.path = parsedUrl.path
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
def __init__(self):
self.grabUrl = re.compile('(\<option.+value=\")(.+?)\"\>(.+?)\<')
self.grabAuthor = re.compile('.+pemail.+\'(\w+)')
logging.debug("Created FFA: url=%s" % (self.url))
def getPasswordLine(self):
return '<input type="password" name="pass"'
def _getLoginScript(self):
return self.path
def getLoginScript(self):
return '/scripts/login.php'
def requiresLogin(self, url = None):
resp = self.opener.open(self.url)
data = resp.read()
if data.find('<legend>Please login to continue</legend>') != -1:
return True
else:
return False
def getLoginPasswordOthers(self):
login = dict(login = 'name', password = 'pass')
other = dict(submit = 'Log In', remember='yes')
return (login, other)
def performLogin(self, url = None):
if url == None:
url = self.url
def getPrintableUrl(self, url):
return url + '?print=yes'
data = {}
def _findIndex(self, lines, what, start):
for i in range(start, len(lines)):
if lines[i].find(what) != -1:
return i
return -1
data['username'] = self.login
data['password'] = self.password
data['submit'] = 'Submit'
urlvals = u.urlencode(data)
loginUrl = 'http://' + self.host + self._getLoginScript()
logging.debug("Will now login to URL %s" % loginUrl)
req = self.opener.open(loginUrl, urlvals)
if self.requiresLogin():
return False
else:
return True
def extractIndividualUrls(self):
data = self.opener.open(self.url).read()
soup = bs.BeautifulStoneSoup(data)
self.author = soup.find('a', {'href' : '/contact/'}).string
self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
selector = soup.find('select', {'class' : 'tinput'})
options = selector.findAll('option')
urls = []
for o in options:
title = o.string
url = o['value']
urls.append((url,title))
return urls
def getText(self, url):
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url
data = self.opener.open(url).read()
def extractIndividualUrls(self, data, host, first, fetch = False):
lines = data.split('\n')
optionLines = filter(lambda x : x.find('<option value="') != -1, lines)
emit = False
authorLines = filter(lambda x : x.find('pemail') != -1, lines)
for al in authorLines:
m = self.grabAuthor.match(al)
if m != None:
self.authorName = m.group(1)
break
post = ''
optionsLines = optionLines[:len(optionLines)/2]
storyName = first.split("/")[1]
result = []
urls = []
for line in optionLines:
m = self.grabUrl.match(line)
u = m.group(2)
if u.find('" selected="selected') != -1:
u = u.replace('" selected="selected', '')
if u in urls:
for l in lines:
if l.find('</div></form>') != -1:
logging.debug('emit = True')
emit = True
continue
elif l.find('<form action="#">') != -1:
logging.debug('emit = False')
if emit:
break
else:
urls.append(u)
emit = False
result.append((self.getPrintableUrl(storyName + "/" + u), m.group(3)))
if emit:
post = post + l + '\n'
self.soup = bs.BeautifulSoup(data)
titles = self.soup.findAll(name = 'title', recursive=True)
if len(titles) > 0:
title = titles[0]
print(title)
(website, rest) = title.string.split('::')
story_chapter = rest.split("-")
return post
story = story_chapter[0].strip()
self.storyName = story
def setLogin(self, login):
self.login = login
return result
def setPassword(self, password):
self.password = password
def getStoryName(self):
return self.storyName
def getAuthorName(self):
return self.authorName
return self.author
def getText(self, data, fetch = False):
lines = data.split('\n')
begin = self._findIndex(lines, '</select>', 0)+1
if begin == 0:
begin = self._findIndex(lines, '<div><p>', 24)
def getPrintableUrl(self, url):
return url
if begin == 0:
print('BAD start')
pp.pprint(lines)
sys.exit(1)
end = self._findIndex(lines, '<form action="index.php"><div class="topandbotline"', begin)
print('<!-- ========= begin=%d, end=%d ============= -->' % (begin, end))
return "\n".join(lines[begin:end])
class FFA_UnitTests(unittest.TestCase):
def setUp(self):
logging.basicConfig(level=logging.DEBUG)
pass
class Downloader:
login = None
password = None
url = None
host = None
first = None
opener = None
def testRequiresLoginNeg(self):
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
self.assertFalse(f.requiresLogin())
writer = None
def testRequiresLogin(self):
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.assertTrue(f.requiresLogin())
def __init__(self, url, login, password):
self.login = login
self.password = password
self.url = url
def testPerformLogin(self):
f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
self.infoProvider = FicWad() #FFA()
if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
parse = up.urlparse(url)
self.host = parse.scheme + '://' + parse.netloc
self.first = parse.path;
self.assertTrue(f.performLogin(None))
self.loginUrl = self.host + self.infoProvider.getLoginScript()
def testExtractURLsAuthorStoryName(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
f.extractIndividualUrls()
self.opener = u2.build_opener(u2.HTTPCookieProcessor())
self.assertEquals('Draco664', f.getAuthorName())
self.assertEquals('Apprentice Potter', f.getStoryName())
def testExtractUrls(self):
f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
urls = f.extractIndividualUrls()
self.assertEquals(25, len(urls))
def _loginRequired(self):
print('is login required?')
resp = self.opener.open(self.url)
data = resp.read()
if data.find(self.infoProvider.getPasswordLine()) != -1:
print('yep')
return True
else:
print('nada')
return False
self.assertEquals('Grievances', urls[2][1])
self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
def _login(self):
(login, data) = self.infoProvider.getLoginPasswordOthers()
def testGetText(self):
f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
data[login['login']] = self.login
data[login['password']] = self.password
self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
urlvals = u.urlencode(data)
req = self.opener.open(self.loginUrl, urlvals)
def testGetTextLogin(self):
url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
f = FFA(url)
if req.read().find(self.infoProvider.getPasswordLine()) != -1:
return False
else:
return True
if login_password != None:
f.setLogin(login_password.login)
f.setPassword(login_password.password)
def _getContent(self, url):
print("<!-- Opening %s -->" % url)
return self.opener.open(url).read()
def download(self):
first = self._getContent(self.host + self.first)
urls = self.infoProvider.extractIndividualUrls(first, self.host, self.first)
self.writer = EPubFanficWriter("books", self.infoProvider.getStoryName(), self.infoProvider.getAuthorName())
for u,n in urls:
text = self.infoProvider.getText(self._getContent(self.host+"/"+u))
self.writer.writeChapter(n, text)
self.writer.finalise()
if f.requiresLogin():
f.performLogin()
data = f.getText(url)
seek = 'So Hokage-sama” I said, “this is how we came'
self.assertTrue(data.find(seek) != -1)
if __name__ == '__main__':
f = Downloader(sys.argv[1], 'sigizmund', '***************')
if f._loginRequired():
f._login()
f.download()
unittest.main()
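Read together, the new architecture in ffa.py is: Downloader holds the URL and credentials, delegates site-specific parsing to an info provider (FicWad() here, with FFA() left in a comment), and hands chapters to EPubFanficWriter. A sketch of the resulting control flow, assuming the module is run directly; this restates the __main__ block above rather than adding behaviour:

# Hypothetical driver mirroring the __main__ block above; credentials are placeholders.
import sys

def run(url, login, password):
    d = Downloader(url, login, password)
    if d._loginRequired():      # probe the story page for a password field
        d._login()              # POST the credentials to the provider's login script
    d.download()                # fetch every chapter URL, write it, finalise the EPUB

if __name__ == '__main__':
    run(sys.argv[1], 'user', 'secret')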

@@ -59,7 +59,6 @@ class FicWad:
return self.authorName
def getText(self, url):
print(type(url))
if url.find('http://') == -1:
url = 'http://' + self.host + '/' + url

@@ -1,11 +1,15 @@
# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import codecs
import shutil
import base64
import os.path
import zipfile
import urllib as u
import pprint as pp
import urllib2 as u2
@@ -15,6 +19,9 @@ import htmlentitydefs as hdefs
from constants import *
import zipdir
class FanficWriter:
def __init__(self):
pass
@@ -54,9 +61,9 @@ class EPubFanficWriter(FanficWriter):
os.mkdir(self.directory + '/META-INF')
os.mkdir(self.directory + '/OEBPS')
print >> open(self.directory + '/mimetype', 'w'), MIMETYPE
print >> open(self.directory + '/META-INF/container.xml', 'w'), CONTAINER
print >> open(self.directory + '/OEBPS/stylesheet.css', 'w'), CSS
print >> codecs.open(self.directory + '/mimetype', 'w', 'utf-8'), MIMETYPE
print >> codecs.open(self.directory + '/META-INF/container.xml', 'w', 'utf-8'), CONTAINER
print >> codecs.open(self.directory + '/OEBPS/stylesheet.css', 'w', 'utf-8'), CSS
def _removeEntities(self, text):
for e in entities:
@@ -68,6 +75,7 @@ class EPubFanficWriter(FanficWriter):
def writeChapter(self, title, text):
fileName = base64.b64encode(title) + ".xhtml"
filePath = self.directory + "/OEBPS/" + fileName
f = open(filePath, 'w')
text = self._removeEntities(text)
@@ -93,21 +101,24 @@ class EPubFanficWriter(FanficWriter):
# cleanup(self.soup )
text = self.soup.prettify()
print(text)
print >> f, XHTML_START % (title, title)
print >> f, text
f.write(text)
print >> f, XHTML_END
self.chapters.append((title, fileName))
def finalise(self):
print("Finalising...")
### writing table of contents -- ncx file
tocFilePath = self.directory + "/OEBPS/toc.ncx"
toc = open(tocFilePath, 'w')
print >> toc, TOC_START % self.storyTitle
print("Printing toc and refs")
### writing content -- opf file
opfFilePath = self.directory + "/OEBPS/content.opf"
opf = open(opfFilePath, 'w')
@@ -127,6 +138,8 @@ class EPubFanficWriter(FanficWriter):
i = i + 1
print('Toc and refs printed, proceeding to ref-ids....')
print >> toc, TOC_END
print >> opf, CONTENT_END_MANIFEST
@@ -134,3 +147,8 @@ class EPubFanficWriter(FanficWriter):
print >> opf, CONTENT_ITEMREF % chapterId
print >> opf, CONTENT_END
print('Finished')
filename = self.directory + '.epub'
zipdir.toZip(filename, self.directory)
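zipdir.toZip itself is not shown in this commit. As a hedged sketch only (not the project's actual module), such a helper typically walks the staging directory and zips it, writing the mimetype entry first and uncompressed, as the EPUB container format expects:

# Hypothetical zipdir.toZip; the real module is outside this diff.
import os
import zipfile

def toZip(zipFileName, directory):
    zf = zipfile.ZipFile(zipFileName, 'w', zipfile.ZIP_DEFLATED)
    mimetype = os.path.join(directory, 'mimetype')
    if os.path.exists(mimetype):
        # EPUB readers expect 'mimetype' as the first, uncompressed entry.
        zf.write(mimetype, 'mimetype', zipfile.ZIP_STORED)
    for root, dirs, files in os.walk(directory):
        for name in files:
            full = os.path.join(root, name)
            arcname = os.path.relpath(full, directory).replace(os.sep, '/')
            if arcname != 'mimetype':
                zf.write(full, arcname)
    zf.close()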