Stop calling prettify on the HTML; it sometimes introduces extraneous whitespace. Change the chapter file names and ids inside the metadata to sequential names (chapter0001, chapter0002, ...) rather than the base64 of the chapter title. Replace the invalid <center> and <u> tags with a div styled as centered and a span styled as underlined. Remove the dead ffa.py file. Change downaloder.py to no longer call ffa.py, but to call fictionalley again.

2026-05-04 18:40:51 +02:00 · 2010-09-30 21:33:58 -05:00 · 2010-09-30 21:33:58 -05:00 · f75910ce7d
commit f75910ce7d
parent 773ff3c03c
9 changed files with 32 additions and 302 deletions
--- a/constants.py
+++ b/constants.py
@ -48,13 +48,15 @@ CONTENT_START = '''<?xml version="1.0"?>
  <item id="style" href="stylesheet.css" media-type="text/css" />
 '''

-CONTENT_ITEM = '<item id="%s" href="%s" media-type="application/xhtml+xml" />'
+CONTENT_ITEM = '''<item id="%s" href="%s" media-type="application/xhtml+xml" />
+'''

 CONTENT_END_MANIFEST = '''</manifest>
 <spine toc="ncx">
 '''

-CONTENT_ITEMREF = '''<itemref idref="%s" />'''
+CONTENT_ITEMREF = '''<itemref idref="%s" />
+'''

 CONTENT_END = '''</spine>
 </package>
--- a/downaloder.py
+++ b/downaloder.py
@ -13,7 +13,6 @@ import BeautifulSoup as bs
 import htmlentitydefs as hdefs


-import ffa
 import ffnet
 import ficwad
 import output
@ -51,13 +50,13 @@ class FanficLoader:
 		urls = self.adapter.extractIndividualUrls()
 		self.writer = self.writerClass(self.booksDirectory, self.adapter.getStoryName(), self.adapter.getAuthorName(), inmemory=self.inmemory, compress=self.compress)
 		
-		i = 0
+		i = 1
 		for u,n in urls:
 			if not self.quiet:
 				print('Downloading chapter %d/%d' % (i, len(urls)))
-			i = i+1
 			text = self.adapter.getText(u)
-			self.writer.writeChapter(n, text)
+			self.writer.writeChapter(i, n, text)
+			i = i+1
 			
 		self.writer.finalise()
 		
@ -78,11 +77,10 @@ if __name__ == '__main__':
 	writerClass = None
 	
 	if url.find('fanficauthors') != -1:
-		adapter = ffa.FFA(url)
+		print >> sys.stderr, "fanficauthors.net already provides ebooks"
+		sys.exit(0)
 	elif url.find('fictionalley') != -1:
 		adapter = fictionalley.FictionAlley(url)
-		#print >> sys.stderr, "FictionAlley adapter is broken, try to find this fic on fanfiction.net or fanficauthors"
-		#sys.exit(0)
 	elif url.find('ficwad') != -1:
 		adapter = ficwad.FicWad(url)
 	elif url.find('fanfiction.net') != -1 or url.find('fictionpress.com') != -1:
--- a/ffa.py
+++ b/ffa.py
@ -1,235 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import os
-import re
-import sys
-import cgi
-import uuid
-import shutil
-import base64
-import os.path
-import logging
-import unittest
-import urllib as u
-import pprint as pp
-import urllib2 as u2
-import urlparse as up
-import BeautifulSoup as bs
-import htmlentitydefs as hdefs
-
-from constants import *
-from adapter import *
-
-try:
-	import login_password
-except:
-	# tough luck
-	pass
-
-class FFA(FanfictionSiteAdapter):
-	def __init__(self, url):
-		self.url = url
-		parsedUrl = up.urlparse(url)
-		self.host = parsedUrl.netloc
-		self.path = parsedUrl.path
-		self.opener = u2.build_opener(u2.HTTPCookieProcessor())
-	
-		logging.debug("Created FFA: url=%s" % (self.url))
-	
-	def _getLoginScript(self):
-		return self.path
-
-	def reqLoginData(self, data):
-		if data.find('<legend>Please login to continue</legend>') != -1 or data.find('<h4>Username or password not found.  Please') != -1 or data.find("This story is rated Mature, you must be logged in to view it") != -1:
-			return True
-		else:
-			return False
-
-	def requiresLogin(self, url = None):
-		if url == None:
-			u = self.url
-		else:
-			u = url
-
-		resp = self.opener.open(u)
-		data = resp.read()
-		return self.reqLoginData(data)
- 
-	def performLogin(self, url = None):
-		if url == None:
-			url = self.url
-		
-		data = {}
-		
-		data['username'] = self.login
-		data['password'] = self.password
-		data['submit'] = 'Submit'
-		
-		urlvals = u.urlencode(data)
-		loginUrl = 'http://' + self.host + self._getLoginScript()
-		logging.debug("Will now login to URL %s" % loginUrl)
-		
-		req = self.opener.open(loginUrl, urlvals)
-		
-		d = req.read()
-
-		if self.reqLoginData(d) :
-			return False
-		else:
-			return True
-	
-	def extractIndividualUrls(self):
-		data = self.opener.open(self.url).read()
-		soup = bs.BeautifulStoneSoup(data)
-
-		if self.reqLoginData(data):
-			logging.debug('Data requires login, trying to login')
-			if not self.performLogin(url):
-				logging.error('Cannot login, raising exception ... ')
-				raise LoginRequiredException(url)
-			else:
-				data = self.opener.open(url).read()
-			
-
-		self.author = str(soup.find('a', {'href' : '/contact/'}).string)
-		self.storyName = str(soup.find('h1', {'class' : 'textCenter'}).contents[0]).strip()
-		
-		logging.debug("Story `%s` by `%s`" % (self.storyName, self.author))
-		
-		selector = soup.find('select', {'class' : 'tinput'})
-		options = selector.findAll('option')
-		
-		urls = []
-		
-		for o in options:
-			title = o.string
-			url = o['value']
-			
-			urls.append((url,title))
-		
-		return urls
-
-	def getText(self, url):
-		if url.find('http://') == -1:
-			url = 'http://' + self.host + '/' + url
-		
-		logging.info('Downloading: %s' % url)
-		data = self.opener.open(url).read()
-		
-		if self.reqLoginData(data):
-			logging.debug('Data requires login, trying to login')
-			if not self.performLogin(url):
-				logging.error('Cannot login, raising exception ... ')
-				raise LoginRequiredException(url)
-			else:
-				data = self.opener.open(url).read()
-		
-		lines = data.split('\n')
-		
-		emit = False
-		
-		post = ''
-		
-		for l in lines:
-			if l.find('</div></form>') != -1:
-				logging.debug('emit = True')
-				emit = True
-				continue
-			elif l.find('<form action="#">') != -1:
-				logging.debug('emit = False')
-				if emit:
-					break
-				else:
-					emit = False
-			
-			if emit:
-				post = post + l + '\n'
-		
-		return post
-
-	def setLogin(self, login):
-		self.login = login
-
-	def setPassword(self, password):
-		self.password = password
-	
-	def getStoryName(self):
-		return self.storyName
-		
-	def getAuthorName(self):
-		return self.author
-
-	def getPrintableUrl(self, url):
-		return url
-
-class FFA_UnitTests(unittest.TestCase):
-	def setUp(self):
-		logging.basicConfig(level=logging.DEBUG)
-		pass
-	
-	def testRequiresLoginNeg(self):
-		f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
-		self.assertFalse(f.requiresLogin())
-	
-	def testRequiresLogin(self):
-		f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
-		self.assertTrue(f.requiresLogin())
-	
-	def testPerformLogin(self):
-		f = FFA('http://jeconais.fanficauthors.net/Rons_Harem/Rons_Harem/')
-		
-		if login_password != None:
-			f.setLogin(login_password.login)
-			f.setPassword(login_password.password)
-		
-		self.assertTrue(f.performLogin(None))
-		
-	def testExtractURLsAuthorStoryName(self):
-		f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
-		f.extractIndividualUrls()
-		
-		self.assertEquals('Draco664', f.getAuthorName())
-		self.assertEquals('Apprentice Potter', f.getStoryName())
-	
-	def testExtractUrls(self):
-		f = FFA('http://draco664.fanficauthors.net/Apprentice_Potter/Prologue/')
-		urls = f.extractIndividualUrls()
-		self.assertEquals(25, len(urls))
-		
-		self.assertEquals('Grievances', urls[2][1])
-		self.assertEquals('/Apprentice_Potter/Prologue/', urls[0][0])
-	
-	def testGetText(self):
-		f = FFA('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
-		data = f.getText('http://jeconais.fanficauthors.net/Happily_Ever_After/Introduction/')
-		
-		self.assertTrue(data.find('smiled slightly, and settled back in her rocking chair') != -1)
-		
-	def testGetTextLogin(self):
-		url = 'http://viridian.fanficauthors.net/Out_of_the_Darkness_A_Jinchuurikis_Tale/A_Harrowing_Escape/'
-		f = FFA(url)
-		
-		if login_password != None:
-			f.setLogin(login_password.login)
-			f.setPassword(login_password.password)
-		
-		if f.requiresLogin():
-			f.performLogin()
-		
-		data = f.getText(url)
-		seek = 'So Hokage-sama” I said, “this is how we came'
-		self.assertTrue(data.find(seek) != -1)
-	
-	def testSemiLoginRequired(self):
-		f = FFA('http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/The_End_of_Days/')
-		
-		urls = f.extractIndividualUrls()
-		
-		try:
-			data = f.getText('http://viridian.fanficauthors.net/Harry_Potter_and_the_Nightmares_of_Futures_Past/Doing_the_Mungo_Shuffle/')
-			self.assertTrue(False)
-		except LoginRequiredException, e:
-			self.assertTrue(True)
-		
-if __name__ == '__main__':
-	unittest.main()
--- a/ffnet.py
+++ b/ffnet.py
@ -6,7 +6,6 @@ import sys
 import cgi
 import uuid
 import shutil
-import base64
 import os.path
 import logging
 import unittest
@ -136,26 +135,8 @@ class FFNet(FanfictionSiteAdapter):
 			logging.error("Error downloading Chapter: %s" % url)
 			exit(1)
 			return '<html/>'
-		
-		return div.prettify()
-		
-		# 
-		# for l in lines:
-		# 	if l.find("<div id=storytextp class=storytextp") != -1 or l.find('<!-- start story -->') != -1 or l.find('<div id="storytextp"') != -1:
-		# 		logging.debug("starting at line: %s" % l)
-		# 		#s2 = bs.BeautifulStoneSoup(l)
-		# 		#return s2.div.prettify()
-		# 		emit = True
-		# 	
-		# 	if emit:
-		# 		textbuf = textbuf + "\n" + l
-		# 	
-		# 	if l.find("</div><div style='height:10px'></div> ") != -1 or l.find('<!-- end story -->') != -1:
-		# 		emit = False
-		# 
-		# s2 = bs.BeautifulStoneSoup(textbuf)
-		# return s2.div.prettify()
-		
+			
+		return div.__str__('utf8')
 		
 	def setLogin(self, login):
 		self.login = login
--- a/fictionalley.py
+++ b/fictionalley.py
@ -98,7 +98,7 @@ class FictionAlley(FanfictionSiteAdapter):
 			exit(1)
 			return '<html/>'
 		
-		return div.prettify()
+		return div.__str__('utf8')
 	
 	def getPrintableUrl(self, url):
 		return url
--- a/ficwad.py
+++ b/ficwad.py
@ -84,7 +84,7 @@ class FicWad(FanfictionSiteAdapter):
 			logging.error("Error downloading Chapter: %s" % url)
 			exit(1)
 			return '<html/>'
-		return div.prettify()
+		return div.__str__('utf8')
 	
 	def getPrintableUrl(self, url):
 		return url
--- a/hpfiction.py
+++ b/hpfiction.py
@ -6,7 +6,6 @@ import sys
 import cgi
 import uuid
 import shutil
-import base64
 import os.path
 import logging
 import unittest
@ -86,7 +85,7 @@ class HPFiction(FanfictionSiteAdapter):
 		if None == divtext:
 			logging.error("Error downloading Chapter: %s" % url)
 			exit(1)
-		return divtext.prettify()
+		return divtext.__str__('utf8')

 class FF_UnitTests(unittest.TestCase):
 	def setUp(self):
--- a/output.py
+++ b/output.py
@ -8,7 +8,6 @@ import uuid
 import codecs
 import shutil
 import string
-import base64
 import os.path
 import zipfile
 import StringIO
@ -33,7 +32,7 @@ class FanficWriter:
 	def __init__(self):
 		pass
 		
-	def writeChapter(self, title, text):
+	def writeChapter(self, index, title, text):
 		pass
 	
 	def finalise(self):
@ -45,8 +44,8 @@ class TextWriter(FanficWriter):
 	def __init__(self, base, name, author, inmemory=False, compress=False):
 		self.htmlWriter = HTMLWriter(base, name, author, True, False)
 	
-	def writeChapter(self, title, text):
-		self.htmlWriter.writeChapter(title, text)
+	def writeChapter(self, index, title, text):
+		self.htmlWriter.writeChapter(index, title, text)
 	
 	def finalise(self):
 		self.htmlWriter.finalise()
@ -85,7 +84,7 @@ class HTMLWriter(FanficWriter):
 		except:
 			return text
 	
-	def writeChapter(self, title, text):
+	def writeChapter(self, index, title, text):
 		title = self._printableVersion(title) #title.decode('utf-8')
 		text = self._printableVersion(text) #text.decode('utf-8')
 		self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
@ -94,7 +93,7 @@ class HTMLWriter(FanficWriter):
 	def finalise(self):
 		html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
 		soup = bs.BeautifulSoup(html)
-		result = soup.prettify()
+		result = soup.__str__('utf8')
 		
 #		f = open(self.fileName, 'w')
 #		f.write(result)
@ -178,16 +177,9 @@ class EPubFanficWriter(FanficWriter):
 		
 		return text
 	
-	def writeChapter(self, title, text):
+	def writeChapter(self, index, title, text):
 		logging.debug("Writing chapter: %s" % title)
-		try:
-			fileName = base64.b64encode(title) + ".xhtml"
-		except UnicodeEncodeError, e:
-			fileName = base64.b64encode(title.encode('utf-8')) + ".xhtml"
-		# Base64 can include +, / and =, which XML technically doesn't like
-		# in it's id attributes.  _ and - are okay and not otherwise used in Base64.
-		# The = for padding is superfluous
-		fileName = fileName.replace('/', '_').replace('+', '-').replace('=','')
+		fileName="chapter%04d.xhtml" % index

 		filePath = self.directory + "/OEBPS/" + fileName
 		
@ -207,21 +199,21 @@ class EPubFanficWriter(FanficWriter):
 			for attr in t._getAttrMap().keys():
 				if attr not in acceptable_attributes:
 					del t[attr]
+			# these are not acceptable strict XHTML.  But we do already have 
+			# CSS classes of the same names defined in constants.py
+			if t.name in ('u'):
+				t['class']=t.name
+				t.name='span'
+			if t.name in ('center'):
+				t['class']=t.name
+				t.name='div'

 		allPs = self.soup.findAll(recursive=True)
 		for p in allPs:
 			if p.string != None and len(p.string.strip()) == 0 :
 				p.extract()

-		# xhtml doesn't like <p> nesting in <p>, so leave divs.
-		# allBrs = self.soup.findAll(recursive=True, name = ['div'])
-		# for br in allBrs:
-			# if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
-				# br.name = 'p'
-
-#		cleanup(self.soup )
-		
-		text = self.soup.prettify()
+		text = self.soup.__str__('utf8')
 		
 		tt = self._removeEntities(title)
 		
@ -253,14 +245,7 @@ class EPubFanficWriter(FanficWriter):
 		
 		i = 1
 		for t,f in self.chapters:
-			try:
-				chapterId = base64.b64encode(t)
-			except UnicodeEncodeError, e:
-				chapterId = base64.b64encode(t.encode('utf-8'))
-			# Base64 can include +, / and =, which XML technically doesn't like
-			# in it's id attributes.  _ and - are okay and not otherwise used in Base64.
-			# The = for padding is superfluous
-			chapterId = chapterId.replace('/', '_').replace('+', '-').replace('=','')
+			chapterId = "chapter%04d" % i
 			
 			self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
 			self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
--- a/twilighted.py
+++ b/twilighted.py
@ -109,7 +109,7 @@ class Twilighted(FanfictionSiteAdapter):
    if None == div:
      return '<html/>'

-    return div.prettify()
+    return div.__str__('utf8')

  def _getLoginScript(self):
    return '/user.php?action=login'