# FanFicFare/output.py

# -*- coding: utf-8 -*-

import os
import re
import sys
import cgi
import uuid
import unicodedata
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

import zipdir
import html_constants
from constants import *


import html2text
import datetime


class FanficWriter:
	"""Common interface shared by the story writers in this module.

	Every method here is a no-op; the concrete writers below override
	them to produce an actual output format.
	"""

	def __init__(self):
		"""Do nothing; concrete writers set up their own state."""

	def writeChapter(self, index, title, text):
		"""Accept one chapter; the base implementation ignores it."""
		return None

	def finalise(self):
		"""Finish the output; the base implementation does nothing."""
		return None

class TextWriter(FanficWriter):
	"""Plain-text writer built on top of an in-memory HTMLWriter.

	Chapters are forwarded to the HTML delegate; on finalise() the
	rendered HTML is converted with html2text and stored, UTF-8 encoded,
	in ``self.output``.
	"""
	# Delegate writer; replaced with a real HTMLWriter in __init__.
	htmlWriter = None

	def __init__(self, base, adapter, inmemory=False, compress=False):
		"""Create the HTML delegate.

		``inmemory`` and ``compress`` are accepted for signature
		compatibility but not used: the delegate is always created
		in-memory and uncompressed.
		"""
		self.htmlWriter = HTMLWriter(base, adapter, inmemory=True, compress=False)

	def writeChapter(self, index, title, text):
		"""Forward the chapter unchanged to the HTML delegate."""
		delegate = self.htmlWriter
		delegate.writeChapter(index, title, text)

	def finalise(self):
		"""Render the HTML, convert it to text and publish output/name."""
		delegate = self.htmlWriter
		delegate.finalise()

		buf = StringIO.StringIO()
		self.output = buf

		renderedHtml = delegate.output.getvalue().decode('utf-8')
		plainText = html2text.html2text(renderedHtml)
		buf.write(plainText.encode('utf-8'))

		self.name = delegate.name


class HTMLWriter(FanficWriter):
	"""Collects chapters and renders them into one HTML document.

	Chapter markup is accumulated in ``self.body``; finalise() substitutes
	it into the XHTML_START template from html_constants, passes the result
	through BeautifulSoup and writes it to ``self.output`` (a StringIO when
	``inmemory`` is true, otherwise the file ``<base>/<name>.html``).
	"""
	# Class-level default, kept so existing code reading ``HTMLWriter.body``
	# still works; every instance gets its own value in __init__.
	body = ''

	def __init__(self, base, adapter, inmemory=False, compress=False):
		"""Prepare the templates and the output target.

		base     -- directory the .html file is written to (file mode only)
		adapter  -- story adapter; getStoryName(), getOutputName() and
		            getAuthorName() are read here
		inmemory -- when true, write to a StringIO instead of a file
		compress -- accepted for signature compatibility; not used
		"""
		self.basePath = base
		self.storyTitle = removeEntities(adapter.getStoryName())
		self.name = makeAcceptableFilename(adapter.getOutputName())
		self.fileName = self.basePath + '/' + self.name + '.html'
		self.authorName = removeEntities(adapter.getAuthorName())
		self.adapter = adapter

		self.inmemory = inmemory
		self.body = ''

		# Build the templates before opening the file so that a failure
		# here cannot leave an open file handle behind.
		self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
		self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

		if self.inmemory:
			self.output = StringIO.StringIO()
		else:
			if os.path.exists(self.fileName):
				os.remove(self.fileName)
			self.output = open(self.fileName, 'w')

	def _printableVersion(self, text):
		"""Return ``text`` decoded from UTF-8, or unchanged if that fails."""
		try:
			return text.decode('utf-8')
		except Exception:
			# Best effort: anything that cannot be decoded is passed
			# through as-is.  ``Exception`` (rather than a bare except)
			# lets KeyboardInterrupt and SystemExit propagate.
			return text

	def writeChapter(self, index, title, text):
		"""Append a chapter heading and its text to the document body.

		``index`` is not used by this writer.
		"""
		title = self._printableVersion(title)
		text = self._printableVersion(text)
		heading = self.chapterStartTemplate.substitute({'chapter' : title})
		self.body = self.body + '\n' + heading
		self.body = self.body + '\n' + text

	def finalise(self):
		"""Render the complete document and write it to ``self.output``.

		In file mode the file is closed afterwards, even when rendering
		or writing raises.
		"""
		try:
			html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
			soup = bs.BeautifulSoup(html)
			result = soup.__str__('utf8')
			self.output.write(result)
		finally:
			if not self.inmemory:
				self.output.close()

class EPubFanficWriter(FanficWriter):
	"""Assembles an EPUB from chapters supplied one at a time.

	Every archive member is buffered in ``self.files`` (member name ->
	file-like object).  finalise() adds the table of contents, title page
	and OPF manifest, zips the members with zipdir.inMemoryZip() and then
	either writes ``<base>/<name>.epub`` or exposes the zip buffer as
	``self.output``.
	"""
	# Class-level defaults; __init__ replaces both with per-instance objects.
	chapters = []

	files = {}

	def _writeFile(self, fileName, data):
		"""Append ``data`` to the archive member ``fileName``.

		The member is created on first use.
		"""
		if fileName not in self.files:
			if self.inmemory:
				self.files[fileName] = StringIO.StringIO()
			else:
				# NOTE: __init__ currently forces self.inmemory to True,
				# so this branch is not reached and nothing creates
				# self.directory.  codecs.open() is used because the
				# built-in open() of Python 2 has no ``encoding`` argument.
				self.files[fileName] = codecs.open(self.directory + '/' + fileName, encoding='utf-8', mode='w')

		try:
			d = data.decode('utf-8')
		except UnicodeEncodeError:
			# On Python 2, calling decode() on a unicode object that
			# holds non-ASCII characters raises UnicodeEncodeError;
			# such data is written unchanged.
			d = data

		self.files[fileName].write(d)

	def _closeFiles(self):
		"""Close every member file when members are real files on disk."""
		if not self.inmemory:
			for f in self.files:
				self.files[f].close()

	def __init__(self, base, adapter, inmemory=False, compress=True):
		"""Set up the writer and emit the fixed EPUB members.

		base     -- directory the .epub is written to (file mode only)
		adapter  -- story adapter supplying all metadata
		inmemory -- when true, finalise() leaves the zip in ``self.output``
		            instead of writing a file
		compress -- accepted for signature compatibility; not used
		"""
		self.basePath = base
		self.storyTitle = removeEntities(adapter.getStoryName())
		self.name = makeAcceptableFilename(adapter.getOutputName())
		self.directory = self.basePath + '/' + self.name
		self.authorName = removeEntities(adapter.getAuthorName())
		self.inmemory = inmemory
		self.adapter = adapter

		self.files = {}
		self.chapters = []

		# Members are always buffered in memory; ``writeToFile`` records
		# whether the caller asked for an on-disk .epub at the end.
		if not self.inmemory:
			self.inmemory = True
			self.writeToFile = True
		else:
			self.writeToFile = False

		self._writeFile('mimetype', MIMETYPE)
		self._writeFile('META-INF/container.xml', CONTAINER)
		self._writeFile('OEBPS/stylesheet.css', CSS)

	def writeChapter(self, index, title, text):
		"""Clean up one chapter and add it as OEBPS/chapterNNNN.xhtml.

		index -- number used in the chapter's file name
		title -- chapter title (entities are normalised)
		text  -- chapter markup
		"""
		title = removeEntities(title)
		logging.debug("Writing chapter: %s", title)
		fileName = "chapter%04d.xhtml" % index

		fn = 'OEBPS/' + fileName

		text = removeEntities(text)

		# BeautifulStoneSoup doesn't have any selfClosingTags by default.
		# hr & br needs to be if they're going to work.
		# Some stories do use multiple br tags as their section breaks...
		self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))

		allTags = self.soup.findAll(recursive=True)
		for t in allTags:
			# Iterate over a copy of the attribute names because
			# attributes are deleted inside the loop.
			for attr in list(t._getAttrMap().keys()):
				if attr not in acceptable_attributes:
					del t[attr]
			# these are not acceptable strict XHTML.  But we do already have
			# CSS classes of the same names defined in constants.py
			# Compare with ==: ``t.name in ('u')`` is a substring test on
			# the string 'u', not a tuple membership test.
			if t.name == 'u':
				t['class'] = t.name
				t.name = 'span'
			elif t.name == 'center':
				t['class'] = t.name
				t.name = 'div'
			# removes paired, but empty tags.
			if t.string is not None and len(t.string.strip()) == 0:
				t.extract()

		text = self.soup.__str__('utf8')

		# ffnet(& maybe others) gives the whole chapter text
		# as one line.  This causes problems for nook(at
		# least) when the chapter size starts getting big
		# (200k+) Using Soup's prettify() messes up italics
		# and such.  Done after soup extract so <p> and <br>
		# tags are normalized.  Doing it here seems less evil
		# than hacking BeautifulSoup, but it's debatable.
		text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

		self._writeFile(fn, XHTML_START % (title, title))
		self._writeFile(fn, text)
		self._writeFile(fn, XHTML_END)

		self.chapters.append((title, fileName))

	def finalise(self):
		"""Write the TOC, title page and OPF manifest, then build the zip.

		When the writer was created with ``inmemory`` false the archive is
		written to ``<base>/<name>.epub``; otherwise the zip buffer is
		stored in ``self.output``.
		"""
		logging.debug("Finalising...")

		### writing table of contents -- ncx file
		tocFilePath = "OEBPS/toc.ncx"
		self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))

		published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
		createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
		created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
		updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
		calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")

		description = self.adapter.getStoryDescription()
		if hasattr(description, "text"):
			description = description.text
		# Test for None *before* converting to a string, otherwise a
		# missing description turns into the literal text 'None'.
		if description is None:
			description = ''
		elif not isinstance(description, basestring):
			# Strings are left alone: str() on a unicode object with
			# non-ASCII characters raises UnicodeEncodeError on Python 2.
			description = str(description)

		if len(description) > 0:
			description = description.replace ('\\\'', '').replace('\\\"', '')
			description =  removeEntities(description.replace('&nbsp;',' ').replace('&rsquo;',''))
		else:
			description = ' '

		### writing content -- title page
		titleFilePath = "OEBPS/title_page.xhtml"
		self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))

		titleEntries = [
			('Category:', self.adapter.getCategory()),
			('Genre:', self.adapter.getGenre()),
			('Status:', self.adapter.getStoryStatus()),
			('Published:', published),
			('Updated:', updated),
			('Packaged:', createda),
			('Rating Age/User:', self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()),
			('Chapters/Words:', str(self.adapter.getNumChapters()) + " / " + str(self.adapter.getNumWords())),
			('Publisher:', self.adapter.getHost()),
			('Story ID:', self.adapter.getStoryId()),
			('Author ID:', self.adapter.getAuthorId()),
		]
		for label, value in titleEntries:
			self._writeFile(titleFilePath, TITLE_ENTRY % (label, value))

		self._writeFile(titleFilePath, TITLE_FOOTER % description )

		### writing content -- opf file
		opfFilePath = "OEBPS/content.opf"
		self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))

		# Emit every subject; fall back to a generic one when there are none.
		wroteSubject = False
		for subj in self.adapter.getSubjects():
			self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
			wroteSubject = True
		if not wroteSubject:
			self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")

		self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))

		# The title page comes first (position 0), followed by the
		# chapters numbered from 1 in the order they were written.
		ids = []

		i = 0

		t = "Title Page"
		f = "title_page.xhtml"
		# NOTE(review): this id contains a space; confirm the templates
		# in constants.py and target readers accept it.
		chapterId = "Title Page"
		self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
		self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

		ids.append(chapterId)

		i = i + 1

		for t, f in self.chapters:
			chapterId = "chapter%04d" % i

			self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
			self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

			ids.append(chapterId)

			i = i + 1

		self._writeFile(tocFilePath, TOC_END)
		self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

		for chapterId in ids:
			self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

		self._writeFile(opfFilePath, CONTENT_END)

		self._closeFiles()

		filename = self.directory + '.epub'

		zipdata = zipdir.inMemoryZip(self.files)

		if self.writeToFile:
			f = open(filename, 'wb')
			try:
				f.write(zipdata.getvalue())
			finally:
				# Close the archive even when the write fails.
				f.close()
		else:
			self.output = zipdata

def unirepl(match):
	"""Return the character for a matched numeric character reference.

	``match`` is a regex match whose full text looks like ``&#NNN;``
	(decimal) or ``&#xHHH;`` / ``&#XHHH;`` (hexadecimal).  If the number
	is not a code point the interpreter can represent, the matched text
	is returned unchanged.
	"""
	s = match.group()
	# Strip the leading '&#' and the trailing ';'.
	digits = s[2:-1]
	if digits[:1].lower() == 'x':
		radix = 16
		# Only the hexadecimal form carries a marker character to skip.
		digits = digits[1:]
	else:
		radix = 10

	try:
		toChar = unichr  # Python 2
	except NameError:
		toChar = chr  # Python 3 has no unichr

	try:
		return toChar(int(digits, radix))
	except (ValueError, OverflowError):
		# Not a number after all, or outside the supported range:
		# leave the reference as it was.
		return s

# Decimal (&#39;) and hexadecimal (&#x27; / &#XA0;) numeric character
# references.  Compiled once at import time rather than on every call.
_NUMERIC_ENTITY_RE = re.compile(r'&#(?:[xX][0-9a-fA-F]+|[0-9]+);')

def replaceNumberEntities(data):
	"""Return ``data`` with numeric character references decoded.

	Each ``&#NNN;`` or ``&#xHHH;`` reference is replaced by whatever
	unirepl() returns for it.  Hexadecimal references may contain the
	digits a-f in either case.
	"""
	return _NUMERIC_ENTITY_RE.sub(unirepl, data)

def removeEntities(text):
	"""Return ``text`` with HTML entities reduced to what XHTML allows.

	Numeric references are decoded, the named entities listed in
	``entities`` (see constants.py) are replaced, and every remaining
	``&`` is escaped so that only ``&amp;``, ``&lt;`` and ``&gt;`` are
	left as entities.
	"""
	try:
		t = text.decode('utf-8')
	except UnicodeEncodeError:
		# On Python 2 this is raised for a unicode object holding
		# non-ASCII characters; turn those into numeric references,
		# which are decoded again further down.
		try:
			t = text.encode ('ascii', 'xmlcharrefreplace')
		except UnicodeEncodeError:
			t = text
	text = t

	# replace numeric versions of [&<>] with named versions.  Both the
	# decimal and the hexadecimal spelling are handled, so none of them
	# is turned into a raw markup character or escaped twice later on.
	text = re.sub(r'&#(?:0*38|[xX]0*26);','&amp;',text)
	text = re.sub(r'&#(?:0*60|[xX]0*3[cC]);','&lt;',text)
	text = re.sub(r'&#(?:0*62|[xX]0*3[eE]);','&gt;',text)

	# replace remaining &#000; entities with unicode value, such as &#039; -> '
	text = replaceNumberEntities(text)

	# replace several named entities with character, such as &mdash; -> -
	# see constants.py for the list.
	# reverse sort will put entities with ; before the same one without, when valid.
	for e in reversed(sorted(entities.keys())):
		v = entities[e]
		try:
			text = text.replace(e, v)
		except UnicodeDecodeError:
			# for the pound symbol in constants.py
			text = text.replace(e, v.decode('utf-8'))

	# &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
	text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

	return text

def makeAcceptableFilename(text):
	"""Return ``text`` reduced to characters that are safe in a file name.

	Entities are normalised with removeEntities(), spaces and colons become
	underscores, and every other character that is not an ASCII letter,
	digit, underscore, apostrophe or hyphen is dropped.
	"""
	cleaned = removeEntities(text)
	underscored = cleaned.replace(" ", "_")
	underscored = underscored.replace(":","_")
	return re.sub('[^a-zA-Z0-9_\'-]+', '', underscored)