FanFicFare/output.py

# -*- coding: utf-8 -*-

import os
import re
import sys
import cgi
import uuid
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

import zipdir
import html_constants
from constants import *


import html2text
import datetime


class FanficWriter:
	def __init__(self):
		pass

	def writeChapter(self, index, title, text):
		pass

	def finalise(self):
		pass

class TextWriter(FanficWriter):
	htmlWriter = None

	def __init__(self, base, adapter, inmemory=False, compress=False):
		self.htmlWriter = HTMLWriter(base, adapter, True, False)

	def writeChapter(self, index, title, text):
		self.htmlWriter.writeChapter(index, title, text)

	def finalise(self):
		self.htmlWriter.finalise()
		self.output = StringIO.StringIO()
		self.output.write(html2text.html2text(self.htmlWriter.output.getvalue().decode('utf-8')).encode('utf-8'))
		self.name = self.htmlWriter.name


class HTMLWriter(FanficWriter):
	body = ''

	def __init__(self, base, adapter, inmemory=False, compress=False):
		self.basePath = base
		self.storyTitle = removeEntities(adapter.getStoryName())
		self.name = makeAcceptableFilename(adapter.getOutputName())
		self.fileName = self.basePath + '/' + self.name + '.html'
		self.authorName = removeEntities(adapter.getAuthorName())
		self.adapter = adapter

		self.inmemory = inmemory

		if not self.inmemory and os.path.exists(self.fileName):
			os.remove(self.fileName)

		if self.inmemory:
			self.output = StringIO.StringIO()
		else:
			self.output = open(self.fileName, 'w')

		self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
		self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

	def _printableVersion(self, text):
		try:
			d = text.decode('utf-8')
			return d
		except:
			return text

	def writeChapter(self, index, title, text):
		title = self._printableVersion(title) #title.decode('utf-8')
		text = self._printableVersion(text) #text.decode('utf-8')
		self.body = self.body + '\n' + self.chapterStartTemplate.substitute({'chapter' : title})
		self.body = self.body + '\n' + text

	def finalise(self):
		html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
		soup = bs.BeautifulSoup(html)
		result = soup.__str__('utf8')

#		f = open(self.fileName, 'w')
#		f.write(result)
#		f.close()

		self.output.write(result)
		if not self.inmemory:
			self.output.close()

class EPubFanficWriter(FanficWriter):
	chapters = []

	files = {}

	def _writeFile(self, fileName, data):
		#logging.debug('_writeFile(`%s`, data)' % fileName)
		if fileName in self.files:
			try:
				d = data.decode('utf-8')
			except UnicodeEncodeError, e:
				d = data

			self.files[fileName].write(d)
		else:
			if self.inmemory:
				self.files[fileName] = StringIO.StringIO()
			else:
				self.files[fileName] = open(self.directory + '/' + fileName, 'w')

			self._writeFile(fileName, data)


	def _closeFiles(self):
		if not self.inmemory:
			for f in self.files:
				self.files[f].close()

	def __init__(self, base, adapter, inmemory=False, compress=True):
		self.basePath = base
		self.storyTitle = removeEntities(adapter.getStoryName())
		self.name = makeAcceptableFilename(adapter.getOutputName())
		self.directory = self.basePath + '/' + self.name
		self.authorName = removeEntities(adapter.getAuthorName())
		self.inmemory = inmemory
		self.adapter = adapter

		self.files = {}
		self.chapters = []

		if not self.inmemory:
			self.inmemory = True
			self.writeToFile = True
		else:
			self.writeToFile = False


		if not self.inmemory:
			if os.path.exists(self.directory):
				shutil.rmtree(self.directory)

			os.mkdir(self.directory)

			os.mkdir(self.directory + '/META-INF')
			os.mkdir(self.directory + '/OEBPS')

		self._writeFile('mimetype', MIMETYPE)
		self._writeFile('META-INF/container.xml', CONTAINER)
		self._writeFile('OEBPS/stylesheet.css', CSS)

	def writeChapter(self, index, title, text):
		title = removeEntities(title)
		logging.debug("Writing chapter: %s" % title)
		fileName="chapter%04d.xhtml" % index

		filePath = self.directory + "/OEBPS/" + fileName

		fn = 'OEBPS/' + fileName

#		f = open(filePath, 'w')

		text = removeEntities(text)

		# BeautifulStoneSoup doesn't have any selfClosingTags by default.
		# hr & br needs to be if they're going to work.
		# Some stories do use multiple br tags as their section breaks...
		self.soup = bs.BeautifulStoneSoup(text.decode('utf-8'), selfClosingTags=('br','hr'))

		allTags = self.soup.findAll(recursive=True)
		for t in allTags:
			for attr in t._getAttrMap().keys():
				if attr not in acceptable_attributes:
					del t[attr]
			# these are not acceptable strict XHTML.  But we do already have
			# CSS classes of the same names defined in constants.py
			if t.name in ('u'):
				t['class']=t.name
				t.name='span'
			if t.name in ('center'):
				t['class']=t.name
				t.name='div'
			# removes paired, but empty tags.
			if t.string != None and len(t.string.strip()) == 0 :
				t.extract()

		text = self.soup.__str__('utf8')

		# ffnet(& maybe others) gives the whole chapter text
		# as one line.  This causes problems for nook(at
		# least) when the chapter size starts getting big
		# (200k+) Using Soup's prettify() messes up italics
		# and such.  Done after soup extract so <p> and <br>
		# tags are normalized.  Doing it here seems less evil
		# than hacking BeautifulSoup, but it's debatable.
		text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

		self._writeFile(fn, XHTML_START % (title, title))
		self._writeFile(fn, text)
		self._writeFile(fn, XHTML_END)
#		print >> f, XHTML_START % (title, title)
#		f.write(text)
#		print >> f, XHTML_END

		self.chapters.append((title, fileName))

	def finalise(self):
		logging.debug("Finalising...")
		### writing table of contents -- ncx file

		tocFilePath = "OEBPS/toc.ncx"
#		toc = open(tocFilePath, 'w')
#		print >> toc, TOC_START % self.storyTitle
		self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))

		published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
		createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
		created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
		updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
		calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")

		description = self.adapter.getStoryDescription()
		if hasattr(description, "text"):
			description = str(description.text)
		else:
			description = str(description)
		if description is not None and len(description) > 0:
			description =  removeEntities(description.replace('&nbsp;',' ').replace('&rsquo;',''))
		### writing content -- title page
		titleFilePath = "OEBPS/title_page.xhtml"
		self._writeFile(titleFilePath, TITLE_PAGE % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName, self.adapter.getCategory(), self.adapter.getGenre(), self.adapter.getStoryStatus(), published, updated, createda, self.adapter.getStoryRating(), self.adapter.getStoryUserRating(), self.adapter.getNumChapters(), self.adapter.getNumWords(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), description))

		### writing content -- opf file
		opfFilePath = "OEBPS/content.opf"

#		opf = open(opfFilePath, 'w')
		self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))

		i = 0
		subjs = []
		subjs = self.adapter.getSubjects()
		for subj in subjs:
			self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
			i = i + 1
		if (i <= 0):
			self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")

		self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))
#		print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)

		ids = []

		i = 0

		t = "Title Page"
		f = "title_page.xhtml"
		chapterId = "Title Page"
		self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
		self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

		ids.append(chapterId)

		i = i + 1

		for t,f in self.chapters:
			chapterId = "chapter%04d" % i

			self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
			self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

			ids.append(chapterId)

			i = i + 1

#		logging.d('Toc and refs printed, proceesing to ref-ids....')

		self._writeFile(tocFilePath, TOC_END)
		self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

		for chapterId in ids:
			self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

		self._writeFile(opfFilePath, CONTENT_END)

		self._closeFiles()

		filename = self.directory + '.epub'

		zipdata = zipdir.inMemoryZip(self.files)

		if self.writeToFile:
			f = open(filename, 'wb')
			f.write(zipdata.getvalue())
			f.close()
		else:
			self.output = zipdata

#		zipdir.toZip(filename, self.directory)

def unirepl(match):
	"Return the unicode string for a decimal number"
	s = match.group()
	if s[2].lower()=='x':
		radix=16
	else:
		radix=10
	value = int(s[3:-1], radix )
	return unichr(value)

def replaceNumberEntities(data):
	p = re.compile(r'&#(x?)(\d+);')
	return p.sub(unirepl, data)

def removeEntities(text):
	# replace numeric versions of [&<>] with named versions.
	text = re.sub(r'&#0*38;','&amp;',text)
	text = re.sub(r'&#0*60;','&lt;',text)
	text = re.sub(r'&#0*62;','&gt;',text)

	# replace remaining &#000; entities with unicode value, such as &#039; -> '
	text = replaceNumberEntities(text)

	# replace several named entities with character, such as &mdash; -> -
	# see constants.py for the list.
	# reverse sort will put entities with ; before the same one without, when valid.
	for e in reversed(sorted(entities.keys())):
		v = entities[e]
		try:
			text = text.replace(e, v)
		except UnicodeDecodeError, ex:
			# for the pound symbol in constants.py
			text = text.replace(e, v.decode('utf-8'))

	# &lt; &lt; and &amp; are the only html entities allowed in xhtml, put those back.
	text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

	return text

def makeAcceptableFilename(text):
	return re.sub('[^a-zA-Z0-9_\'-]+','',removeEntities(text).replace(" ", "_").replace(":","_"))