FanFicFare/output.py
wsuetholz ed4ff4b6ab Trying to fix outputting some utf-8 text. Added the utf-8 header to all the source code.
Ended up modifying the removeEntities function to do a weird decode/encode step on the text passed in. This seems to at least stop things from crashing.
2010-11-10 13:47:13 -06:00

386 lines
12 KiB
Python

# -*- coding: utf-8 -*-
import os
import re
import sys
import cgi
import uuid
import unicodedata
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs
import zipdir
import html_constants
from constants import *
import html2text
import datetime
class FanficWriter:
    """Common interface of the story writers defined in this module.

    Every method here is a no-op; subclasses override them to produce
    real output.
    """

    def __init__(self):
        """Do nothing; subclasses set up their own state."""

    def writeChapter(self, index, title, text):
        """Accept one chapter (its index, title and text); no-op here."""

    def finalise(self):
        """Finish the output; no-op here."""
class TextWriter(FanficWriter):
    """Produces plain text by rendering the story as HTML and converting it.

    All chapter handling is delegated to an HTMLWriter; ``finalise`` runs
    that writer's output through ``html2text``.
    """

    htmlWriter = None

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Create the delegate HTML writer.

        ``inmemory`` and ``compress`` are accepted but not consulted: the
        delegate is always created with inmemory=True and compress=False.
        """
        self.htmlWriter = HTMLWriter(base, adapter, True, False)

    def writeChapter(self, index, title, text):
        """Forward the chapter to the delegate HTML writer."""
        delegate = self.htmlWriter
        delegate.writeChapter(index, title, text)

    def finalise(self):
        """Finish the HTML, convert it to text and expose it as ``self.output``.

        Also copies the delegate's ``name`` onto this writer.
        """
        delegate = self.htmlWriter
        delegate.finalise()
        self.output = StringIO.StringIO()
        # html2text is fed unicode; the result is stored as UTF-8 bytes.
        markup = delegate.output.getvalue().decode('utf-8')
        plainText = html2text.html2text(markup).encode('utf-8')
        self.output.write(plainText)
        self.name = delegate.name
class HTMLWriter(FanficWriter):
    """Writes a whole story as one XHTML document.

    Chapters are accumulated in ``self.body``; ``finalise`` substitutes the
    title, author and body into the XHTML template, passes the result
    through BeautifulSoup and writes it to ``self.output`` as UTF-8.
    """

    # Class-level default; writeChapter rebinds it on the instance.
    body = ''

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Prepare the writer and open its output.

        base:     directory the ``<name>.html`` file is written to.
        adapter:  story adapter providing getStoryName(), getOutputName()
                  and getAuthorName().
        inmemory: when true, output goes to a StringIO buffer instead of a
                  file.
        compress: accepted for interface compatibility; not used here.
        """
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = removeEntities(adapter.getAuthorName())
        self.adapter = adapter
        self.inmemory = inmemory

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

        # The file is opened last so nothing above can fail while a handle
        # is already open.
        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            self.output = open(self.fileName, 'w')

    def _printableVersion(self, text):
        """Return ``text`` decoded from UTF-8, or unchanged if that fails.

        Only decoding problems (including the implicit ASCII encode Python 2
        performs on unicode input) and objects without ``decode`` are
        tolerated; anything else propagates.
        """
        try:
            return text.decode('utf-8')
        except (UnicodeError, AttributeError):
            return text

    def writeChapter(self, index, title, text):
        """Append a chapter heading and the chapter text to the body.

        ``index`` is not used by this writer.
        """
        title = self._printableVersion(title)
        text = self._printableVersion(text)
        heading = self.chapterStartTemplate.substitute({'chapter' : title})
        self.body = self.body + '\n' + heading
        self.body = self.body + '\n' + text

    def finalise(self):
        """Render the document and write it to ``self.output``.

        A file-backed output is closed afterwards, even when rendering or
        writing raises; an in-memory buffer is left open.
        """
        try:
            html = self.xhtmlTemplate.substitute({'title' : self.storyTitle, 'author' : self.authorName, 'body' : self.body})
            soup = bs.BeautifulSoup(html)
            self.output.write(soup.__str__('utf8'))
        finally:
            if not self.inmemory:
                self.output.close()
class EPubFanficWriter(FanficWriter):
    """Collects a story's chapters and packages them as an EPUB archive.

    Each member of the archive is accumulated in ``self.files`` (keyed by
    its path inside the archive). ``finalise`` adds the table of contents,
    title page and OPF manifest, zips everything with
    ``zipdir.inMemoryZip`` and either writes ``<base>/<name>.epub`` or
    exposes the zip buffer as ``self.output``.
    """

    # Class-level defaults; __init__ rebinds both on every instance, so
    # state is not shared between writers.
    chapters = []
    files = {}

    def __init__(self, base, adapter, inmemory=False, compress=True):
        """Set up the writer and emit the fixed EPUB members.

        base:     directory the finished ``.epub`` is written to.
        adapter:  story adapter supplying the story metadata.
        inmemory: when true, ``finalise`` leaves the zip in ``self.output``
                  instead of writing a file.
        compress: accepted for interface compatibility; not used here.
        """
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.directory = self.basePath + '/' + self.name
        self.authorName = removeEntities(adapter.getAuthorName())
        self.inmemory = inmemory
        self.adapter = adapter
        self.files = {}
        self.chapters = []

        # Members are always staged in memory. The caller's flag only
        # decides whether finalise() writes the zip to disk, so no staging
        # directory is created here.
        self.writeToFile = not self.inmemory
        if self.writeToFile:
            self.inmemory = True

        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def _writeFile(self, fileName, data):
        """Append ``data`` to archive member ``fileName``, creating it on first use.

        ``data`` may be a UTF-8 byte string or a unicode string; it is
        written as unicode.
        """
        if fileName not in self.files:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                # Only reachable if ``inmemory`` is reset after __init__.
                # The built-in open() on Python 2 takes no ``encoding``
                # argument, hence codecs.open.
                self.files[fileName] = codecs.open(self.directory + '/' + fileName, mode='w', encoding='utf-8')
        try:
            decoded = data.decode('utf-8')
        except UnicodeEncodeError:
            # Already unicode with non-ASCII characters; use it as is.
            decoded = data
        except UnicodeDecodeError:
            # Bytes that are not valid UTF-8: keep what can be decoded
            # rather than aborting the whole book.
            decoded = data.decode('utf-8', 'replace')
        self.files[fileName].write(decoded)

    def _closeFiles(self):
        """Close the member files when they are real files on disk."""
        if not self.inmemory:
            for f in self.files:
                self.files[f].close()

    def writeChapter(self, index, title, text):
        """Clean up one chapter and store it as ``OEBPS/chapterNNNN.xhtml``.

        index: chapter number used in the member file name.
        title: chapter title; entities are normalised before use.
        text:  chapter markup; entities are normalised, attributes not in
               ``acceptable_attributes`` are dropped, <u>/<center> become
               classed <span>/<div>, and whitespace-only tags are removed.
        """
        title = removeEntities(title)
        logging.debug("Writing chapter: %s" % title)
        fileName = "chapter%04d.xhtml" % index
        fn = 'OEBPS/' + fileName

        text = removeEntities(text)

        # BeautifulStoneSoup doesn't have any selfClosingTags by default.
        # hr & br needs to be if they're going to work.
        # Some stories do use multiple br tags as their section breaks...
        self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br','hr'))

        for t in self.soup.findAll(recursive=True):
            # Iterate over a copy of the names: attributes are deleted
            # inside the loop.
            for attr in list(t._getAttrMap().keys()):
                if attr not in acceptable_attributes:
                    del t[attr]
            # These are not acceptable strict XHTML. But we do already have
            # CSS classes of the same names defined in constants.py.
            # Compared with ==: ``in ('u')`` was a substring test against
            # the string 'u', not a tuple membership test.
            if t.name == 'u':
                t['class'] = t.name
                t.name = 'span'
            if t.name == 'center':
                t['class'] = t.name
                t.name = 'div'
            # Removes paired, but empty tags.
            if t.string is not None and len(t.string.strip()) == 0:
                t.extract()

        text = self.soup.__str__('utf8')

        # ffnet(& maybe others) gives the whole chapter text
        # as one line. This causes problems for nook(at
        # least) when the chapter size starts getting big
        # (200k+) Using Soup's prettify() messes up italics
        # and such. Done after soup extract so <p> and <br>
        # tags are normalized. Doing it here seems less evil
        # than hacking BeautifulSoup, but it's debatable.
        text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

        self._writeFile(fn, XHTML_START % (title, title))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)
        self.chapters.append((title, fileName))

    def _storyDescription(self):
        """Return the adapter's story description, cleaned for embedding.

        Yields a single space when the adapter has no description. Text is
        taken as is, without ``str()``, so a unicode description containing
        non-ASCII characters does not raise; other objects are converted
        with ``str()``.
        """
        description = self.adapter.getStoryDescription()
        if hasattr(description, "text"):
            description = description.text
        if description is None:
            return ' '
        # (str, unicode) on Python 2.
        if not isinstance(description, (str, type(u''))):
            description = str(description)
        if len(description) == 0:
            return ' '
        # Drop backslash-escaped quotes.
        description = description.replace ('\\\'', '').replace('\\\"', '')
        return removeEntities(description.replace('&nbsp;',' ').replace('&rsquo;',''))

    def finalise(self):
        """Write the TOC, title page and OPF manifest, then build the EPUB.

        When the writer was created with ``inmemory`` false the archive is
        written to ``self.directory + '.epub'``; otherwise the zip buffer is
        stored in ``self.output``.
        """
        logging.debug("Finalising...")
        adapter = self.adapter

        ### writing table of contents -- ncx file
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(tocFilePath, TOC_START % (adapter.getUUID(), self.storyTitle))

        published = adapter.getStoryPublished().strftime("%Y-%m-%d")
        # Each timestamp is fetched once so its two renderings always agree.
        createdAt = adapter.getStoryCreated()
        createda = createdAt.strftime("%Y-%m-%d %H:%M:%S")
        created = createdAt.strftime("%Y-%m-%d")
        updatedAt = adapter.getStoryUpdated()
        updated = updatedAt.strftime("%Y-%m-%d")
        calibre = updatedAt.strftime("%Y-%m-%dT%H:%M:%S")

        description = self._storyDescription()

        ### writing content -- title page
        titleFilePath = "OEBPS/title_page.xhtml"

        def writeEntry(label, value):
            self._writeFile(titleFilePath, TITLE_ENTRY % (label, value))

        self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, adapter.getStoryURL(), self.storyTitle, adapter.getAuthorURL(), self.authorName))
        writeEntry('Category:', adapter.getCategory())
        writeEntry('Genre:', adapter.getGenre())
        writeEntry('Status:', adapter.getStoryStatus())
        writeEntry('Published:', published)
        writeEntry('Updated:', updated)
        writeEntry('Packaged:', createda)
        writeEntry('Rating Age/User:', adapter.getStoryRating() + " / " + adapter.getStoryUserRating())
        writeEntry('Chapters/Words:', str(adapter.getNumChapters()) + " / " + str(adapter.getNumWords()))
        writeEntry('Publisher:', adapter.getHost())
        writeEntry('Story ID:', adapter.getStoryId())
        writeEntry('Author ID:', adapter.getAuthorId())
        self._writeFile(titleFilePath, TITLE_FOOTER % description )

        ### writing content -- opf file
        opfFilePath = "OEBPS/content.opf"
        self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, adapter.getLanguageId(), published, created, updated, calibre, description))

        # Fall back to a generic subject when the adapter supplies none.
        wroteSubject = False
        for subj in adapter.getSubjects() or ():
            self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
            wroteSubject = True
        if not wroteSubject:
            self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")

        self._writeFile(opfFilePath, CONTENT_END_METADATA % (adapter.getPublisher(), adapter.getUUID(), adapter.getStoryURL(), adapter.getStoryURL(), adapter.getStoryUserRating()))

        # (id, title, file) for every entry, title page first. Chapter ids
        # are numbered from 1 in the order the chapters were written.
        # NOTE(review): the id "Title Page" contains a space; presumably it
        # ends up in an XML id attribute of the templates -- confirm that
        # the target readers accept it.
        items = [("Title Page", "Title Page", "title_page.xhtml")]
        for chapterTitle, chapterFile in self.chapters:
            items.append(("chapter%04d" % len(items), chapterTitle, chapterFile))

        for order, (chapterId, t, f) in enumerate(items):
            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, order, t, f))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        for chapterId, t, f in items:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'
        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            f = open(filename, 'wb')
            try:
                f.write(zipdata.getvalue())
            finally:
                # Close the handle even if the write fails.
                f.close()
        else:
            self.output = zipdata
# Decimal (&#8212;) and hexadecimal (&#x2014; / &#X2014;) character references.
_NUMERIC_ENTITY = re.compile(r'&#(?:[xX][0-9a-fA-F]+|[0-9]+);')


def unirepl(match):
    """Return the character named by one numeric character reference.

    ``match`` is a regex match whose whole text is a reference such as
    ``&#8212;`` (decimal) or ``&#x2014;`` (hexadecimal). When the digits do
    not form a usable code point the reference text is returned unchanged.
    """
    reference = match.group()
    digits = reference[2:-1]            # strip the leading '&#' and trailing ';'
    if digits[:1].lower() == 'x':
        radix = 16
        digits = digits[1:]             # only the hex form carries a prefix to skip
    else:
        radix = 10
    try:
        toChar = unichr                 # Python 2
    except NameError:
        toChar = chr                    # interpreters without unichr
    try:
        return toChar(int(digits, radix))
    except (ValueError, OverflowError):
        # Malformed digits or a code point this interpreter cannot represent.
        return reference


def replaceNumberEntities(data):
    """Replace every numeric character reference in ``data`` with its character."""
    return _NUMERIC_ENTITY.sub(unirepl, data)
def removeEntities(text):
    """Normalise the HTML entities in ``text`` for use inside XHTML.

    Numeric character references are decoded, the named entities listed in
    the module-level ``entities`` table are replaced by their mapped text,
    and finally every ``&`` is escaped, with ``&lt;`` and ``&gt;`` restored.

    ``text`` may be a UTF-8 byte string or a unicode string.
    """
    try:
        text = text.decode('utf-8')
    except UnicodeEncodeError:
        # Python 2 implicitly ASCII-encodes a unicode object before decoding
        # it, which fails on non-ASCII characters. Turn those characters into
        # numeric references; replaceNumberEntities() below converts them
        # back.
        text = text.encode('ascii', 'xmlcharrefreplace')
    except UnicodeDecodeError:
        # Byte string that is not valid UTF-8: keep everything that can be
        # decoded instead of raising.
        text = text.decode('utf-8', 'replace')

    # Replace numeric versions of [&<>] (decimal and hex) with named
    # versions so they are not decoded into raw markup characters below.
    text = re.sub(r'&#(?:0*38|[xX]0*26);', '&amp;', text)
    text = re.sub(r'&#(?:0*60|[xX]0*3[cC]);', '&lt;', text)
    text = re.sub(r'&#(?:0*62|[xX]0*3[eE]);', '&gt;', text)

    # Replace remaining &#000; entities with unicode value, such as &#039; -> '
    text = replaceNumberEntities(text)

    # Replace several named entities with character, such as &mdash; -> -
    # See constants.py for the list.
    # Reverse sort will put entities with ; before the same one without,
    # when valid.
    for name in sorted(entities.keys(), reverse=True):
        replacement = entities[name]
        try:
            text = text.replace(name, replacement)
        except UnicodeDecodeError:
            # For the pound symbol in constants.py.
            text = text.replace(name, replacement.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put
    # those back.
    # NOTE(review): an '&amp;' still present here becomes '&amp;amp;';
    # presumably the entities table maps '&amp;' to '&' first -- confirm.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
def makeAcceptableFilename(text):
    """Turn ``text`` into a string that is safe to use as a file name.

    Entities are normalised with removeEntities(), spaces and colons become
    underscores, and every remaining character other than ASCII letters,
    digits, ``_``, ``'`` and ``-`` is dropped.
    """
    candidate = removeEntities(text)
    for separator in (" ", ":"):
        candidate = candidate.replace(separator, "_")
    return re.sub('[^a-zA-Z0-9_\'-]+', '', candidate)