# -*- coding: utf-8 -*-
"""Output writers (plain text, HTML, EPUB) for downloaded stories.

This module targets Python 2 (it relies on StringIO, unichr and
BeautifulSoup 3).
"""

import os
import re
import sys
import cgi
import uuid
import unicodedata
import codecs
import shutil
import string
import os.path
import zipfile
import StringIO
import logging
import hashlib
import urllib as u
import pprint as pp
import urllib2 as u2
import urlparse as up
import BeautifulSoup as bs
import htmlentitydefs as hdefs

import zipdir
import html_constants
from constants import *

import html2text
import datetime


class FanficWriter:
    """Base writer interface; every method is a no-op here."""

    def __init__(self):
        pass

    def writeChapter(self, index, title, text):
        pass

    def finalise(self):
        pass


class TextWriter(FanficWriter):
    """Plain-text writer that renders the story through an HTMLWriter."""

    htmlWriter = None

    def __init__(self, base, adapter, inmemory=False, compress=False):
        # inmemory/compress are accepted but not used here: the HTML is
        # always built in memory and converted in finalise().
        self.htmlWriter = HTMLWriter(base, adapter, True, False)

    def writeChapter(self, index, title, text):
        """Forward the chapter to the wrapped HTML writer."""
        self.htmlWriter.writeChapter(index, title, text)

    def finalise(self):
        """Convert the finished HTML to text and expose it as self.output."""
        self.htmlWriter.finalise()
        self.output = StringIO.StringIO()
        html = self.htmlWriter.output.getvalue().decode('utf-8')
        self.output.write(html2text.html2text(html).encode('utf-8'))
        self.name = self.htmlWriter.name


class HTMLWriter(FanficWriter):
    """Writer that accumulates chapters and emits one HTML document."""

    body = ''

    def __init__(self, base, adapter, inmemory=False, compress=False):
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = removeEntities(adapter.getAuthorName())
        self.adapter = adapter
        self.inmemory = inmemory

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            # Start from a clean file when writing to disk.
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            self.output = open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(
            html_constants.XHTML_CHAPTER_START)

    def _printableVersion(self, text):
        """Return text decoded from UTF-8, or text itself if that fails."""
        try:
            return text.decode('utf-8')
        except (UnicodeError, AttributeError):
            # Already unicode, not valid UTF-8, or not a string at all:
            # keep the best-effort behaviour and hand the value back as is.
            return text

    def writeChapter(self, index, title, text):
        """Append a chapter heading and its text to the document body."""
        title = self._printableVersion(title)
        text = self._printableVersion(text)
        heading = self.chapterStartTemplate.substitute({'chapter': title})
        self.body = self.body + '\n' + heading
        self.body = self.body + '\n' + text

    def finalise(self):
        """Render the full document and write it to self.output."""
        html = self.xhtmlTemplate.substitute({
            'title': self.storyTitle,
            'author': self.authorName,
            'body': self.body,
        })
        soup = bs.BeautifulSoup(html)
        result = soup.__str__('utf8')

        self.output.write(result)
        if not self.inmemory:
            self.output.close()


class EPubFanficWriter(FanficWriter):
    """Writer that assembles the files of an EPUB and zips them."""

    # Class-level defaults; __init__ rebinds both per instance.
    chapters = []
    files = {}

    def _writeFile(self, fileName, data):
        """Append data to the named archive member, creating it on first use."""
        if fileName not in self.files:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                # The builtin open() has no 'encoding' argument on Python 2;
                # codecs.open provides the intended encoded text file.
                self.files[fileName] = codecs.open(
                    self.directory + '/' + fileName,
                    mode='w', encoding='utf-8')

        try:
            d = data.decode('utf-8')
        except UnicodeEncodeError:
            # data is already unicode with non-ASCII characters.
            d = data
        self.files[fileName].write(d)

    def _closeFiles(self):
        """Close the on-disk members; in-memory buffers are left open."""
        if not self.inmemory:
            for handle in self.files.values():
                handle.close()

    def __init__(self, base, adapter, inmemory=False, compress=True):
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.directory = self.basePath + '/' + self.name
        self.authorName = removeEntities(adapter.getAuthorName())
        self.inmemory = inmemory
        self.adapter = adapter

        self.files = {}
        self.chapters = []

        # Members are always collected in memory; writeToFile records whether
        # finalise() should also save the zipped result to disk.
        if self.inmemory:
            self.writeToFile = False
        else:
            self.inmemory = True
            self.writeToFile = True

        # NOTE(review): unreachable at present because inmemory has just been
        # forced on above; kept so the on-disk layout logic is not lost.
        if not self.inmemory:
            if os.path.exists(self.directory):
                shutil.rmtree(self.directory)

            os.mkdir(self.directory)
            os.mkdir(self.directory + '/META-INF')
            os.mkdir(self.directory + '/OEBPS')

        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def writeChapter(self, index, title, text):
        """Clean one chapter's markup and store it as OEBPS/chapterNNNN.xhtml."""
        title = removeEntities(title)
        logging.debug("Writing chapter: %s", title)

        fileName = "chapter%04d.xhtml" % index
        fn = 'OEBPS/' + fileName

        text = removeEntities(text)

        # BeautifulStoneSoup doesn't have any selfClosingTags by default.
        # hr & br need to be if they're going to work.
        # Some stories do use multiple br tags as their section breaks...
        self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br', 'hr'))

        for t in self.soup.findAll(recursive=True):
            # Iterate over a copy of the names because entries are deleted.
            for attr in list(t._getAttrMap().keys()):
                if attr not in acceptable_attributes:
                    del t[attr]

            # These are not acceptable strict XHTML.  But we do already have
            # CSS classes of the same names defined in constants.py.
            # (Compared with ==; the previous "in ('u')" was a substring test
            # against a plain string, not tuple membership.)
            if t.name == 'u':
                t['class'] = t.name
                t.name = 'span'
            if t.name == 'center':
                t['class'] = t.name
                t.name = 'div'

            # Removes paired, but empty tags.
            if t.string is not None and len(t.string.strip()) == 0:
                t.extract()

        text = self.soup.__str__('utf8')

        # ffnet (& maybe others) gives the whole chapter text as one line.
        # This causes problems for nook (at least) when the chapter size
        # starts getting big (200k+).  Using Soup's prettify() messes up
        # italics and such.  Done after soup extract so <p> and <br /> tags
        # are normalized.  Doing it here seems less evil than hacking
        # BeautifulSoup, but it's debatable.
        # NOTE(review): the tag literals below were unreadable in the reviewed
        # copy (the markup inside the quotes had been rendered away); they are
        # restored as '</p>' and '<br />' to match the comment above -- confirm.
        text = text.replace('</p>', '</p>\n').replace('<br />', '<br />\n')

        self._writeFile(fn, XHTML_START % (title, title))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)

        self.chapters.append((title, fileName))

    def finalise(self):
        """Write the TOC, title page and OPF, then zip all members.

        The zipped data is written to '<directory>.epub' when writeToFile is
        set; otherwise it is exposed as self.output.
        """
        logging.debug("Finalising...")

        ### writing table of contents -- ncx file
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(
            tocFilePath,
            TOC_START % (self.adapter.getUUID(), self.storyTitle))

        published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
        createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
        created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
        updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
        calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")

        description = self.adapter.getStoryDescription()
        if hasattr(description, "text"):
            description = description.text
        # Check for None before str(): str(None) would otherwise yield the
        # literal text 'None' as the story description.
        description = '' if description is None else str(description)
        if len(description) > 0:
            description = description.replace('\\\'', '').replace('\\\"', '')
            # NOTE(review): in the reviewed copy these two replace() targets
            # appeared as already-decoded characters (making the first a
            # no-op); restored here as the named entities -- confirm.
            description = removeEntities(
                description.replace('&nbsp;', ' ').replace('&rsquo;', ''))
        else:
            description = ' '

        ### writing content -- title page
        titleFilePath = "OEBPS/title_page.xhtml"
        self._writeFile(
            titleFilePath,
            TITLE_HEADER % (self.authorName, self.storyTitle,
                            self.adapter.getStoryURL(), self.storyTitle,
                            self.adapter.getAuthorURL(), self.authorName))

        rating = (self.adapter.getStoryRating() + " / " +
                  self.adapter.getStoryUserRating())
        counts = (str(self.adapter.getNumChapters()) + " / " +
                  str(self.adapter.getNumWords()))
        titleEntries = [
            ('Category:', self.adapter.getCategory()),
            ('Genre:', self.adapter.getGenre()),
            ('Status:', self.adapter.getStoryStatus()),
            ('Published:', published),
            ('Updated:', updated),
            ('Packaged:', createda),
            ('Rating Age/User:', rating),
            ('Chapters/Words:', counts),
            ('Publisher:', self.adapter.getHost()),
            ('Story ID:', self.adapter.getStoryId()),
            ('Author ID:', self.adapter.getAuthorId()),
        ]
        for entry in titleEntries:
            self._writeFile(titleFilePath, TITLE_ENTRY % entry)

        self._writeFile(titleFilePath, TITLE_FOOTER % description)

        ### writing content -- opf file
        opfFilePath = "OEBPS/content.opf"
        self._writeFile(
            opfFilePath,
            CONTENT_START % (uuid.uuid4().urn, self.storyTitle,
                             self.authorName, self.adapter.getLanguageId(),
                             published, created, updated, calibre,
                             description))

        subjectCount = 0
        for subj in self.adapter.getSubjects():
            self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
            subjectCount += 1
        if subjectCount == 0:
            # Fall back to a generic subject when the adapter supplies none.
            self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")

        self._writeFile(
            opfFilePath,
            CONTENT_END_METADATA % (self.adapter.getPublisher(),
                                    self.adapter.getUUID(),
                                    self.adapter.getStoryURL(),
                                    self.adapter.getStoryURL(),
                                    self.adapter.getStoryUserRating()))

        # The title page is entry 0; chapters follow, numbered from 1.
        ids = []
        chapterId = "Title Page"
        self._writeFile(
            tocFilePath,
            TOC_ITEM % (chapterId, 0, "Title Page", "title_page.xhtml"))
        self._writeFile(
            opfFilePath, CONTENT_ITEM % (chapterId, "title_page.xhtml"))
        ids.append(chapterId)

        for i, (t, f) in enumerate(self.chapters, 1):
            chapterId = "chapter%04d" % i
            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))
            ids.append(chapterId)

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        for chapterId in ids:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'

        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            with open(filename, 'wb') as f:
                f.write(zipdata.getvalue())
        else:
            self.output = zipdata


# Matches decimal (&#8217;) and hexadecimal (&#x2019;) character references.
_NUMERIC_ENTITY_RE = re.compile(r'&#(?:[xX][0-9a-fA-F]+|\d+);')


def unirepl(match):
    """Return the unicode character for a matched numeric entity.

    The whole match is expected to look like '&#NNN;' or '&#xHHH;'.  The
    matched text is returned unchanged when the number is not a code point
    unichr() accepts.
    """
    s = match.group()
    if s[2].lower() == 'x':
        digits, radix = s[3:-1], 16
    else:
        # Decimal digits start right after '&#'.  (Slicing from index 3 here,
        # as before, silently dropped the leading digit.)
        digits, radix = s[2:-1], 10
    try:
        return unichr(int(digits, radix))
    except (ValueError, OverflowError):
        return s


def replaceNumberEntities(data):
    """Replace every numeric character reference in data with its character."""
    return _NUMERIC_ENTITY_RE.sub(unirepl, data)


def removeEntities(text):
    """Resolve HTML entities in text, keeping only &amp;, &lt; and &gt;.

    Numeric references and the named entities listed in constants.entities
    are replaced by their characters; '&', '<' and '>' stay escaped so the
    result is still usable inside XHTML.
    """
    try:
        t = text.decode('utf-8')
    except UnicodeEncodeError:
        # text is unicode with non-ASCII characters: fall back to an ASCII
        # byte string with character references, which are resolved below.
        try:
            t = text.encode('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            t = text
    text = t

    # Replace numeric versions of [&<>] with named versions.
    # NOTE(review): these patterns and replacements had been entity-decoded
    # in the reviewed copy (e.g. the replacement for 38 read as a bare '&');
    # restored from the comment above -- confirm.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # Replace remaining numeric entities with their unicode value,
    # such as &#039; -> '
    text = replaceNumberEntities(text)

    # Replace several named entities with their character; see constants.py
    # for the list.  Reverse sort will put entities with ; before the same
    # one without, when valid.
    for e in sorted(entities, reverse=True):
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # For the pound symbol in constants.py.
            text = text.replace(e, v.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put
    # those back: escape every '&', then undo the double escaping of the
    # &lt;/&gt; that were already present.
    text = (text.replace('&', '&amp;')
                .replace('&amp;lt;', '&lt;')
                .replace('&amp;gt;', '&gt;'))

    return text


def makeAcceptableFilename(text):
    """Return text reduced to letters, digits, '_', apostrophe and '-'.

    Entities are resolved first; spaces and colons become underscores and
    every other character is dropped.
    """
    cleaned = removeEntities(text).replace(" ", "_").replace(":", "_")
    return re.sub('[^a-zA-Z0-9_\'-]+', '', cleaned)