mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2025-12-06 08:52:55 +01:00
310 lines
8.2 KiB
Python
310 lines
8.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import cgi
|
|
import uuid
|
|
import codecs
|
|
import shutil
|
|
import string
|
|
import os.path
|
|
import zipfile
|
|
import StringIO
|
|
import logging
|
|
import hashlib
|
|
import urllib as u
|
|
import pprint as pp
|
|
import urllib2 as u2
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
|
|
import zipdir
|
|
import html_constants
|
|
from constants import *
|
|
|
|
|
|
import html2text
|
|
|
|
|
|
class FanficWriter:
    """Interface shared by the story writers in this module.

    Every method here is a no-op; the concrete writers override them.
    """

    def __init__(self):
        """Do nothing; concrete writers define their own constructors."""
        return None

    def writeChapter(self, index, title, text):
        """Accept one chapter. No-op in the base class."""
        return None

    def finalise(self):
        """Finish the output. No-op in the base class."""
        return None
|
|
|
|
class TextWriter(FanficWriter):
    """Plain-text writer built on top of an in-memory HTMLWriter.

    Chapters are forwarded to the wrapped HTMLWriter; finalise() converts the
    finished HTML document to text with html2text and stores it, UTF-8
    encoded, in ``self.output``.
    """

    htmlWriter = None

    def __init__(self, base, name, author, inmemory=False, compress=False):
        # inmemory and compress are accepted but not consulted: the wrapped
        # writer is always created with inmemory=True and compress=False.
        self.htmlWriter = HTMLWriter(base, name, author, True, False)

    def writeChapter(self, index, title, text):
        """Hand the chapter straight to the wrapped HTMLWriter."""
        self.htmlWriter.writeChapter(index, title, text)

    def finalise(self):
        """Render the HTML, convert it to text and publish output and name."""
        self.htmlWriter.finalise()
        self.output = StringIO.StringIO()

        html = self.htmlWriter.output.getvalue().decode('utf-8')
        plain = html2text.html2text(html).encode('utf-8')
        self.output.write(plain)

        self.name = self.htmlWriter.name
|
|
|
|
|
|
class HTMLWriter(FanficWriter):
    """Collect chapters and emit them as one XHTML document.

    The document is written to a StringIO when ``inmemory`` is true and to
    ``<base>/<name>.html`` otherwise; either way the target is ``self.output``.
    """

    body = ''

    def __init__(self, base, name, author, inmemory=False, compress=False):
        """Prepare the output target and the page templates.

        base     -- directory that receives the .html file
        name     -- story title; its sanitised form is used as the file name
        author   -- author name
        inmemory -- write to a StringIO instead of a file on disk
        compress -- accepted but not used by this writer
        """
        self.basePath = base
        self.storyTitle = removeEntities(name)
        self.name = makeAcceptableFilename(name)
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = removeEntities(author)

        self.inmemory = inmemory

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            # Drop any file left over from an earlier run before reopening.
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            self.output = open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def _printableVersion(self, text):
        """Return text decoded from UTF-8 when possible, else text unchanged."""
        try:
            return text.decode('utf-8')
        except Exception:
            # Best effort: anything that cannot be decoded is passed through.
            # Catching Exception rather than everything lets
            # KeyboardInterrupt and SystemExit propagate.
            return text

    def writeChapter(self, index, title, text):
        """Append a chapter heading and the chapter text to the body.

        index is accepted but not used; chapters appear in call order.
        """
        title = self._printableVersion(title)
        text = self._printableVersion(text)
        heading = self.chapterStartTemplate.substitute({'chapter': title})
        self.body = self.body + '\n' + heading
        self.body = self.body + '\n' + text

    def finalise(self):
        """Render the whole document and write it to ``self.output``.

        A file on disk is closed afterwards, even if the write fails; an
        in-memory buffer is left open so the caller can read it.
        """
        html = self.xhtmlTemplate.substitute({
            'title': self.storyTitle,
            'author': self.authorName,
            'body': self.body,
        })
        soup = bs.BeautifulSoup(html)
        result = soup.__str__('utf8')

        try:
            self.output.write(result)
        finally:
            if not self.inmemory:
                self.output.close()
|
|
|
|
class EPubFanficWriter(FanficWriter):
    """Build an EPUB: one XHTML file per chapter plus toc.ncx and content.opf.

    Every part is collected in an in-memory buffer and zipped in finalise().
    The archive is written to ``<base>/<name>.epub`` when the writer was
    created with a false ``inmemory``; otherwise it is left in ``self.output``.
    """

    # Class-level defaults only: __init__ gives each instance its own
    # containers, so nothing is shared between writers.
    chapters = []

    files = {}

    def _writeFile(self, fileName, data):
        """Append data to the buffer for fileName, creating it on first use.

        Byte strings are decoded as UTF-8; unicode strings are written as is.
        """
        if fileName not in self.files:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                self.files[fileName] = open(self.directory + '/' + fileName, 'w')

        if not isinstance(data, unicode):
            data = data.decode('utf-8')

        self.files[fileName].write(data)

    def _closeFiles(self):
        """Close on-disk handles; in-memory buffers stay open for zipping."""
        if not self.inmemory:
            for handle in self.files.values():
                handle.close()

    def __init__(self, base, name, author, inmemory=False, compress=True):
        """Set up the book and write its fixed parts.

        base     -- directory that receives the .epub file
        name     -- story title; its sanitised form is used as the file name
        author   -- author name
        inmemory -- keep the finished archive in ``self.output`` instead of
                    writing it to disk
        compress -- accepted but not used by this writer
        """
        self.basePath = base
        self.storyTitle = removeEntities(name)
        self.name = makeAcceptableFilename(name)
        self.directory = self.basePath + '/' + self.name
        self.authorName = removeEntities(author)

        self.inmemory = inmemory

        self.files = {}
        self.chapters = []

        # The book is always assembled in memory. The caller's flag only
        # decides whether finalise() also writes the archive to disk, so no
        # working directory is ever created.
        if not self.inmemory:
            self.inmemory = True
            self.writeToFile = True
        else:
            self.writeToFile = False

        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def writeChapter(self, index, title, text):
        """Clean up one chapter and add it to the book as chapterNNNN.xhtml.

        index -- number used in the chapter's file name
        title -- chapter title
        text  -- chapter markup, as a UTF-8 byte string or a unicode string
        """
        title = removeEntities(title)
        logging.debug("Writing chapter: %s" % title)
        fileName = "chapter%04d.xhtml" % index
        fn = 'OEBPS/' + fileName

        text = removeEntities(text)
        # removeEntities may already have produced unicode; decoding that
        # again would fail on any non-ASCII character.
        if not isinstance(text, unicode):
            text = text.decode('utf-8')

        # BeautifulStoneSoup doesn't have any selfClosingTags by default.
        # hr & br needs to be if they're going to work.
        # Some stories do use multiple br tags as their section breaks...
        self.soup = bs.BeautifulStoneSoup(text, selfClosingTags=('br', 'hr'))

        for t in self.soup.findAll(recursive=True):
            for attr in list(t._getAttrMap().keys()):
                if attr not in acceptable_attributes:
                    del t[attr]
            # these are not acceptable strict XHTML. But we do already have
            # CSS classes of the same names defined in constants.py
            if t.name == 'u':
                t['class'] = t.name
                t.name = 'span'
            if t.name == 'center':
                t['class'] = t.name
                t.name = 'div'
            # removes paired, but empty tags.
            if t.string is not None and len(t.string.strip()) == 0:
                t.extract()

        text = self.soup.__str__('utf8')

        # ffnet(& maybe others) gives the whole chapter text
        # as one line. This causes problems for nook(at
        # least) when the chapter size starts getting big
        # (200k+) Using Soup's prettify() messes up italics
        # and such. Done after soup extract so <p> and <br>
        # tags are normalized. Doing it here seems less evil
        # than hacking BeautifulSoup, but it's debatable.
        text = text.replace('</p>', '</p>\n').replace('<br />', '<br />\n')

        self._writeFile(fn, XHTML_START % (title, title))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)

        self.chapters.append((title, fileName))

    def finalise(self):
        """Write the table of contents and manifest, then zip the book.

        Chapter ids are numbered by the order of the writeChapter() calls,
        starting at 1.
        """
        logging.debug("Finalising...")

        # table of contents -- ncx file
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(tocFilePath, TOC_START % self.storyTitle)

        # content -- opf file
        opfFilePath = "OEBPS/content.opf"
        self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))

        ids = []
        for position, (t, f) in enumerate(self.chapters):
            i = position + 1
            chapterId = "chapter%04d" % i

            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, t, f))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))

            ids.append(chapterId)

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        for chapterId in ids:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'

        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            f = open(filename, 'wb')
            try:
                f.write(zipdata.getvalue())
            finally:
                # Release the handle even when the write fails.
                f.close()
        else:
            self.output = zipdata
|
|
|
|
def unirepl(match):
    """Return the unicode character for a matched decimal entity.

    match is a regex match whose full text looks like ``&#NNN;``.  When the
    number is not a valid code point the entity text is returned unchanged,
    so one bad entity cannot abort a whole substitution pass.
    """
    s = match.group()
    try:
        # s[2:-1] drops the leading "&#" and the trailing ";".
        return unichr(int(s[2:-1]))
    except (ValueError, OverflowError):
        return s
|
|
|
|
def replaceNumberEntities(data):
    """Return data with every decimal entity (``&#NNN;``) run through unirepl."""
    return re.sub(r'&#(\d+);', unirepl, data)
|
|
|
|
def removeEntities(text):
    """Reduce the HTML entities in text to the three that XHTML allows.

    Decimal entities become the characters they name, the named entities
    listed in ``entities`` are replaced by their mapped values, and every
    remaining ``&`` is escaped.  ``&amp;``, ``&lt;`` and ``&gt;`` are the
    only entities left in the returned text.
    """
    # replace numeric versions of [&<>] with named versions, so the numeric
    # pass below cannot turn them into bare markup characters.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining decimal entities with their unicode value,
    # such as &#039; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    for e in entities:
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py
            text = text.replace(e, v.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml:
    # escape every ampersand, then undo the double escape on &lt; and &gt;.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
|
|
|
|
def makeAcceptableFilename(text):
    """Return text stripped down for use as a file name.

    The text goes through removeEntities(), spaces and colons become
    underscores, and every remaining character other than ASCII letters,
    digits, ``_``, ``'`` and ``-`` is removed.
    """
    cleaned = removeEntities(text)
    cleaned = cleaned.replace(" ", "_")
    cleaned = cleaned.replace(":", "_")
    return re.sub('[^a-zA-Z0-9_\'-]+', '', cleaned)
|