mirror of
https://github.com/JimmXinu/FanFicFare.git
synced 2026-04-29 02:12:10 +02:00
Modified ffnet.py to use the mangled story title if given instead of the name portion of the URL. As part of this, the Name and chapter number are now optional in the URL passed in.
376 lines
11 KiB
Python
376 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import os
|
|
import re
|
|
import sys
|
|
import cgi
|
|
import uuid
|
|
import codecs
|
|
import shutil
|
|
import string
|
|
import os.path
|
|
import zipfile
|
|
import StringIO
|
|
import logging
|
|
import hashlib
|
|
import urllib as u
|
|
import pprint as pp
|
|
import urllib2 as u2
|
|
import urlparse as up
|
|
import BeautifulSoup as bs
|
|
import htmlentitydefs as hdefs
|
|
|
|
import zipdir
|
|
import html_constants
|
|
from constants import *
|
|
|
|
|
|
import html2text
|
|
import datetime
|
|
|
|
|
|
class FanficWriter:
    """Base interface shared by the story writers in this module.

    Every method here is a no-op; subclasses override them to produce
    real output.
    """

    def __init__(self):
        """Create a writer. The base implementation does nothing."""
        return None

    def writeChapter(self, index, title, text):
        """Accept one chapter. The base implementation discards it."""
        return None

    def finalise(self):
        """Finish the output. The base implementation does nothing."""
        return None
|
|
|
|
class TextWriter(FanficWriter):
    """Plain-text writer built on top of an in-memory HTMLWriter.

    Chapters are forwarded to the wrapped HTMLWriter; on finalise() the
    rendered HTML is converted with html2text and stored in ``self.output``.
    """

    # The wrapped HTMLWriter; replaced per instance in __init__.
    htmlWriter = None

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Wrap an HTMLWriter for ``adapter``.

        ``inmemory`` and ``compress`` are accepted but not used: the wrapped
        HTMLWriter is always created with inmemory=True and compress=False.
        """
        delegate = HTMLWriter(base, adapter, True, False)
        self.htmlWriter = delegate

    def writeChapter(self, index, title, text):
        """Forward the chapter unchanged to the wrapped HTMLWriter."""
        delegate = self.htmlWriter
        delegate.writeChapter(index, title, text)

    def finalise(self):
        """Render the HTML, convert it to text and expose it as ``self.output``.

        Also copies the wrapped writer's ``name`` onto this writer.
        """
        delegate = self.htmlWriter
        delegate.finalise()

        self.output = StringIO.StringIO()
        # The HTML buffer holds UTF-8 bytes: decode for html2text, then
        # encode the converted text again before storing it.
        html_unicode = delegate.output.getvalue().decode('utf-8')
        plain_text = html2text.html2text(html_unicode)
        self.output.write(plain_text.encode('utf-8'))

        self.name = delegate.name
|
|
|
|
|
|
class HTMLWriter(FanficWriter):
    """Writer that renders a whole story as a single HTML document.

    Chapters are accumulated in ``self.body``; finalise() substitutes the
    title, author and body into the XHTML template from html_constants,
    passes the result through BeautifulSoup and writes it to ``self.output``
    (a StringIO when ``inmemory`` is true, otherwise ``<base>/<name>.html``).
    """

    # Accumulated chapter markup; rebound per instance on first writeChapter.
    body = ''

    def __init__(self, base, adapter, inmemory=False, compress=False):
        """Prepare the output target and templates.

        base     -- directory the .html file is written to (when not in memory)
        adapter  -- story adapter supplying getStoryName(), getOutputName()
                    and getAuthorName()
        inmemory -- when true, write to a StringIO instead of a file
        compress -- accepted for signature parity with the other writers;
                    not used here
        """
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.fileName = self.basePath + '/' + self.name + '.html'
        self.authorName = removeEntities(adapter.getAuthorName())
        self.adapter = adapter

        self.inmemory = inmemory

        if self.inmemory:
            self.output = StringIO.StringIO()
        else:
            # Start from a clean file if a previous run left one behind.
            if os.path.exists(self.fileName):
                os.remove(self.fileName)
            self.output = open(self.fileName, 'w')

        self.xhtmlTemplate = string.Template(html_constants.XHTML_START)
        self.chapterStartTemplate = string.Template(html_constants.XHTML_CHAPTER_START)

    def _printableVersion(self, text):
        """Return ``text`` decoded from UTF-8, or ``text`` itself if that fails.

        Any ordinary exception from the decode attempt falls back to the
        original value. KeyboardInterrupt and SystemExit are no longer
        swallowed, unlike with the former bare ``except:``.
        """
        try:
            return text.decode('utf-8')
        except Exception:
            return text

    def writeChapter(self, index, title, text):
        """Append one chapter (heading from the chapter template, then text).

        ``index`` is accepted for interface compatibility and is not used.
        """
        title = self._printableVersion(title)
        text = self._printableVersion(text)
        heading = self.chapterStartTemplate.substitute({'chapter': title})
        self.body = self.body + '\n' + heading
        self.body = self.body + '\n' + text

    def finalise(self):
        """Render the full document and write it to ``self.output``.

        When writing to a real file, the file is closed even if rendering
        or writing raises, so the handle is never leaked.
        """
        try:
            html = self.xhtmlTemplate.substitute({
                'title': self.storyTitle,
                'author': self.authorName,
                'body': self.body,
            })
            soup = bs.BeautifulSoup(html)
            result = soup.__str__('utf8')
            self.output.write(result)
        finally:
            # In-memory output stays open so callers can read it back.
            if not self.inmemory:
                self.output.close()
|
|
|
|
class EPubFanficWriter(FanficWriter):
    """Writer that assembles a story into an EPUB archive.

    Every file of the book (mimetype, container.xml, stylesheet, chapters,
    title page, toc.ncx and content.opf) is collected in ``self.files``,
    keyed by its path inside the archive. finalise() zips that mapping with
    zipdir.inMemoryZip and either writes ``<base>/<name>.epub`` or exposes
    the zip buffer as ``self.output``.
    """

    # Class-level defaults; __init__ replaces both with per-instance objects.
    chapters = []

    files = {}

    def _writeFile(self, fileName, data):
        """Append ``data`` to the archive member ``fileName``.

        The backing file object is created on first use: a StringIO when
        ``self.inmemory`` is true, otherwise a real file under
        ``self.directory``.
        """
        #logging.debug('_writeFile(`%s`, data)' % fileName)
        if fileName not in self.files:
            if self.inmemory:
                self.files[fileName] = StringIO.StringIO()
            else:
                self.files[fileName] = open(self.directory + '/' + fileName, 'w')

        try:
            d = data.decode('utf-8')
        except UnicodeEncodeError:
            # Decoding failed with an encode error; write the data unchanged.
            d = data

        self.files[fileName].write(d)

    def _closeFiles(self):
        """Close every backing file, but only when they are real files."""
        if not self.inmemory:
            for f in self.files:
                self.files[f].close()

    def __init__(self, base, adapter, inmemory=False, compress=True):
        """Prepare the archive and write its fixed members.

        base     -- directory the .epub is written to
        adapter  -- story adapter supplying titles, names and metadata
        inmemory -- when true, finalise() leaves the zip in ``self.output``
                    instead of writing a file
        compress -- accepted for signature parity; not used here
        """
        self.basePath = base
        self.storyTitle = removeEntities(adapter.getStoryName())
        self.name = makeAcceptableFilename(adapter.getOutputName())
        self.directory = self.basePath + '/' + self.name
        self.authorName = removeEntities(adapter.getAuthorName())
        self.inmemory = inmemory
        self.adapter = adapter

        self.files = {}
        self.chapters = []

        # Members are always built in memory; writeToFile only remembers
        # whether the finished zip should be written to disk.
        if not self.inmemory:
            self.inmemory = True
            self.writeToFile = True
        else:
            self.writeToFile = False

        # NOTE: self.inmemory was forced to True just above, so this on-disk
        # directory setup can never run as written.
        if not self.inmemory:
            if os.path.exists(self.directory):
                shutil.rmtree(self.directory)

            os.mkdir(self.directory)

            os.mkdir(self.directory + '/META-INF')
            os.mkdir(self.directory + '/OEBPS')

        self._writeFile('mimetype', MIMETYPE)
        self._writeFile('META-INF/container.xml', CONTAINER)
        self._writeFile('OEBPS/stylesheet.css', CSS)

    def writeChapter(self, index, title, text):
        """Clean one chapter and add it as ``OEBPS/chapterNNNN.xhtml``.

        Attributes not listed in ``acceptable_attributes`` are stripped,
        <u> and <center> are rewritten to <span>/<div> with a matching CSS
        class, and tags whose only content is whitespace are removed.
        """
        title = removeEntities(title)
        logging.debug("Writing chapter: %s" % title)
        fileName = "chapter%04d.xhtml" % index

        fn = 'OEBPS/' + fileName

        text = removeEntities(text)

        # BeautifulStoneSoup doesn't have any selfClosingTags by default.
        # hr & br needs to be if they're going to work.
        # Some stories do use multiple br tags as their section breaks...
        self.soup = bs.BeautifulStoneSoup(text.decode('utf-8'), selfClosingTags=('br','hr'))

        allTags = self.soup.findAll(recursive=True)
        for t in allTags:
            for attr in t._getAttrMap().keys():
                if attr not in acceptable_attributes:
                    del t[attr]
            # these are not acceptable strict XHTML.  But we do already have
            # CSS classes of the same names defined in constants.py
            # (Compared with ==: the former `in ('u')` / `in ('center')`
            # were substring tests against a plain string, not a tuple.)
            if t.name == 'u':
                t['class'] = t.name
                t.name = 'span'
            if t.name == 'center':
                t['class'] = t.name
                t.name = 'div'
            # removes paired, but empty tags.
            if t.string is not None and len(t.string.strip()) == 0:
                t.extract()

        text = self.soup.__str__('utf8')

        # ffnet(& maybe others) gives the whole chapter text
        # as one line.  This causes problems for nook(at
        # least) when the chapter size starts getting big
        # (200k+) Using Soup's prettify() messes up italics
        # and such.  Done after soup extract so <p> and <br>
        # tags are normalized.  Doing it here seems less evil
        # than hacking BeautifulSoup, but it's debatable.
        text = text.replace('</p>','</p>\n').replace('<br />','<br />\n')

        self._writeFile(fn, XHTML_START % (title, title))
        self._writeFile(fn, text)
        self._writeFile(fn, XHTML_END)

        self.chapters.append((title, fileName))

    def finalise(self):
        """Write the TOC, title page and OPF manifest, then build the zip.

        When ``self.writeToFile`` is true the archive is written to
        ``self.directory + '.epub'``; otherwise the zip buffer is stored in
        ``self.output``.
        """
        logging.debug("Finalising...")

        ### writing table of contents -- ncx file
        tocFilePath = "OEBPS/toc.ncx"
        self._writeFile(tocFilePath, TOC_START % (self.adapter.getUUID(), self.storyTitle))

        published = self.adapter.getStoryPublished().strftime("%Y-%m-%d")
        createda = self.adapter.getStoryCreated().strftime("%Y-%m-%d %H:%M:%S")
        created = self.adapter.getStoryCreated().strftime("%Y-%m-%d")
        updated = self.adapter.getStoryUpdated().strftime("%Y-%m-%d")
        calibre = self.adapter.getStoryUpdated().strftime("%Y-%m-%dT%H:%M:%S")

        description = self.adapter.getStoryDescription()
        if hasattr(description, "text"):
            description = str(description.text)
        else:
            description = str(description)
        # str() never returns None, so only emptiness needs checking.
        if len(description) > 0:
            description = description.replace ('\\\'', '').replace('\\\"', '')
            # NOTE(review): '&nbsp;' and '&rsquo;' were reconstructed from a
            # listing in which HTML entities had been decoded -- confirm
            # against the repository copy.
            description = removeEntities(description.replace('&nbsp;',' ').replace('&rsquo;',''))
        else:
            description = ' '

        ### writing content -- title page
        titleFilePath = "OEBPS/title_page.xhtml"
        self._writeFile(titleFilePath, TITLE_HEADER % (self.authorName, self.storyTitle, self.adapter.getStoryURL(), self.storyTitle, self.adapter.getAuthorURL(), self.authorName))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Category:', self.adapter.getCategory()))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Genre:', self.adapter.getGenre()))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Status:', self.adapter.getStoryStatus()))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Published:', published))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Updated:', updated))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Packaged:', createda))
        tmpstr = self.adapter.getStoryRating() + " / " + self.adapter.getStoryUserRating()
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Rating Age/User:', tmpstr))
        tmpstr = self.adapter.getNumChapters() + " / " + self.adapter.getNumWords()
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Chapters/Words:', tmpstr))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Publisher:', self.adapter.getHost()))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Story ID:', self.adapter.getStoryId()))
        self._writeFile(titleFilePath, TITLE_ENTRY % ('Author ID:', self.adapter.getAuthorId()))

        self._writeFile(titleFilePath, TITLE_FOOTER % description )

        ### writing content -- opf file
        opfFilePath = "OEBPS/content.opf"

        self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName, self.adapter.getLanguageId(), published, created, updated, calibre, description))

        # One subject entry per adapter subject, with a generic fallback
        # when the adapter supplies none.
        subjectCount = 0
        for subj in self.adapter.getSubjects():
            self._writeFile(opfFilePath, CONTENT_SUBJECT % subj)
            subjectCount = subjectCount + 1
        if subjectCount == 0:
            self._writeFile(opfFilePath, CONTENT_SUBJECT % "FanFiction")

        self._writeFile(opfFilePath, CONTENT_END_METADATA % (self.adapter.getPublisher(), self.adapter.getUUID(), self.adapter.getStoryURL(), self.adapter.getStoryURL(), self.adapter.getStoryUserRating()))

        # Ids in reading order, reused below for the itemref entries.
        ids = []

        # The title page is entry 0 in both the TOC and the manifest.
        i = 0
        chapterId = "Title Page"
        self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, "Title Page", "title_page.xhtml"))
        self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, "title_page.xhtml"))
        ids.append(chapterId)

        i = i + 1

        for chapterTitle, chapterFile in self.chapters:
            chapterId = "chapter%04d" % i

            self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, chapterTitle, chapterFile))
            self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, chapterFile))

            ids.append(chapterId)

            i = i + 1

        self._writeFile(tocFilePath, TOC_END)
        self._writeFile(opfFilePath, CONTENT_END_MANIFEST)

        for chapterId in ids:
            self._writeFile(opfFilePath, CONTENT_ITEMREF % chapterId)

        self._writeFile(opfFilePath, CONTENT_END)

        self._closeFiles()

        filename = self.directory + '.epub'

        zipdata = zipdir.inMemoryZip(self.files)

        if self.writeToFile:
            epubFile = open(filename, 'wb')
            try:
                epubFile.write(zipdata.getvalue())
            finally:
                # Close the handle even if the write fails.
                epubFile.close()
        else:
            self.output = zipdata
|
|
|
|
def unirepl(match):
    """Return the unicode character for a matched numeric entity.

    ``match`` is a regex match whose full text is a numeric character
    reference: ``&#NNN;`` (decimal) or ``&#xHHH;`` (hexadecimal).

    The digits are taken from everything between ``&#`` and ``;``. The
    previous slice always skipped one extra character, so decimal
    references lost their first digit (``&#8217;`` was read as 217).
    """
    s = match.group()
    # Drop the leading '&#' and the trailing ';'.
    body = s[2:-1]
    if body[:1].lower() == 'x':
        # Hexadecimal form: skip the 'x' marker itself.
        value = int(body[1:], 16)
    else:
        value = int(body, 10)
    try:
        toChar = unichr
    except NameError:
        # Interpreters without unichr: chr() already returns unicode text.
        toChar = chr
    return toChar(value)
|
|
|
|
def replaceNumberEntities(data):
    """Replace numeric character references in ``data`` with their characters.

    Matches ``&#NNN;`` and ``&#xNNN;`` where NNN consists of decimal digits
    only, and converts each match with unirepl(). A reference that cannot
    be converted (unirepl raises ValueError or OverflowError, e.g. for a
    code point outside the supported range) is left in the text unchanged
    instead of aborting the whole replacement.
    """
    # NOTE(review): hex references containing the letters a-f are not
    # matched by this pattern and therefore stay untouched.
    p = re.compile(r'&#(x?)(\d+);')

    def _convert(match):
        # Best effort: keep the original entity text when conversion fails.
        try:
            return unirepl(match)
        except (ValueError, OverflowError):
            return match.group()

    return p.sub(_convert, data)
|
|
|
|
def removeEntities(text):
    """Return ``text`` with HTML entities reduced to what XHTML allows.

    Steps, in order:
      1. numeric references for ``&``, ``<`` and ``>`` become the named
         forms ``&amp;``, ``&lt;`` and ``&gt;``;
      2. the remaining numeric references are replaced with their
         characters via replaceNumberEntities();
      3. every key of the ``entities`` mapping is replaced with its value;
      4. every ``&`` left over is escaped, and the ``&lt;`` / ``&gt;``
         entities that this escaping doubled are put back.
    """
    # NOTE(review): the patterns and replacement strings in steps 1 and 4
    # were reconstructed from a listing in which HTML entities had been
    # decoded -- confirm against the repository copy.

    # replace numeric versions of [&<>] with named versions.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # replace remaining &#000; entities with unicode value, such as &#039; -> '
    text = replaceNumberEntities(text)

    # replace several named entities with character, such as &mdash; -> -
    # see constants.py for the list.
    # reverse sort will put entities with ; before the same one without, when valid.
    for e in sorted(entities.keys(), reverse=True):
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # for the pound symbol in constants.py
            text = text.replace(e, v.decode('utf-8'))

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml, put those back.
    text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')

    return text
|
|
|
|
def makeAcceptableFilename(text):
    """Turn ``text`` into a string usable as a file name.

    Entities are resolved with removeEntities(), spaces and colons become
    underscores, and every remaining character other than ASCII letters,
    digits, underscore, apostrophe or hyphen is dropped.
    """
    withoutEntities = removeEntities(text)
    underscored = withoutEntities.replace(" ", "_").replace(":","_")
    return re.sub('[^a-zA-Z0-9_\'-]+','',underscored)
|