Improved metadata for epubcheck, fixed a problem I introduced with >/<.

This commit is contained in:
retiefjimm 2010-09-29 21:32:04 -05:00
parent 9434431c7b
commit 773ff3c03c
2 changed files with 25 additions and 19 deletions

View file

@ -32,7 +32,7 @@ CONTAINER = '''<?xml version="1.0"?>
CONTENT_START = '''<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf"
unique-identifier="BookId-Epub-%s">
unique-identifier="BookID">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:opf="http://www.idpf.org/2007/opf">
<dc:title>%s</dc:title>
@ -41,7 +41,7 @@ CONTENT_START = '''<?xml version="1.0"?>
<dc:rights></dc:rights>
<dc:subject>fanfiction</dc:subject>
<dc:publisher>sgzmd</dc:publisher>
<dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
<dc:identifier id="BookID">%s</dc:identifier>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
@ -114,8 +114,7 @@ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
acceptable_attributes = ['href']
entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'',
'&lsquo;' : '\'', '&quot;' : '"', '&hellip;' : '...', '&amp;' : '&', '&pound;' : '£', '&nbsp;' : ' ',
'&lt;' : '<', '&gt;' : '>' }
'&lsquo;' : '\'', '&quot;' : '"', '&hellip;' : '...', '&amp;' : '&', '&pound;' : '£', '&nbsp;' : ' ' }
FB2_PROLOGUE = '<FictionBook>'
FB2_DESCRIPTION = '''<description>

View file

@ -173,20 +173,22 @@ class EPubFanficWriter(FanficWriter):
# for the pound symbol in constants.py
text = text.replace(e, v.decode('utf-8'))
text = text.replace('&', '&amp;')
# &lt;, &gt; and &amp; are the only html entities allowed in xhtml.
text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
return text
def writeChapter(self, title, text):
logging.debug("Writing chapter: %s" % title)
try:
fileName = base64.b64encode(title).replace('/', '_') + ".xhtml"
fileName = base64.b64encode(title) + ".xhtml"
except UnicodeEncodeError, e:
fileName = base64.b64encode(title.encode('utf-8')).replace('/', '_') + ".xhtml"
# title = cgi.esca#title.decode('utf-8')
# sha = hashlib.sha224(title)
# fileName = sha.hexdigest() + ".xhtml"
#fileName = cgi.escape(title) + '.xhtml'
fileName = base64.b64encode(title.encode('utf-8')) + ".xhtml"
# Base64 can include +, / and =, which XML technically doesn't like
# in its id attributes. _ and - are okay and not otherwise used in Base64.
# The = for padding is superfluous.
fileName = fileName.replace('/', '_').replace('+', '-').replace('=','')
filePath = self.directory + "/OEBPS/" + fileName
fn = 'OEBPS/' + fileName
@ -208,13 +210,14 @@ class EPubFanficWriter(FanficWriter):
allPs = self.soup.findAll(recursive=True)
for p in allPs:
if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;' ) :
if p.string != None and len(p.string.strip()) == 0 :
p.extract()
allBrs = self.soup.findAll(recursive=True, name = ['div'])
for br in allBrs:
if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
br.name = 'p'
# xhtml doesn't like <p> nesting in <p>, so leave divs.
# allBrs = self.soup.findAll(recursive=True, name = ['div'])
# for br in allBrs:
# if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
# br.name = 'p'
# cleanup(self.soup )
@ -243,17 +246,21 @@ class EPubFanficWriter(FanficWriter):
opfFilePath = "OEBPS/content.opf"
# opf = open(opfFilePath, 'w')
self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName))
self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))
# print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)
ids = []
i = 0
i = 1
for t,f in self.chapters:
try:
chapterId = base64.b64encode(t)
except UnicodeEncodeError, e:
chapterId = base64.b64encode(t.encode('utf-8'))
# Base64 can include +, / and =, which XML technically doesn't like
# in its id attributes. _ and - are okay and not otherwise used in Base64.
# The = for padding is superfluous.
chapterId = chapterId.replace('/', '_').replace('+', '-').replace('=','')
self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))