Improved metadata for epubcheck, fixed a problem I introduced with >/<.

2025-12-15 21:32:28 +01:00 · 2010-09-29 21:32:04 -05:00 · 2010-09-29 21:32:04 -05:00 · 773ff3c03c
commit 773ff3c03c
parent 9434431c7b
2 changed files with 25 additions and 19 deletions
--- a/constants.py
+++ b/constants.py
@ -32,7 +32,7 @@ CONTAINER = '''<?xml version="1.0"?>

 CONTENT_START = '''<?xml version="1.0"?>
 <package version="2.0" xmlns="http://www.idpf.org/2007/opf"
-         unique-identifier="BookId-Epub-%s">
+         unique-identifier="BookID">
 <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"
           xmlns:opf="http://www.idpf.org/2007/opf">
   <dc:title>%s</dc:title> 
@ -41,7 +41,7 @@ CONTENT_START = '''<?xml version="1.0"?>
   <dc:rights></dc:rights>
   <dc:subject>fanfiction</dc:subject> 
   <dc:publisher>sgzmd</dc:publisher> 
-   <dc:identifier id="BookId">urn:uuid:sigizmund.com062820072147132</dc:identifier>
+   <dc:identifier id="BookID">%s</dc:identifier>
 </metadata>
 <manifest>
  <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
@ -114,8 +114,7 @@ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
 acceptable_attributes = ['href']

 entities = { '&ndash;' : ' - ', '&mdash;' : ' - ', '&rdquo;' : '"', '&ldquo;' : '"', '&rsquo;' : '\'', 
-		'&lsquo;' : '\'', '&quot;' : '"', '&hellip;' : '...', '&amp;' : '&', '&pound;' : '£', '&nbsp;' : ' ',
-		'&lt;' : '<', '&gt;' : '>' }
+		'&lsquo;' : '\'', '&quot;' : '"', '&hellip;' : '...', '&amp;' : '&', '&pound;' : '£', '&nbsp;' : ' ' }

 FB2_PROLOGUE = '<FictionBook>'
 FB2_DESCRIPTION = '''<description>
--- a/output.py
+++ b/output.py
@ -173,20 +173,22 @@ class EPubFanficWriter(FanficWriter):
 				# for the pound symbol in constants.py
 				text = text.replace(e, v.decode('utf-8'))
 		
-		text = text.replace('&', '&amp;')
+		# &lt; &gt; and &amp; are the only html entities allowed in xhtml.
+		text = text.replace('&', '&amp;').replace('&amp;lt;', '&lt;').replace('&amp;gt;', '&gt;')
 		
 		return text
 	
 	def writeChapter(self, title, text):
 		logging.debug("Writing chapter: %s" % title)
 		try:
-			fileName = base64.b64encode(title).replace('/', '_') + ".xhtml"
+			fileName = base64.b64encode(title) + ".xhtml"
 		except UnicodeEncodeError, e:
-			fileName = base64.b64encode(title.encode('utf-8')).replace('/', '_') + ".xhtml"
-#		title = cgi.esca#title.decode('utf-8')
-#		sha = hashlib.sha224(title)
-#		fileName = sha.hexdigest() + ".xhtml"
-		#fileName = cgi.escape(title) + '.xhtml'
+			fileName = base64.b64encode(title.encode('utf-8')) + ".xhtml"
+		# Base64 can include +, / and =, which XML technically doesn't like
+		# in its id attributes.  _ and - are okay and not otherwise used in Base64.
+		# The = for padding is superfluous
+		fileName = fileName.replace('/', '_').replace('+', '-').replace('=','')
+
 		filePath = self.directory + "/OEBPS/" + fileName
 		
 		fn = 'OEBPS/' + fileName
@ -208,13 +210,14 @@ class EPubFanficWriter(FanficWriter):

 		allPs = self.soup.findAll(recursive=True)
 		for p in allPs:
-			if p.string != None and (len(p.string.strip()) == 0 or p.string.strip() == '&nbsp;' ) :
+			if p.string != None and len(p.string.strip()) == 0 :
 				p.extract()
-				
-		allBrs = self.soup.findAll(recursive=True, name = ['div'])
-		for br in allBrs:
-			if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
-				br.name = 'p'
+
+		# xhtml doesn't like <p> nesting in <p>, so leave divs.
+		# allBrs = self.soup.findAll(recursive=True, name = ['div'])
+		# for br in allBrs:
+			# if (br.string != None and len(br.string.strip()) != 0) or (br.contents != None):
+				# br.name = 'p'

 #		cleanup(self.soup )
 		
@ -243,17 +246,21 @@ class EPubFanficWriter(FanficWriter):
 		opfFilePath = "OEBPS/content.opf"
 		
 #		opf = open(opfFilePath, 'w')
-		self._writeFile(opfFilePath, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName))
+		self._writeFile(opfFilePath, CONTENT_START % (self.storyTitle, self.authorName, uuid.uuid4().urn))
 #		print >> opf, CONTENT_START % (uuid.uuid4().urn, self.storyTitle, self.authorName)

 		ids = []
 		
-		i = 0
+		i = 1
 		for t,f in self.chapters:
 			try:
 				chapterId = base64.b64encode(t)
 			except UnicodeEncodeError, e:
 				chapterId = base64.b64encode(t.encode('utf-8'))
+			# Base64 can include +, / and =, which XML technically doesn't like
+			# in its id attributes.  _ and - are okay and not otherwise used in Base64.
+			# The = for padding is superfluous
+			chapterId = chapterId.replace('/', '_').replace('+', '-').replace('=','')
 			
 			self._writeFile(tocFilePath, TOC_ITEM % (chapterId, i, cgi.escape(t), f))
 			self._writeFile(opfFilePath, CONTENT_ITEM % (chapterId, f))