+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #xmp.XmpInformation XmlInformation} instance + # that can be used to access XMP metadata from the document. Can also + # return None if no metadata was found on the document root. + def getXmpMetadata(self): + metadata = self.get("/Metadata", None) + if metadata == None: + return None + metadata = metadata.getObject() + import xmp + if not isinstance(metadata, xmp.XmpInformation): + metadata = xmp.XmpInformation(metadata) + self[NameObject("/Metadata")] = metadata + return metadata + + ## + # Read-only property that accesses the {@link + # #DictionaryObject.getXmpData getXmpData} function. + #
+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) def writeToStream(self, stream, encryption_key): stream.write("<<\n") @@ -563,7 +632,7 @@ def getData(self): return self.decodedSelf.getData() else: # create decoded object - decoded = StreamObject() + decoded = DecodedStreamObject() decoded._data = filters.decodeStreamData(self) for key, value in self.items(): if not key in ("/Length", "/Filter", "/DecodeParms"): @@ -583,8 +652,8 @@ def __init__(self, arr): ArrayObject.__init__(self, [self.ensureIsNumber(x) for x in arr]) def ensureIsNumber(self, value): - if not isinstance(value, NumberObject): - value = NumberObject(value) + if not isinstance(value, (NumberObject, FloatObject)): + value = FloatObject(value) return value def __repr__(self): diff --git a/src/calibre/ebooks/pyPdf/pdf.py b/src/pyPdf/pdf.py similarity index 86% rename from src/calibre/ebooks/pyPdf/pdf.py rename to src/pyPdf/pdf.py index f64c1a6c22..ce4331b498 100644 --- a/src/calibre/ebooks/pyPdf/pdf.py +++ b/src/pyPdf/pdf.py @@ -88,7 +88,8 @@ def _addObject(self, obj): return IndirectObject(len(self._objects), 0, self) def getObject(self, ido): - assert ido.pdf == self + if ido.pdf != self: + raise ValueError("pdf must be self") return self._objects[ido.idnum - 1] ## @@ -105,7 +106,7 @@ def addPage(self, page): page = self._addObject(page) pages = self.getObject(self._pages) pages["/Kids"].append(page) - pages["/Count"] = NumberObject(pages["/Count"] + 1) + pages[NameObject("/Count")] = NumberObject(pages["/Count"] + 1) ## # Encrypt this PDF file with the PDF Standard encryption handler. 
@@ -272,7 +273,6 @@ def _sweepIndirectReferences(self, externMap, data): class PdfFileReader(object): def __init__(self, stream): self.flattenedPages = None - self.pageNumbers = {} self.resolvedObjects = {} self.read(stream) self.stream = stream @@ -290,7 +290,7 @@ def __init__(self, stream): def getDocumentInfo(self): if not self.trailer.has_key("/Info"): return None - obj = self.getObject(self.trailer['/Info']) + obj = self.trailer['/Info'] retval = DocumentInformation() retval.update(obj) return retval @@ -302,6 +302,28 @@ def getDocumentInfo(self): # Stability: Added in v1.7, will exist for all future v1.x releases. documentInfo = property(lambda self: self.getDocumentInfo(), None, None) + ## + # Retrieves XMP (Extensible Metadata Platform) data from the PDF document + # root. + #
+ # Stability: Added in v1.12, will exist for all future v1.x releases. + # @return Returns a {@link #generic.XmpInformation XmlInformation} + # instance that can be used to access XMP metadata from the document. + # Can also return None if no metadata was found on the document root. + def getXmpMetadata(self): + try: + self._override_encryption = True + return self.trailer["/Root"].getXmpMetadata() + finally: + self._override_encryption = False + + ## + # Read-only property that accesses the {@link #PdfFileReader.getXmpData + # getXmpData} function. + #
+ # Stability: Added in v1.12, will exist for all future v1.x releases. + xmpMetadata = property(lambda self: self.getXmpMetadata(), None, None) + ## # Calculates the number of pages in this PDF file. #
@@ -346,43 +368,39 @@ def getPage(self, pageNumber): # Stability: Added in v1.10, will exist for all future v1.x releases. # @return Returns a dict which maps names to {@link #Destination # destinations}. - def getNamedDestinations(self, tree = None, map = None): - if self.flattenedPages == None: - self._flatten() - - get = self.safeGetObject - if map == None: - map = {} - catalog = get(self.trailer["/Root"]) + def getNamedDestinations(self, tree=None, retval=None): + if retval == None: + retval = {} + catalog = self.trailer["/Root"] # get the name tree if catalog.has_key("/Dests"): - tree = get(catalog["/Dests"]) + tree = catalog["/Dests"] elif catalog.has_key("/Names"): - names = get(catalog['/Names']) + names = catalog['/Names'] if names.has_key("/Dests"): - tree = get(names['/Dests']) + tree = names['/Dests'] if tree == None: - return map + return retval if tree.has_key("/Kids"): # recurse down the tree - for kid in get(tree["/Kids"]): - self.getNamedDestinations(get(kid), map) + for kid in tree["/Kids"]: + self.getNamedDestinations(kid.getObject(), retval) if tree.has_key("/Names"): - names = get(tree["/Names"]) + names = tree["/Names"] for i in range(0, len(names), 2): - key = get(names[i]) - val = get(names[i+1]) + key = names[i].getObject() + val = names[i+1].getObject() if isinstance(val, DictionaryObject) and val.has_key('/D'): - val = get(val['/D']) - dest = self._buildDestination(val, key) + val = val['/D'] + dest = self._buildDestination(key, val) if dest != None: - map[key] = dest + retval[key] = dest - return map + return retval ## # Read-only property that accesses the {@link #PdfFileReader.getOutlines @@ -396,20 +414,16 @@ def getNamedDestinations(self, tree = None, map = None): #
# Stability: Added in v1.10, will exist for all future v1.x releases. # @return Returns a nested list of {@link #Destination destinations}. - def getOutlines(self, node = None, outlines = None): - if self.flattenedPages == None: - self._flatten() - - get = self.safeGetObject + def getOutlines(self, node=None, outlines=None): if outlines == None: outlines = [] - catalog = get(self.trailer["/Root"]) + catalog = self.trailer["/Root"] # get the outline dictionary and named destinations if catalog.has_key("/Outlines"): - lines = get(catalog["/Outlines"]) + lines = catalog["/Outlines"] if lines.has_key("/First"): - node = get(lines["/First"]) + node = lines["/First"] self._namedDests = self.getNamedDestinations() if node == None: @@ -424,49 +438,44 @@ def getOutlines(self, node = None, outlines = None): # check for sub-outlines if node.has_key("/First"): subOutlines = [] - self.getOutlines(get(node["/First"]), subOutlines) + self.getOutlines(node["/First"], subOutlines) if subOutlines: outlines.append(subOutlines) if not node.has_key("/Next"): break - node = get(node["/Next"]) + node = node["/Next"] return outlines - def _buildDestination(self, array, title): - if not (isinstance(array, ArrayObject) and len(array) >= 2 and \ - isinstance(array[0], IndirectObject)): - return None - - pageKey = (array[0].generation, array[0].idnum) - if not self.pageNumbers.has_key(pageKey): - return None - - pageNum = self.pageNumbers[pageKey] - return Destination(*([title, pageNum]+array[1:])) + def _buildDestination(self, title, array): + page, typ = array[0:2] + array = array[2:] + return Destination(title, page, typ, *array) def _buildOutline(self, node): dest, title, outline = None, None, None if node.has_key("/A") and node.has_key("/Title"): # Action, section 8.5 (only type GoTo supported) - title = self.safeGetObject(node["/Title"]) - action = self.safeGetObject(node["/A"]) + title = node["/Title"] + action = node["/A"] if action["/S"] == "/GoTo": - dest = 
self.safeGetObject(action["/D"]) + dest = action["/D"] elif node.has_key("/Dest") and node.has_key("/Title"): # Destination, section 8.2.1 - title = self.safeGetObject(node["/Title"]) - dest = self.safeGetObject(node["/Dest"]) + title = node["/Title"] + dest = node["/Dest"] # if destination found, then create outline if dest: if isinstance(dest, ArrayObject): - outline = self._buildDestination(dest, title) - elif isinstance(dest, str) and self._namedDests.has_key(dest): + outline = self._buildDestination(title, dest) + elif isinstance(dest, unicode) and self._namedDests.has_key(dest): outline = self._namedDests[dest] - outline.title = title + outline[NameObject("/Title")] = title + else: + raise utils.PdfReadError("Unexpected destination %r" % dest) return outline ## @@ -478,7 +487,7 @@ def _buildOutline(self, node): pages = property(lambda self: ConvertFunctionsToVirtualList(self.getNumPages, self.getPage), None, None) - def _flatten(self, pages = None, inherit = None): + def _flatten(self, pages=None, inherit=None): inheritablePageAttributes = ( NameObject("/Resources"), NameObject("/MediaBox"), NameObject("/CropBox"), NameObject("/Rotate") @@ -487,37 +496,25 @@ def _flatten(self, pages = None, inherit = None): inherit = dict() if pages == None: self.flattenedPages = [] - catalog = self.getObject(self.trailer["/Root"]) - pages = self.getObject(catalog["/Pages"]) - indirectReference = None - if isinstance(pages, IndirectObject): - indirectReference = pages - pages = self.getObject(pages) + catalog = self.trailer["/Root"].getObject() + pages = catalog["/Pages"].getObject() t = pages["/Type"] if t == "/Pages": for attr in inheritablePageAttributes: if pages.has_key(attr): inherit[attr] = pages[attr] - for page in self.safeGetObject(pages["/Kids"]): - self._flatten(page, inherit) + for page in pages["/Kids"]: + self._flatten(page.getObject(), inherit) elif t == "/Page": for attr,value in inherit.items(): # if the page has it's own value, it does not inherit the # 
parent's value: if not pages.has_key(attr): pages[attr] = value - pageObj = PageObject(self, indirectReference) + pageObj = PageObject(self) pageObj.update(pages) - if indirectReference: - key = (indirectReference.generation, indirectReference.idnum) - self.pageNumbers[key] = len(self.flattenedPages) self.flattenedPages.append(pageObj) - def safeGetObject(self, obj): - if isinstance(obj, IndirectObject): - return self.safeGetObject(self.getObject(obj)) - return obj - def getObject(self, indirectReference): retval = self.resolvedObjects.get(indirectReference.generation, {}).get(indirectReference.idnum, None) if retval != None: @@ -527,7 +524,7 @@ def getObject(self, indirectReference): # indirect reference to object in object stream # read the entire object stream into memory stmnum,idx = self.xref_objStm[indirectReference.idnum] - objStm = self.getObject(IndirectObject(stmnum, 0, self)) + objStm = IndirectObject(stmnum, 0, self).getObject() assert objStm['/Type'] == '/ObjStm' assert idx < objStm['/N'] streamData = StringIO(objStm.getData()) @@ -619,7 +616,7 @@ def read(self, stream): # read all cross reference tables and their trailers self.xref = {} self.xref_objStm = {} - self.trailer = {} + self.trailer = DictionaryObject() while 1: # load the xref table stream.seek(startxref, 0) @@ -641,6 +638,16 @@ def read(self, stream): cnt = 0 while cnt < size: line = stream.read(20) + # It's very clear in section 3.4.3 of the PDF spec + # that all cross-reference table lines are a fixed + # 20 bytes. However... some malformed PDF files + # use a single character EOL without a preceeding + # space. Detect that case, and seek the stream + # back one character. 
(0-9 means we've bled into + # the next xref entry, t means we've bled into the + # text "trailer"): + if line[-1] in "0123456789t": + stream.seek(-1, 1) offset, generation = line[:16].split(" ") offset, generation = int(offset), int(generation) if not self.xref.has_key(generation): @@ -669,8 +676,8 @@ def read(self, stream): for key, value in newTrailer.items(): if not self.trailer.has_key(key): self.trailer[key] = value - if newTrailer.has_key(NameObject("/Prev")): - startxref = newTrailer[NameObject("/Prev")] + if newTrailer.has_key("/Prev"): + startxref = newTrailer["/Prev"] else: break elif x.isdigit(): @@ -681,43 +688,46 @@ def read(self, stream): assert xrefstream["/Type"] == "/XRef" self.cacheIndirectObject(generation, idnum, xrefstream) streamData = StringIO(xrefstream.getData()) - num, size = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) + idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) entrySizes = xrefstream.get("/W") - cnt = 0 - while cnt < size: - for i in range(len(entrySizes)): - d = streamData.read(entrySizes[i]) - di = convertToInt(d, entrySizes[i]) - if i == 0: - xref_type = di - elif i == 1: - if xref_type == 0: - next_free_object = di - elif xref_type == 1: - byte_offset = di - elif xref_type == 2: - objstr_num = di - elif i == 2: - if xref_type == 0: - next_generation = di - elif xref_type == 1: - generation = di - elif xref_type == 2: - obstr_idx = di - if xref_type == 0: - pass - elif xref_type == 1: - if not self.xref.has_key(generation): - self.xref[generation] = {} - self.xref[generation][num] = byte_offset - elif xref_type == 2: - self.xref_objStm[num] = [objstr_num, obstr_idx] - cnt += 1 - num += 1 + for num, size in self._pairs(idx_pairs): + cnt = 0 + while cnt < size: + for i in range(len(entrySizes)): + d = streamData.read(entrySizes[i]) + di = convertToInt(d, entrySizes[i]) + if i == 0: + xref_type = di + elif i == 1: + if xref_type == 0: + next_free_object = di + elif xref_type == 1: + byte_offset = di + 
elif xref_type == 2: + objstr_num = di + elif i == 2: + if xref_type == 0: + next_generation = di + elif xref_type == 1: + generation = di + elif xref_type == 2: + obstr_idx = di + if xref_type == 0: + pass + elif xref_type == 1: + if not self.xref.has_key(generation): + self.xref[generation] = {} + if not num in self.xref[generation]: + self.xref[generation][num] = byte_offset + elif xref_type == 2: + if not num in self.xref_objStm: + self.xref_objStm[num] = [objstr_num, obstr_idx] + cnt += 1 + num += 1 trailerKeys = "/Root", "/Encrypt", "/Info", "/ID" for key in trailerKeys: if xrefstream.has_key(key) and not self.trailer.has_key(key): - self.trailer[NameObject(key)] = xrefstream[key] + self.trailer[NameObject(key)] = xrefstream.raw_get(key) if xrefstream.has_key("/Prev"): startxref = xrefstream["/Prev"] else: @@ -737,6 +747,14 @@ def read(self, stream): assert False break + def _pairs(self, array): + i = 0 + while True: + yield array[i], array[i+1] + i += 2 + if (i+1) >= len(array): + break + def readNextEndLine(self, stream): line = "" while True: @@ -778,7 +796,7 @@ def decrypt(self, password): self._override_encryption = False def _decrypt(self, password): - encrypt = self.safeGetObject(self.trailer['/Encrypt']) + encrypt = self.trailer['/Encrypt'].getObject() if encrypt['/Filter'] != '/Standard': raise NotImplementedError, "only Standard PDF encryption handler is available" if not (encrypt['/V'] in (1, 2)): @@ -788,13 +806,13 @@ def _decrypt(self, password): self._decryption_key = key return 1 else: - rev = self.safeGetObject(encrypt['/R']) + rev = encrypt['/R'].getObject() if rev == 2: keylen = 5 else: - keylen = self.safeGetObject(encrypt['/Length']) / 8 + keylen = encrypt['/Length'].getObject() / 8 key = _alg33_1(password, rev, keylen) - real_O = self.safeGetObject(encrypt["/O"]) + real_O = encrypt["/O"].getObject() if rev == 2: userpass = utils.RC4_encrypt(key, real_O) else: @@ -812,20 +830,20 @@ def _decrypt(self, password): return 0 def 
_authenticateUserPassword(self, password): - encrypt = self.safeGetObject(self.trailer['/Encrypt']) - rev = self.safeGetObject(encrypt['/R']) - owner_entry = self.safeGetObject(encrypt['/O']).original_bytes - p_entry = self.safeGetObject(encrypt['/P']) - id_entry = self.safeGetObject(self.trailer['/ID']) - id1_entry = self.safeGetObject(id_entry[0]) + encrypt = self.trailer['/Encrypt'].getObject() + rev = encrypt['/R'].getObject() + owner_entry = encrypt['/O'].getObject().original_bytes + p_entry = encrypt['/P'].getObject() + id_entry = self.trailer['/ID'].getObject() + id1_entry = id_entry[0].getObject() if rev == 2: U, key = _alg34(password, owner_entry, p_entry, id1_entry) elif rev >= 3: U, key = _alg35(password, rev, - self.safeGetObject(encrypt["/Length"]) / 8, owner_entry, + encrypt["/Length"].getObject() / 8, owner_entry, p_entry, id1_entry, - self.safeGetObject(encrypt.get("/EncryptMetadata", False))) - real_U = self.safeGetObject(encrypt['/U']).original_bytes + encrypt.get("/EncryptMetadata", BooleanObject(False)).getObject()) + real_U = encrypt['/U'].getObject().original_bytes return U == real_U, key def getIsEncrypted(self): @@ -874,10 +892,9 @@ def createRectangleAccessor(name, fallback): # will be created by accessing the {@link #PdfFileReader.getPage getPage} # function of the {@link #PdfFileReader PdfFileReader} class. class PageObject(DictionaryObject): - def __init__(self, pdf, indirectReference = None): + def __init__(self, pdf): DictionaryObject.__init__(self) self.pdf = pdf - self.indirectReference = indirectReference ## # Rotates a page clockwise by increments of 90 degrees. @@ -1058,7 +1075,7 @@ def extractText(self): # implementation-defined manner. Default value: same as MediaBox. #
# Stability: Added in v1.4, will exist for all future v1.x releases.
- cropBox = createRectangleAccessor("/CropBox", ("/CropBox",))
+ cropBox = createRectangleAccessor("/CropBox", ("/MediaBox",))
##
# A rectangle (RectangleObject), expressed in default user space units,
@@ -1110,7 +1127,15 @@ def __parseContentStream(self, stream):
break
stream.seek(-1, 1)
if peek.isalpha() or peek == "'" or peek == '"':
- operator = readUntilWhitespace(stream, maxchars=2)
+ operator = ""
+ while True:
+ tok = stream.read(1)
+ if tok.isspace() or tok in NameObject.delimiterCharacters:
+ stream.seek(-1, 1)
+ break
+ elif tok == '':
+ break
+ operator += tok
if operator == "BI":
# begin inline image - a completely different parsing
# mechanism is required, of course... thanks buddy...
@@ -1120,6 +1145,14 @@ def __parseContentStream(self, stream):
else:
self.operations.append((operands, operator))
operands = []
+ elif peek == '%':
+ # If we encounter a comment in the content stream, we have to
+ # handle it here. Typically, readObject will handle
+ # encountering a comment -- but readObject assumes that
+ # following the comment must be the object we're trying to
+ # read. In this case, it could be an operator instead.
+ while peek not in ('\r', '\n'):
+ peek = stream.read(1)
else:
operands.append(readObject(stream, None))
@@ -1251,86 +1284,74 @@ def getText(self, key):
# See section 8.2.1 of the PDF 1.6 reference.
# Stability: Added in v1.10, will exist for all v1.x releases.
class Destination(DictionaryObject):
- def __init__(self, *args):
+ def __init__(self, title, page, typ, *args):
DictionaryObject.__init__(self)
- self.title = args[0]
- self["/Page"], self["/Type"] = args[1], args[2]
+ self[NameObject("/Title")] = title
+ self[NameObject("/Page")] = page
+ self[NameObject("/Type")] = typ
# from table 8.2 of the PDF 1.6 reference.
- mapNull = lambda x: {True: None, False: x}[isinstance(x, NullObject)]
- params = map(mapNull, args[3:])
- type = self["/Type"]
-
- if type == "/XYZ":
- self["/Left"], self["/Top"], self["/Zoom"] = params
- elif type == "/FitR":
- self["/Left"], self["/Bottom"], \
- self["/Right"], self["/Top"] = params
- elif type in ["/FitH", "FitBH"]:
- self["/Top"], = params
- elif type in ["/FitV", "FitBV"]:
- self["/Left"], = params
- elif type in ["/Fit", "FitB"]:
+ if typ == "/XYZ":
+ (self[NameObject("/Left")], self[NameObject("/Top")],
+ self[NameObject("/Zoom")]) = args
+ elif typ == "/FitR":
+ (self[NameObject("/Left")], self[NameObject("/Bottom")],
+ self[NameObject("/Right")], self[NameObject("/Top")]) = args
+ elif typ in ["/FitH", "FitBH"]:
+ self[NameObject("/Top")], = args
+ elif typ in ["/FitV", "FitBV"]:
+ self[NameObject("/Left")], = args
+ elif typ in ["/Fit", "FitB"]:
pass
else:
- raise utils.PdfReadError, "Unknown Destination Type: " + type
+ raise utils.PdfReadError("Unknown Destination Type: %r" % typ)
- def setTitle(self, title):
- self["/Title"] = title.strip()
-
##
- # Read-write property accessing the destination title.
+ # Read-only property accessing the destination title.
# @return A string.
- title = property(lambda self: self.get("/Title"), setTitle, None)
+ title = property(lambda self: self.get("/Title"))
##
# Read-only property accessing the destination page.
# @return An integer.
- page = property(lambda self: self.get("/Page"), None, None)
+ page = property(lambda self: self.get("/Page"))
##
# Read-only property accessing the destination type.
# @return A string.
- type = property(lambda self: self.get("/Type"), None, None)
+ typ = property(lambda self: self.get("/Type"))
##
# Read-only property accessing the zoom factor.
# @return A number, or None if not available.
- zoom = property(lambda self: self.get("/Zoom", None), None, None)
+ zoom = property(lambda self: self.get("/Zoom", None))
##
# Read-only property accessing the left horizontal coordinate.
# @return A number, or None if not available.
- left = property(lambda self: self.get("/Left", None), None, None)
+ left = property(lambda self: self.get("/Left", None))
##
# Read-only property accessing the right horizontal coordinate.
# @return A number, or None if not available.
- right = property(lambda self: self.get("/Right", None), None, None)
+ right = property(lambda self: self.get("/Right", None))
##
# Read-only property accessing the top vertical coordinate.
# @return A number, or None if not available.
- top = property(lambda self: self.get("/Top", None), None, None)
+ top = property(lambda self: self.get("/Top", None))
##
# Read-only property accessing the bottom vertical coordinate.
# @return A number, or None if not available.
- bottom = property(lambda self: self.get("/Bottom", None), None, None)
-
+ bottom = property(lambda self: self.get("/Bottom", None))
def convertToInt(d, size):
- if size <= 4:
- d = "\x00\x00\x00\x00" + d
- d = d[-4:]
- return struct.unpack(">l", d)[0]
- elif size <= 8:
- d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d
- d = d[-8:]
- return struct.unpack(">q", d)[0]
- else:
- # size too big
- assert False
+ if size > 8:
+ raise utils.PdfReadError("invalid size in convertToInt")
+ d = "\x00\x00\x00\x00\x00\x00\x00\x00" + d
+ d = d[-8:]
+ return struct.unpack(">q", d)[0]
# ref: pdf1.8 spec section 3.5.2 algorithm 3.2
_encryption_padding = '\x28\xbf\x4e\x5e\x4e\x75\x8a\x41\x64\x00\x4e\x56' + \
diff --git a/src/calibre/ebooks/pyPdf/utils.py b/src/pyPdf/utils.py
similarity index 90%
rename from src/calibre/ebooks/pyPdf/utils.py
rename to src/pyPdf/utils.py
index 860a42e669..dd0a3d002a 100644
--- a/src/calibre/ebooks/pyPdf/utils.py
+++ b/src/pyPdf/utils.py
@@ -34,6 +34,19 @@
__author__ = "Mathieu Fenniak"
__author_email__ = "biziqe@mathieu.fenniak.net"
+#ENABLE_PSYCO = False
+#if ENABLE_PSYCO:
+# try:
+# import psyco
+# except ImportError:
+# ENABLE_PSYCO = False
+#
+#if not ENABLE_PSYCO:
+# class psyco:
+# def proxy(func):
+# return func
+# proxy = staticmethod(proxy)
+
def readUntilWhitespace(stream, maxchars=None):
txt = ""
while True:
diff --git a/src/pyPdf/xmp.py b/src/pyPdf/xmp.py
new file mode 100644
index 0000000000..b070df9093
--- /dev/null
+++ b/src/pyPdf/xmp.py
@@ -0,0 +1,355 @@
+import re
+import datetime
+import decimal
+from generic import PdfObject
+from xml.dom import getDOMImplementation
+from xml.dom.minidom import parseString
+
+RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
+XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
+PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
+XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
+
+# What is the PDFX namespace, you might ask? I might ask that too. It's
+# a completely undocumented namespace used to place "custom metadata"
+# properties, which are arbitrary metadata properties with no semantic or
+# documented meaning. Elements in the namespace are key/value-style storage,
+# where the element name is the key and the content is the value. The keys
+# are transformed into valid XML identifiers by substituting an invalid
+# identifier character with \u2182 followed by the unicode hex ID of the
+# original character. A key like "my car" is therefore "my\u21820020car".
+#
+# \u2182, in case you're wondering, is the unicode character
+# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
+# escaping characters.
+#
+# Intentional users of the pdfx namespace should be shot on sight. A
+# custom data schema and sensical XML elements could be used instead, as is
+# suggested by Adobe's own documentation on XMP (under "Extensibility of
+# Schemas").
+#
+# Information presented here on the /pdfx/ schema is a result of limited
+# reverse engineering, and does not constitute a full specification.
+PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
+
+iso8601 = re.compile("""
+ (?P Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
+
+ ##
+ # Text describing the extent or scope of the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
+
+ ##
+ # A sorted array of names of the authors of the resource, listed in order
+ # of precedence.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
+
+ ##
+ # A sorted array of dates (datetime.datetime instances) of significance to
+ # the resource. The dates and times are in UTC.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
+
+ ##
+ # A language-keyed dictionary of textual descriptions of the content of the
+ # resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
+
+ ##
+ # The mime-type of the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
+
+ ##
+ # Unique identifier of the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
+
+ ##
+ # An unordered array specifying the languages used in the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
+
+ ##
+ # An unordered array of publisher names.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
+
+ ##
+ # An unordered array of text descriptions of relationships to other
+ # documents.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
+
+ ##
+ # A language-keyed dictionary of textual descriptions of the rights the
+ # user has to this resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
+
+ ##
+ # Unique identifier of the work from which this resource was derived.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
+
+ ##
+ # An unordered array of descriptive phrases or keywords that specify the
+ # topic of the content of the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
+
+ ##
+ # A language-keyed dictionary of the title of the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
+
+ ##
+ # An unordered array of textual descriptions of the document type.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
+
+ ##
+ # An unformatted text string representing document keywords.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
+
+ ##
+ # The PDF file version, for example 1.0, 1.3.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
+
+ ##
+ # The name of the tool that created the PDF document.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
+
+ ##
+ # The date and time the resource was originally created. The date and
+ # time are returned as a UTC datetime.datetime object.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
+
+ ##
+ # The date and time the resource was last modified. The date and time
+ # are returned as a UTC datetime.datetime object.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
+
+ ##
+ # The date and time that any metadata for this resource was last
+ # changed. The date and time are returned as a UTC datetime.datetime
+ # object.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
+
+ ##
+ # The name of the first known tool used to create the resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
+
+ ##
+ # The common identifier for all versions and renditions of this resource.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
+
+ ##
+ # An identifier for a specific incarnation of a document, updated each
+ # time a file is saved.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
+
+ def custom_properties(self):
+ if not hasattr(self, "_custom_properties"):
+ self._custom_properties = {}
+ for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
+ key = node.localName
+ while True:
+ # see documentation about PDFX_NAMESPACE earlier in file
+ idx = key.find(u"\u2182")
+ if idx == -1:
+ break
+ key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
+ if node.nodeType == node.ATTRIBUTE_NODE:
+ value = node.nodeValue
+ else:
+ value = self._getText(node)
+ self._custom_properties[key] = value
+ return self._custom_properties
+
+ ##
+ # Retrieves custom metadata properties defined in the undocumented pdfx
+ # metadata schema.
+ # Stability: Added in v1.12, will exist for all future v1.x releases.
+ # @return Returns a dictionary of key/value items for custom metadata
+ # properties.
+ custom_properties = property(custom_properties)
+
+