When creating ZIP/EPUB files, encode all file names in UTF-8 and set the UTF-8 flag bit (0x800) in each entry's flag bits. When extracting from a ZIP/EPUB file, if the UTF-8 flag bit is not set, still assume that the file name is encoded in UTF-8. If UTF-8 decoding fails, try to detect the character encoding using chardet. This is because most ZIP/EPUB files calibre comes across seem to be UTF-8 encoded.

This commit is contained in:
Kovid Goyal 2010-03-06 11:06:40 -07:00
parent cd531451bb
commit 65c3a9c2a3

View file

@ -138,12 +138,16 @@ class LargeZipFile(Exception):
def decode_arcname(name):
    """Return the archive member name ``name`` as a unicode string.

    Unicode input is returned unchanged. Byte strings are decoded in
    three steps, stopping at the first one that succeeds:

    1. strict UTF-8, because most archives seen in practice use it even
       when the UTF-8 flag bit is not set;
    2. the encoding reported by ``detect`` (presumably chardet's
       ``detect`` -- confirm against this module's imports);
    3. UTF-8 with undecodable bytes replaced, so a name is always
       produced.

    The result is *not* re-encoded to the filesystem encoding; callers
    that need a byte path must encode it themselves.
    """
    if not isinstance(name, unicode):
        try:
            name = name.decode('utf-8')
        except Exception:
            # Not valid UTF-8: fall back to statistical detection.
            res = detect(name)
            encoding = res['encoding']
            try:
                # ``encoding`` may be None or an unknown codec name; both
                # raise here and are handled by the lossy fallback below.
                name = name.decode(encoding)
            except Exception:
                name = name.decode('utf-8', 'replace')
    return name
def is_zipfile(filename):
@ -352,10 +356,7 @@ def FileHeader(self):
def _encodeFilenameFlags(self):
    """Return ``(filename_bytes, flag_bits)`` for writing this entry's header.

    A unicode ``self.filename`` is always encoded as UTF-8 and the
    0x800 flag bit is set on the returned flags, even when the name is
    pure ASCII, so readers are told explicitly that the name is UTF-8.
    A byte-string ``self.filename`` is passed through unchanged together
    with the unmodified ``self.flag_bits``.
    """
    if isinstance(self.filename, unicode):
        # 0x800 is the flag bit that marks the stored name as UTF-8.
        return self.filename.encode('utf-8'), self.flag_bits | 0x800
    else:
        return self.filename, self.flag_bits
@ -363,7 +364,7 @@ def _decodeFilename(self):
if self.flag_bits & 0x800:
return self.filename.decode('utf-8')
else:
return self.filename
return decode_arcname(self.filename)
def _decodeExtra(self):
# Try to decode the extra field.
@ -1059,7 +1060,9 @@ def _extract_member(self, member, targetpath, pwd):
targetpath = targetpath[:-1]
# don't include leading "/" from file name if present
fname = decode_arcname(member.filename)
fname = member.filename
if isinstance(fname, unicode):
fname = fname.encode(filesystem_encoding, 'replace')
if fname.startswith('/'):
fname = fname[1:]
targetpath = os.path.join(targetpath, fname)
@ -1111,8 +1114,6 @@ def _writecheck(self, zinfo):
def write(self, filename, arcname=None, compress_type=None):
"""Put the bytes from filename into the archive under the name
arcname."""
if isinstance(filename, unicode):
filename = filename.encode('utf-8')
if not self.fp:
raise RuntimeError(
"Attempt to write to ZIP archive that was already closed")
@ -1126,6 +1127,8 @@ def write(self, filename, arcname=None, compress_type=None):
arcname = os.path.normpath(os.path.splitdrive(arcname)[1])
while arcname[0] in (os.sep, os.altsep):
arcname = arcname[1:]
if not isinstance(arcname, unicode):
arcname = arcname.decode(filesystem_encoding)
zinfo = ZipInfo(arcname, date_time)
zinfo.external_attr = (st[0] & 0xFFFF) << 16L # Unix attributes
if compress_type is None:
@ -1187,8 +1190,8 @@ def writestr(self, zinfo_or_arcname, bytes, permissions=0600,
assert not raw_bytes or (raw_bytes and
isinstance(zinfo_or_arcname, ZipInfo))
if not isinstance(zinfo_or_arcname, ZipInfo):
if isinstance(zinfo_or_arcname, unicode):
zinfo_or_arcname = zinfo_or_arcname.encode('utf-8')
if not isinstance(zinfo_or_arcname, unicode):
zinfo_or_arcname = zinfo_or_arcname.decode(filesystem_encoding)
zinfo = ZipInfo(filename=zinfo_or_arcname,
date_time=time.localtime(time.time())[:6])
zinfo.compress_type = compression