From 8311b262050e5105d1c7e35ff4509e437f9fc7ea Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Wed, 1 May 2019 10:54:40 -0400 Subject: [PATCH 1/2] wip: tcr compression regex flags do not need "u" when the search string does not make use of the feature In python3, re.U is the default for unicode strings. For byte strings, which is what we use, it is a fatal error. --- src/calibre/ebooks/compression/tcr.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 565399eb4d..6abd243fbb 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- - from __future__ import absolute_import, division, print_function, unicode_literals __license__ = 'GPL 3' @@ -35,10 +34,10 @@ def _combine_codes(self): The intent is to create more unused codes. ''' possible_codes = [] - a_code = set(re.findall(b'(?msu).', self.coded_txt)) + a_code = set(re.findall(b'(?ms).', self.coded_txt)) for code in a_code: - single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt)) + single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt)) if len(single_code) == 1: possible_codes.append(single_code.pop()) @@ -60,7 +59,7 @@ def _new_codes(self): ''' Create new codes from codes that occur in pairs often. ''' - possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt))) + possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt))) new_codes_count = [] for c in possible_new_codes: @@ -77,7 +76,7 @@ def _new_codes(self): def compress(self, txt): self._reset() - self.codes = list(set(re.findall(b'(?msu).', txt))) + self.codes = list(set(re.findall(b'(?ms).', txt))) # Replace the text with their corresponding code for c in txt: From 28767243257769847fa82f0465f436ad4fdcf30d Mon Sep 17 00:00:00 2001 From: Eli Schwartz Date: Thu, 12 Sep 2019 19:51:20 -0400 Subject: [PATCH 2/2] py3: make tcr input/output work Fix tcr compression by unifying bytearrays on python2/python3 and acting appropriately. --- src/calibre/ebooks/compression/tcr.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py index 6abd243fbb..3080084196 100644 --- a/src/calibre/ebooks/compression/tcr.py +++ b/src/calibre/ebooks/compression/tcr.py @@ -6,7 +6,7 @@ __docformat__ = 'restructuredtext en' import re -from polyglot.builtins import range, int_to_byte +from polyglot.builtins import int_to_byte, is_py3, range class TCRCompressor(object): @@ -42,8 +42,10 @@ def _combine_codes(self): possible_codes.append(single_code.pop()) for code in possible_codes: + if not is_py3: + code = bytearray(code) self.coded_txt = self.coded_txt.replace(code, code[0:1]) - self.codes[ord(code[0:1])] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])]) + self.codes[code[0]] = b'%s%s' % (self.codes[code[0]], self.codes[code[1]]) def _free_unused_codes(self): ''' @@ -79,8 +81,9 @@ def compress(self, txt): self.codes = list(set(re.findall(b'(?ms).', txt))) # Replace the text with their corresponding code - for c in txt: - self.coded_txt += int_to_byte(self.codes.index(c)) + # FIXME: python3 is native bytearray, but all we want are bytes + for c in bytearray(txt): + self.coded_txt += int_to_byte(self.codes.index(int_to_byte(c))) # Zero the unused codes and record which are unused. for i in range(len(self.codes), 256): @@ -95,9 +98,9 @@ def compress(self, txt): unused_code = self.unused_codes.pop() # Take the last possible codes and split it into individual # codes. The last possible code is the most often occurring. - code1, code2 = possible_codes.pop() - self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)]) - self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code)) + code = possible_codes.pop() + self.codes[unused_code] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])]) + self.coded_txt = self.coded_txt.replace(code, int_to_byte(unused_code)) self._combine_codes() self._free_unused_codes() possible_codes = self._new_codes()