From 8311b262050e5105d1c7e35ff4509e437f9fc7ea Mon Sep 17 00:00:00 2001
From: Eli Schwartz <eschwartz@archlinux.org>
Date: Wed, 1 May 2019 10:54:40 -0400
Subject: [PATCH 1/2] wip: tcr compression

The regex flags do not need "u" when the pattern does not rely on Unicode matching semantics.

In python3, re.U is the default for unicode (str) patterns. For bytes
patterns, which is what we use, it is a fatal error: re raises
"ValueError: cannot use UNICODE flag with a bytes pattern".
---
 src/calibre/ebooks/compression/tcr.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py
index 565399eb4d..6abd243fbb 100644
--- a/src/calibre/ebooks/compression/tcr.py
+++ b/src/calibre/ebooks/compression/tcr.py
@@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 __license__ = 'GPL 3'
@@ -35,10 +34,10 @@ def _combine_codes(self):
         The intent is to create more unused codes.
         '''
         possible_codes = []
-        a_code = set(re.findall(b'(?msu).', self.coded_txt))
+        a_code = set(re.findall(b'(?ms).', self.coded_txt))
 
         for code in a_code:
-            single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
+            single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
             if len(single_code) == 1:
                 possible_codes.append(single_code.pop())
 
@@ -60,7 +59,7 @@ def _new_codes(self):
         '''
         Create new codes from codes that occur in pairs often.
         '''
-        possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
+        possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
         new_codes_count = []
 
         for c in possible_new_codes:
@@ -77,7 +76,7 @@ def _new_codes(self):
     def compress(self, txt):
         self._reset()
 
-        self.codes = list(set(re.findall(b'(?msu).', txt)))
+        self.codes = list(set(re.findall(b'(?ms).', txt)))
 
         # Replace the text with their corresponding code
         for c in txt:

From 28767243257769847fa82f0465f436ad4fdcf30d Mon Sep 17 00:00:00 2001
From: Eli Schwartz <eschwartz@archlinux.org>
Date: Thu, 12 Sep 2019 19:51:20 -0400
Subject: [PATCH 2/2] py3: make tcr input/output work

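Fix tcr compression on python3 by working with bytearrays on both
python2 and python3, so that iterating over or indexing the data always
yields integers, and by converting those integers back to single bytes
with int_to_byte where the code table is searched.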
---
 src/calibre/ebooks/compression/tcr.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/calibre/ebooks/compression/tcr.py b/src/calibre/ebooks/compression/tcr.py
index 6abd243fbb..3080084196 100644
--- a/src/calibre/ebooks/compression/tcr.py
+++ b/src/calibre/ebooks/compression/tcr.py
@@ -6,7 +6,7 @@
 __docformat__ = 'restructuredtext en'
 
 import re
-from polyglot.builtins import range, int_to_byte
+from polyglot.builtins import int_to_byte, is_py3, range
 
 
 class TCRCompressor(object):
@@ -42,8 +42,10 @@ def _combine_codes(self):
                 possible_codes.append(single_code.pop())
 
         for code in possible_codes:
+            if not is_py3:
+                code = bytearray(code)
             self.coded_txt = self.coded_txt.replace(code, code[0:1])
-            self.codes[ord(code[0:1])] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])])
+            self.codes[code[0]] = b'%s%s' % (self.codes[code[0]], self.codes[code[1]])
 
     def _free_unused_codes(self):
         '''
@@ -79,8 +81,9 @@ def compress(self, txt):
         self.codes = list(set(re.findall(b'(?ms).', txt)))
 
         # Replace the text with their corresponding code
-        for c in txt:
-            self.coded_txt += int_to_byte(self.codes.index(c))
+        # FIXME: bytearray() makes iteration yield ints on both python2 and python3, but all we want are single bytes
+        for c in bytearray(txt):
+            self.coded_txt += int_to_byte(self.codes.index(int_to_byte(c)))
 
         # Zero the unused codes and record which are unused.
         for i in range(len(self.codes), 256):
@@ -95,9 +98,9 @@ def compress(self, txt):
                 unused_code = self.unused_codes.pop()
                 # Take the last possible codes and split it into individual
                 # codes. The last possible code is the most often occurring.
-                code1, code2 = possible_codes.pop()
-                self.codes[unused_code] = b'%s%s' % (self.codes[ord(code1)], self.codes[ord(code2)])
-                self.coded_txt = self.coded_txt.replace(b'%s%s' % (code1, code2), int_to_byte(unused_code))
+                code = possible_codes.pop()
+                self.codes[unused_code] = b'%s%s' % (self.codes[ord(code[0:1])], self.codes[ord(code[1:2])])
+                self.coded_txt = self.coded_txt.replace(code, int_to_byte(unused_code))
             self._combine_codes()
             self._free_unused_codes()
             possible_codes = self._new_codes()