wip: tcr compression

regex flags do not need "u" when the search string does not make use of the feature. In Python 3, re.U is the default for unicode strings. For byte strings, which is what we use, it is a fatal error.
2025-12-27 09:05:57 +01:00 · 2019-05-01 10:54:40 -04:00 · 2019-05-01 10:54:40 -04:00 · 8311b26205
commit 8311b26205
parent 51d4b5a5e9
1 changed files with 4 additions and 5 deletions
--- a/src/calibre/ebooks/compression/tcr.py
+++ b/src/calibre/ebooks/compression/tcr.py
@ -1,5 +1,4 @@
 # -*- coding: utf-8 -*-
-
 from __future__ import absolute_import, division, print_function, unicode_literals

 __license__ = 'GPL 3'
@ -35,10 +34,10 @@ def _combine_codes(self):
        The intent is to create more unused codes.
        '''
        possible_codes = []
-        a_code = set(re.findall(b'(?msu).', self.coded_txt))
+        a_code = set(re.findall(b'(?ms).', self.coded_txt))

        for code in a_code:
-            single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
+            single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
            if len(single_code) == 1:
                possible_codes.append(single_code.pop())

@ -60,7 +59,7 @@ def _new_codes(self):
        '''
        Create new codes from codes that occur in pairs often.
        '''
-        possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
+        possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
        new_codes_count = []

        for c in possible_new_codes:
@ -77,7 +76,7 @@ def _new_codes(self):
    def compress(self, txt):
        self._reset()

-        self.codes = list(set(re.findall(b'(?msu).', txt)))
+        self.codes = list(set(re.findall(b'(?ms).', txt)))

        # Replace the text with their corresponding code
        for c in txt: