mirror of
git://github.com/kovidgoyal/calibre.git
synced 2025-12-27 09:05:57 +01:00
wip: tcr compression
regex flags do not need "u" when the search string does not make use of the feature. In Python 3, re.U is the default for unicode strings. For byte strings, which is what we use, it is a fatal error.
This commit is contained in:
parent
51d4b5a5e9
commit
8311b26205
1 changed file with 4 additions and 5 deletions
|
|
@ -1,5 +1,4 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
|
|
@ -35,10 +34,10 @@ def _combine_codes(self):
|
|||
The intent is to create more unused codes.
|
||||
'''
|
||||
possible_codes = []
|
||||
a_code = set(re.findall(b'(?msu).', self.coded_txt))
|
||||
a_code = set(re.findall(b'(?ms).', self.coded_txt))
|
||||
|
||||
for code in a_code:
|
||||
single_code = set(re.findall(b'(?msu)%s.' % re.escape(code), self.coded_txt))
|
||||
single_code = set(re.findall(b'(?ms)%s.' % re.escape(code), self.coded_txt))
|
||||
if len(single_code) == 1:
|
||||
possible_codes.append(single_code.pop())
|
||||
|
||||
|
|
@ -60,7 +59,7 @@ def _new_codes(self):
|
|||
'''
|
||||
Create new codes from codes that occur in pairs often.
|
||||
'''
|
||||
possible_new_codes = list(set(re.findall(b'(?msu)..', self.coded_txt)))
|
||||
possible_new_codes = list(set(re.findall(b'(?ms)..', self.coded_txt)))
|
||||
new_codes_count = []
|
||||
|
||||
for c in possible_new_codes:
|
||||
|
|
@ -77,7 +76,7 @@ def _new_codes(self):
|
|||
def compress(self, txt):
|
||||
self._reset()
|
||||
|
||||
self.codes = list(set(re.findall(b'(?msu).', txt)))
|
||||
self.codes = list(set(re.findall(b'(?ms).', txt)))
|
||||
|
||||
# Replace the text with their corresponding code
|
||||
for c in txt:
|
||||
|
|
|
|||
Loading…
Reference in a new issue