Skip to content

Instantly share code, notes, and snippets.

@kojiishi
Created August 20, 2021 06:03
Show Gist options
  • Save kojiishi/ba93f5ab9f10316b183fa7e9ec19ac2f to your computer and use it in GitHub Desktop.
Save kojiishi/ba93f5ab9f10316b183fa7e9ec19ac2f to your computer and use it in GitHub Desktop.
UnicodeDataCompressor
#!/usr/bin/env python3
import gzip
from typing import Dict
from unicodedata_parser import *
class UnicodeDataCompressor(object):
    """Encode the change points of a (code, value) sequence as a byte stream.

    `add` walks the pairs, writes one record every time the value changes,
    prints per-record and summary statistics (including the gzip-compressed
    size of the stream), and returns the encoded bytes.
    """

    def add(self, code_and_values):
        """Encode `code_and_values` and print size statistics.

        Each time the value differs from the previous one, a record is
        written consisting of:

        * the index of the *new* value (indices are handed out in order of
          first appearance, starting at 0), as a single byte, and
        * the distance from the previous change point to this one, as a
          variable-length integer (see `_add`).

        NOTE(review): nothing is written for the very first pair, and no
        record is written after the last change point, so the first value
        and the length of the final run are not part of the stream --
        confirm whether a decoder is expected to know them from elsewhere.

        Args:
            code_and_values: Iterable of `(code, value)` pairs. `code` is
                an integer; pairs are processed in the order given, and a
                change point with a smaller code than the previous one
                yields a negative distance, which `bytearray.append`
                rejects with `ValueError`. `value` must be truthy and
                hashable. At most 256 distinct values can be indexed,
                because the index is stored in one byte.

        Returns:
            The encoded stream as `bytes` (empty when the value never
            changes or the input is empty).
        """
        buffer = bytearray()
        value_indices = {}
        next_value_index = 0
        last_value = None
        last_value_start = -1
        for code, value in code_and_values:
            assert value
            if value == last_value:
                # Still inside the current run; wait for the next change.
                continue
            if last_value is not None:
                # `setdefault` returns the existing index, or registers
                # `next_value_index` for a value seen for the first time.
                value_index = value_indices.setdefault(value, next_value_index)
                if value_index == next_value_index:
                    next_value_index += 1
                count = code - last_value_start
                print(
                    f'{code:04X} {value}: {value_index} {code - last_value_start}'
                )
                self._add(value_index, count, buffer)
            last_value = value
            last_value_start = code
        # Summary: distinct indexed values, raw size, gzip-compressed size.
        print(next_value_index, len(buffer), len(gzip.compress(buffer)))
        return bytes(buffer)

    def _add(self, value_index, count, buffer):
        """Append one `(value_index, count)` record to `buffer`.

        `value_index` is stored as a single byte. `count` follows as a
        little-endian base-128 integer: seven payload bits per byte, least
        significant group first, with the high bit set on every byte except
        the last.

        Args:
            value_index: Integer in `range(256)`.
            count: Non-negative integer.
            buffer: `bytearray` that receives the record (mutated in place).

        Raises:
            ValueError: From `bytearray.append` if `value_index` is outside
                `range(256)` or `count` is negative.
        """
        buffer.append(value_index)
        while count >= 0x80:
            # Low seven bits plus the continuation flag.
            buffer.append((count & 0x7F) | 0x80)
            count >>= 7
        buffer.append(count)
def main():
    """Run the compressor over the parser's line-break data.

    Builds a `UnicodeDataParser`, takes the result of its `line_break()`
    call, and passes that object's `items()` to
    `UnicodeDataCompressor.add`, which prints the statistics.

    NOTE(review): `line_break()` presumably returns a mapping from code
    point to line-break class -- verify against `unicodedata_parser`.
    """
    parser = UnicodeDataParser()
    lb = parser.line_break()
    compressor = UnicodeDataCompressor()
    compressor.add(lb.items())
# Run the experiment only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment