Skip to content

Instantly share code, notes, and snippets.

@Hello1024
Created March 16, 2022 14:14
Show Gist options
  • Save Hello1024/9cc20300205c6aec214519cf84bbebb3 to your computer and use it in GitHub Desktop.
Save Hello1024/9cc20300205c6aec214519cf84bbebb3 to your computer and use it in GitHub Desktop.
Test code to try out hierarchical compression for ZIM files.
#!/bin/python3
import pyzstd
import os
import re
from collections import Counter
import itertools
# All files in the "wiki/A" directory get added by default.
# Create that directory with:
# zimdump dump --dir=wiki wikipedia_nb_50000_nopic_2021-05.zim
def getData(root_dir="wiki/A"):
    """Yield the raw bytes of every file found under *root_dir*.

    The tree is walked bottom-up (``topdown=False``), so the files of a
    sub-directory are yielded before the files of its parent directory.
    The order of files inside one directory is whatever ``os.walk``
    reports.  A *root_dir* that does not exist simply yields nothing,
    because ``os.walk`` ignores listing errors unless ``onerror`` is given.

    Args:
        root_dir: Directory to read recursively.  Defaults to ``"wiki/A"``,
            the location this script has always read from.

    Yields:
        The complete contents of one file, as ``bytes``.
    """
    for dirpath, _dirnames, filenames in os.walk(root_dir, topdown=False):
        for filename in filenames:
            # Binary mode is important: the data is compressed as raw bytes.
            with open(os.path.join(dirpath, filename), mode="rb") as handle:
                yield handle.read()
def compressor(parent_blobs, blob, level=1):
    """Compress *blob* with *parent_blobs* as the preceding stream content.

    A fresh ``pyzstd.ZstdCompressor`` is fed every parent blob in order and
    then *blob*.  Only the bytes produced for *blob* are returned; the bytes
    produced while feeding the parents are discarded.  The result is
    therefore not decodable on its own: the decompressor must first be fed
    the compressed output of the parents (see ``decompressNode``).

    Args:
        parent_blobs: Iterable of uncompressed ``bytes`` objects, root first.
        blob: Uncompressed ``bytes`` to compress.
        level: Compression level.  1 is good for testing. 22 is highest.

    Returns:
        The compressed bytes emitted for *blob* only.
    """
    c = pyzstd.ZstdCompressor(level)
    # Note this could be made far more CPU efficient by caching
    # the ZstdCompressor object rather than recompressing all
    # parents from scratch each time. Requires ZstdCompressor be duplicatable,
    # which in the python API it is not.
    for d in parent_blobs:
        # Output intentionally dropped: the parents are only replayed to
        # bring the compressor into the same state it had when they were
        # originally compressed.
        c.compress(d, c.FLUSH_BLOCK)
    # NOTE(review): FLUSH_BLOCK presumably makes everything fed so far
    # decodable without ending the frame -- confirm against pyzstd docs.
    return c.compress(blob, c.FLUSH_BLOCK)
def buildTreeNode(parent_node, parent_blobs, datasource, *,
                  docs_per_node=40, branching=5,
                  max_parent_bytes=2000000, compress=None):
    """Build one tree node plus all of its descendants from *datasource*.

    Up to *docs_per_node* documents are pulled from *datasource*,
    concatenated, and compressed with the ancestors' uncompressed data
    (*parent_blobs*) as context.  The function then recurses *branching*
    times to create children that use this node's data as extra context.

    Args:
        parent_node: The node dict of the parent, or ``None`` for a root.
        parent_blobs: List of the ancestors' uncompressed ``bytes``, root
            first.  It is never mutated.
        datasource: An *iterator* of ``bytes`` documents.  It is consumed
            incrementally and shared by every recursive call.
        docs_per_node: Maximum number of documents stored in one node.
        branching: Number of child sub-trees attempted per node.
        max_parent_bytes: No node is created once the ancestors' total
            uncompressed size reaches this many bytes; this bounds how much
            must be decompressed to reach any node.
        compress: Callable ``(parent_blobs, blob) -> bytes``.  Defaults to
            this module's ``compressor``.

    Returns:
        A list of node dicts ``{'parent': ..., 'cdata': ...}`` with this
        node first, followed by its descendants.  The list is empty when
        the size limit is reached (nothing is consumed in that case) or
        when *datasource* is exhausted.
    """
    if compress is None:
        compress = compressor

    # Sum the lengths directly instead of joining all blobs just to measure.
    if sum(len(blob) for blob in parent_blobs) >= max_parent_bytes:
        return []

    # Take up to docs_per_node documents at a time.  A final batch that is
    # shorter than docs_per_node is still stored: taking N-1 items and then
    # calling next() would throw away the already-consumed documents
    # whenever fewer than N remain.
    docs = list(itertools.islice(datasource, docs_per_node))
    if not docs:
        return []
    data = b''.join(docs)

    this_node = {'parent': parent_node, 'cdata': compress(parent_blobs, data)}
    compressed_output = [this_node]

    # Every child sees the same ancestor chain; build it once.
    child_blobs = parent_blobs + [data]
    for _ in range(branching):
        compressed_output += buildTreeNode(
            this_node, child_blobs, datasource,
            docs_per_node=docs_per_node,
            branching=branching,
            max_parent_bytes=max_parent_bytes,
            compress=compress,
        )
    return compressed_output
# Compresses a datasource into a list of compressed blobs, each with an
# accompanying pointer to a parent blob needed to decode it.
def buildTree(datasource):
    """Compress every document of *datasource* into a forest of nodes.

    Root-level trees are built one after another with ``buildTreeNode``
    until it returns an empty list for a root, which is taken to mean the
    data source is exhausted.

    Args:
        datasource: Any iterable of ``bytes`` documents (a generator, a
            list, ...).

    Returns:
        A flat list of node dicts as produced by ``buildTreeNode``; root
        nodes have ``node['parent'] is None``.
    """
    # buildTreeNode pulls documents incrementally across many recursive
    # calls, so they must all share one iterator.  iter() is a no-op for an
    # iterator and makes plain sequences work as well.
    datasource = iter(datasource)

    compressed_output = []
    while True:
        child_nodelist = buildTreeNode(None, [], datasource)
        if not child_nodelist:
            break
        compressed_output += child_nodelist
    return compressed_output
def decompressNode(node):
    """Decompress *node*, first replaying all of its ancestors.

    A node's ``'cdata'`` is only the tail of a compressed stream, so the
    chain of ancestors is decompressed root-first through one shared
    ``pyzstd.ZstdDecompressor`` before *node* itself is decoded.

    Args:
        node: A node dict with the keys ``'parent'`` (another node dict or
            ``None`` for a root) and ``'cdata'`` (compressed ``bytes``).

    Returns:
        A 3-tuple ``(data, decompressor, ancestor_data)``:

        * ``data`` -- the uncompressed bytes of *node*;
        * ``decompressor`` -- the decompressor after decoding *node*, so a
          caller can continue with a child of *node*;
        * ``ancestor_data`` -- the uncompressed bytes of all ancestors,
          concatenated nearest ancestor first (``b''`` for a root).
    """
    if node['parent'] is None:
        # A root starts a brand new stream and has no ancestor data.
        d = pyzstd.ZstdDecompressor()
        parentData = parentParentData = b''
    else:
        # Recurse so the decompressor has already consumed every ancestor.
        parentData, d, parentParentData = decompressNode(node['parent'])
    return d.decompress(node['cdata']), d, parentData + parentParentData
def main():
    """Compress the dumped articles, print statistics, and decode one node.

    The whole data set returned by ``getData`` is compressed with
    ``buildTree``.  Size and node counts are printed, then one sample node
    is decompressed to show how many ancestor bytes had to be decoded to
    reach it.
    """
    print("compressing!")
    datastore = buildTree(getData())

    print("Compressed Data size is: ", sum(len(a['cdata']) for a in datastore))
    print("Nodes: ", len(datastore))
    print("Top level nodes: ", sum(1 for a in datastore if a['parent'] is None))

    if not datastore:
        # Nothing was read (e.g. the dump directory is missing or empty).
        print("no nodes were produced; nothing to decompress")
        return

    print("decompressing arbitary node")
    # Node 37 is the sample used so far; clamp the index so that small
    # data sets with fewer nodes do not raise IndexError.
    sample_node = datastore[min(37, len(datastore) - 1)]
    data, _, parentData = decompressNode(sample_node)

    print("contents: ", data[:20])
    print("bytes decompressed for parent Nodes: ", len(parentData))
    print("bytes decompressed in target Node: ", len(data))
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment