Testing JSON vs Msgpack serialisation and compression.
import gzip
import json
import msgpack
import random
import sys
import time

random.seed(42)


def generate_row():
    row = []
    # Generate floating point values which represent
    # 40 distinct features with 5 different lookback periods
    for _ in range(40):
        for _ in range(5):
            row.append(random.random())
    # Generate floating point values which represent
    # the user's interaction with 50 distinct categories
    for _ in range(50):
        row.append(random.random())
    # Generate floating point values which represent
    # 5 distinct 10-dimensional embeddings
    for _ in range(5):
        for _ in range(10):
            row.append(random.random())
    # Generate floating point values which represent
    # a single 128-dimensional embedding
    for _ in range(128):
        row.append(random.random())
    return row


def generate_features(num_rows=1000):
    lookback_features = [f"fl{i}_{p}" for i in range(40) for p in range(5)]
    interact_features = [f"fi{i}" for i in range(50)]
    # One header per embedding dimension so the names line up with generate_row()
    cat_features = [f"fc{i}_{d}" for i in range(5) for d in range(10)]
    emb_features = [f"fe{d}" for d in range(128)]
    headers = lookback_features + interact_features + cat_features + emb_features
    features = [generate_row() for _ in range(num_rows)]
    return headers, features


def construct_dataset(header, features):
    updates = [[i, time.time()] + row for i, row in enumerate(features)]
    return {
        "request_id": "aljaslcjals",
        "feature_store": "acksnkc",
        "feature_names": ["entity_id", "updated_at"] + header,
        "updates": updates,
    }


def main():
    num_rows = 10000
    print(f"Generating {num_rows} feature rows")
    dataset = construct_dataset(*generate_features(num_rows))
    print(f"with {len(dataset['updates'][0])} values per row")

    start = time.time()
    dataset_json = json.dumps(dataset)
    end = time.time()
    print(f"Json size        : {len(dataset_json):,} bytes in {end - start:.5g}s")

    start = time.time()
    dataset_msgpack = msgpack.dumps(dataset)
    end = time.time()
    print(f"Msgpack size     : {len(dataset_msgpack):,} bytes in {end - start:.5g}s")

    start = time.time()
    json_deflated = gzip.compress(dataset_json.encode("utf-8"))
    end = time.time()
    print(f"Gzip json size   : {len(json_deflated):,} bytes in {end - start:.5g}s")

    start = time.time()
    msgpack_deflated = gzip.compress(dataset_msgpack)
    end = time.time()
    print(f"Gzip msgpack size: {len(msgpack_deflated):,} bytes in {end - start:.5g}s")


if __name__ == "__main__":
    main()
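The script above only times the encode path. As a complementary sketch, not part of the original gist, the decode path simply reverses the steps: gzip-decompress each payload, then deserialise it with the matching library and check that both round trips recover the same structure. The small "sample" dict below is an illustrative stand-in for the dataset built by construct_dataset(), and the string-key behaviour assumes msgpack-python 1.0+ (where loads decodes strings to str by default).

import gzip
import json
import msgpack

# Small stand-in payload with the same shape as the real dataset (illustrative only).
sample = {
    "feature_names": ["entity_id", "updated_at", "fi0"],
    "updates": [[0, 1689051600.0, 0.639], [1, 1689051601.0, 0.025]],
}

json_deflated = gzip.compress(json.dumps(sample).encode("utf-8"))
msgpack_deflated = gzip.compress(msgpack.dumps(sample))

# Decode path: decompress, then deserialise with the matching library.
from_json = json.loads(gzip.decompress(json_deflated).decode("utf-8"))
from_msgpack = msgpack.loads(gzip.decompress(msgpack_deflated))

# Both round trips should recover the same nested structure.
assert from_json == from_msgpack == sample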