Skip to content

Instantly share code, notes, and snippets.

@kochhar
Created July 11, 2023 05:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kochhar/926a69ae48eaff618af8686086ad95e7 to your computer and use it in GitHub Desktop.
Save kochhar/926a69ae48eaff618af8686086ad95e7 to your computer and use it in GitHub Desktop.
Testing JSON vs Msgpack serialisation and compression.
import gzip
import json
import msgpack
import random
import sys
import time
# Seed the module-level PRNG with a fixed value so that every run of this
# script generates the same sequence of feature values.
random.seed(42)
def generate_row():
    """Return one synthetic feature row as a flat list of random floats.

    The row is laid out, in order, as:

    * 40 distinct features x 5 lookback periods (200 values)
    * the user's interaction with 50 distinct categories (50 values)
    * 5 distinct 10-dimensional embeddings (50 values)
    * a single 128-dimensional embedding (128 values)

    Every value is drawn with ``random.random()``, so the row holds 428
    floats in ``[0.0, 1.0)`` and is reproducible for a given PRNG seed.
    """
    lookback_width = 40 * 5
    interaction_width = 50
    category_embedding_width = 5 * 10
    embedding_width = 128
    row_width = (
        lookback_width
        + interaction_width
        + category_embedding_width
        + embedding_width
    )
    # All groups are drawn from the same distribution one after another, so a
    # single pass consumes the PRNG in the same order as per-group loops.
    return [random.random() for _ in range(row_width)]
def generate_features(num_rows=1000):
    """Generate column headers and ``num_rows`` rows of random features.

    The headers follow the same layout as the rows built by
    ``generate_row``: 40x5 lookback features (``fl{i}_{p}``), 50 interaction
    features (``fi{i}``), 5 ten-dimensional embeddings (``fc{i}_{d}``) and
    one 128-dimensional embedding (``fe{d}``). Each header is unique and
    there is exactly one header per row value.

    Args:
        num_rows: Number of feature rows to generate.

    Returns:
        A ``(headers, features)`` tuple where ``headers`` is a list of column
        names and ``features`` is a list of ``num_rows`` rows.
    """
    lookback_features = [f"fl{i}_{p}" for i in range(40) for p in range(5)]
    interact_features = [f"fi{i}" for i in range(50)]
    # Include the dimension index so each embedding component gets its own
    # name instead of repeating "fc{i}" ten times.
    cat_features = [f"fc{i}_{d}" for i in range(5) for d in range(10)]
    # One header per component of the 128-dimensional embedding, so the
    # header count matches the number of values in a row.
    emb_features = [f"fe{d}" for d in range(128)]
    headers = (
        lookback_features + interact_features + cat_features + emb_features
    )
    features = [generate_row() for _ in range(num_rows)]
    return headers, features
def construct_dataset(header, features):
    """Wrap feature rows in the payload dict that gets serialised.

    Each row is prefixed with its index (as the entity id) and the current
    ``time.time()`` value (as the update timestamp); the column names are
    prefixed with ``"entity_id"`` and ``"updated_at"`` to match.

    Args:
        header: List of feature column names.
        features: Iterable of feature rows.

    Returns:
        A dict with the keys ``request_id``, ``feature_store``,
        ``feature_names`` and ``updates``. The inputs are not modified.
    """
    updates = [
        [entity_id, time.time(), *row]
        for entity_id, row in enumerate(features)
    ]
    return {
        "request_id": "aljaslcjals",
        "feature_store": "acksnkc",
        "feature_names": ["entity_id", "updated_at"] + header,
        "updates": updates,
    }
def _timed(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring short intervals.
    start = time.perf_counter()
    result = func(*args)
    return result, time.perf_counter() - start


def _json_bytes(obj):
    """Serialise ``obj`` to UTF-8 encoded JSON bytes."""
    return json.dumps(obj).encode("utf-8")


def main():
    """Compare JSON and msgpack payload size and speed, raw and gzipped.

    Generates a synthetic dataset, serialises it with both formats, gzips
    both results, and prints the size in bytes and the elapsed time of each
    step.
    """
    num_rows = 10000
    print(f"Generating {num_rows} feature rows")
    dataset = construct_dataset(*generate_features(num_rows))
    print(f"with {len(dataset['updates'][0])} features per row")

    # Encode JSON to bytes as part of serialisation so that the reported
    # size really is in bytes and both formats are compared like for like
    # (msgpack already produces bytes).
    dataset_json, elapsed = _timed(_json_bytes, dataset)
    print(f"Json size : {len(dataset_json):,} bytes in {elapsed:.5g}s")

    dataset_msgpack, elapsed = _timed(msgpack.dumps, dataset)
    print(f"Msgpack size: {len(dataset_msgpack):,} bytes in {elapsed:.5g}s")

    # Both inputs are bytes here, so these timings cover compression only.
    json_deflated, elapsed = _timed(gzip.compress, dataset_json)
    print(f"Gzip json size : {len(json_deflated):,} bytes in {elapsed:.5g}s")

    msgpack_deflated, elapsed = _timed(gzip.compress, dataset_msgpack)
    print(
        f"Gzip msgpack size: {len(msgpack_deflated):,} bytes in {elapsed:.5g}s"
    )
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment