Created by @mmontagna, August 30, 2016 22:52
benchmark_storage_formats.py
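"""Benchmark the save/load speed and on-disk size of several storage formats.

Round-trips synthetic sensor readings through naive CSV, JSON, BSON, a pickled
pandas DataFrame, a compressed NumPy binary archive, and gzipped NumPy text,
then writes per-format timings and file sizes as CSV to stdout.
"""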
import csv
import json
import os
import random
import sys
import time

import bson
import numpy
import pandas as pd
def generate_data(number):
    """Return a dict with a 'fields' header and `number` rows of synthetic readings."""
    o = {
        'fields': [
            'converted_ppm_fast_max', 'converted_ppm_fast_avg',
            'raw_min converted_ppm_fast_min', 'raw_fast_avg', 'raw_avg',
            'raw_fast_min', 'converted_ppm_max', 'raw_max', 'time_stamp',
            'converted_ppm_min', 'raw_fast_max', 'converted_ppm_avg',
        ]
    }
    o['readings'] = []
    for _ in range(number):
        o['readings'].append([
            405 + 200 * random.random(),
            405 + 200 * random.random(),
            int(405 + 200 * random.random()),
            405 + 200 * random.random(),
            int(405 + 200 * random.random()),
            int(405 + 200 * random.random()),
            405 + 200 * random.random(),
            int(405 + 200 * random.random()),
            time.time(),
            405 + 200 * random.random(),
            int(405 + 200 * random.random()),
            405 + 200 * random.random(),
        ])
    return o
number_of_datapoints = [10, 1000000]  # also try 1000, 10000, 100000
def save_pd(d):
    df = pd.DataFrame.from_records(d['readings'], columns=d['fields'])
    df.to_pickle('/tmp/pandas_pickle')
    return '/tmp/pandas_pickle'

def load_pd(path):
    return pd.read_pickle(path)
def save_np_array_bin(d):
    arr = numpy.array(d['readings'])
    numpy.savez_compressed('/tmp/numpy_array.npy.npz', arr)
    return '/tmp/numpy_array.npy.npz'

def load_np_array_bin(path):
    # Index into the archive so the array is actually decompressed,
    # not just lazily opened, and the load timing is meaningful.
    return numpy.load(path)['arr_0']
def save_np_array_text(d):
    # savetxt gzips automatically when the filename ends in .gz
    arr = numpy.array(d['readings'])
    numpy.savetxt('/tmp/numpy_array.csv.gz', arr)
    return '/tmp/numpy_array.csv.gz'

def load_np_array_text(path):
    return numpy.loadtxt(path)
def json_dump(d):
    # text mode: json.dump writes str, not bytes, on Python 3
    with open('/tmp/json_data.json', 'w') as f:
        json.dump(d, f)
    return '/tmp/json_data.json'

def json_load(path):
    with open(path, 'r') as f:
        return json.load(f)
def bson_dump(d):
    with open('/tmp/bson_data.bson', 'wb') as f:
        f.write(bson.encode(d))  # bson.BSON.encode(d) on PyMongo < 3.9
    return '/tmp/bson_data.bson'

def bson_load(path):
    with open(path, 'rb') as f:
        return bson.decode_all(f.read())[0]
def save_csv_naive(d):
    with open('/tmp/naive_csv.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(d['fields'])
        for reading in d['readings']:
            writer.writerow(reading)
    return '/tmp/naive_csv.csv'

def load_naive_csv(path):
    readings = []
    with open(path, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            readings.append(row)
    return readings
methods = []
methods.append(('naive csv (uncompressed)', save_csv_naive, load_naive_csv))
methods.append(('json (uncompressed)', json_dump, json_load))
methods.append(('bson', bson_dump, bson_load))
methods.append(('pandas_data_frame_pickle', save_pd, load_pd))
methods.append(('numpy_array_bin', save_np_array_bin, load_np_array_bin))
methods.append(('numpy_array_text (gzip)', save_np_array_text, load_np_array_text))
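# Hypothetical extra entry, not part of the benchmark above: the same
# (name, save_fn, load_fn) protocol extends to any format, e.g. the
# standard-library pickle module. Uncomment the append to include it.
import pickle

def save_pickle(d):
    with open('/tmp/pickle_data.pkl', 'wb') as f:
        pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
    return '/tmp/pickle_data.pkl'

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

# methods.append(('pickle (uncompressed)', save_pickle, load_pickle))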
out = csv.writer(sys.stdout)
out.writerow(['name', 'num data points', 'save time (s)', 'load time (s)', 'file size (bytes)'])
for num_datapoints in number_of_datapoints:
    data = generate_data(num_datapoints)
    for name, save, load in methods:
        save_start = time.time()
        path = save(data)
        save_stop = time.time()
        load_start = time.time()
        load(path)
        load_stop = time.time()
        out.writerow([name, num_datapoints, save_stop - save_start,
                      load_stop - load_start, os.path.getsize(path)])
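# Usage sketch (assuming the script is saved as benchmark_storage_formats.py):
#   python benchmark_storage_formats.py > results.csv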