Created
August 30, 2016 22:52
-
-
Save mmontagna/1f8b3508555ab306c916fa4ab2e7265a to your computer and use it in GitHub Desktop.
benchmark_storage_formats.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bson | |
import json | |
import random | |
import time | |
import datetime | |
import pandas as pd | |
import numpy | |
import os, sys | |
import csv | |
def generate_data(number):
    """Build a fake sensor payload: a dict with 'fields' (column names)
    and 'readings' (`number` rows of mixed float/int samples plus a
    wall-clock timestamp per row)."""
    rand = random.random

    def _row():
        # Same column order as 'fields': floats and ints around 405-605,
        # with the timestamp in the 'time_stamp' position.
        return [
            405 + 200 * rand(),
            405 + 200 * rand(),
            int(405 + 200 * rand()),
            405 + 200 * rand(),
            int(405 + 200 * rand()),
            int(405 + 200 * rand()),
            405 + 200 * rand(),
            int(405 + 200 * rand()),
            time.time(),
            405 + 200 * rand(),
            int(405 + 200 * rand()),
            405 + 200 * rand(),
        ]

    return {
        'fields': [
            'converted_ppm_fast_max', 'converted_ppm_fast_avg', 'raw_min converted_ppm_fast_min',
            'raw_fast_avg', 'raw_avg', 'raw_fast_min', 'converted_ppm_max', 'raw_max', 'time_stamp',
            'converted_ppm_min', 'raw_fast_max', 'converted_ppm_avg'
        ],
        'readings': [_row() for _ in range(number)],
    }
# Dataset sizes to benchmark; the intermediate sizes are commented out to keep runs short.
number_of_datapoints = [10, 1000000] #1000, 10000, 100000,
def save_pd(d):
    """Pickle the readings as a pandas DataFrame; return the file path."""
    path = '/tmp/pandas_pickle'
    pd.DataFrame.from_records(d['readings'], columns=d['fields']).to_pickle(path)
    return path
def load_pd(file):
    """Load and return a DataFrame previously written by save_pd."""
    return pd.read_pickle(file)
def save_np_array_bin(d):
    """Write the readings as a compressed NumPy .npz archive; return its path."""
    path = '/tmp/numpy_array.npy.npz'
    numpy.savez_compressed(path, numpy.array(d['readings']))
    return path
def load_np_array_bin(file):
    """Open a .npz archive.

    Note: numpy.load on an .npz returns a lazy NpzFile object, not an
    ndarray — arrays are only decompressed when accessed by key.
    """
    return numpy.load(file)
def save_np_array_text(d):
    """Write the readings as gzipped text; return the file path.

    numpy.savetxt gzips automatically because the filename ends in .gz.
    """
    path = '/tmp/numpy_array.csv.gz'
    numpy.savetxt(path, numpy.array(d['readings']))
    return path
def load_np_array_text(file):
    """Parse a text (optionally gzipped) array file back into an ndarray."""
    return numpy.loadtxt(file)
def json_dump(d):
    """Serialize `d` as JSON to /tmp/json_data.json; return the path.

    Bug fix: the original opened the file in binary mode ('wb'), but
    json.dump emits str — on Python 3 that raises
    "TypeError: a bytes-like object is required". Text mode is correct.
    """
    path = '/tmp/json_data.json'
    with open(path, 'w') as f:
        json.dump(d, f)
    return path
def json_load(file):
    """Parse and return the JSON document stored at `file`."""
    with open(file, 'r') as handle:
        return json.load(handle)
def bson_dump(d):
    """Encode `d` as a single BSON document on disk; return the file path."""
    path = '/tmp/bson_data.bson'
    with open(path, 'wb') as handle:
        handle.write(bson.BSON.encode(d))
    return path
def bson_load(file):
    """Read a BSON file and return its first (and only) document."""
    with open(file, 'rb') as handle:
        raw = handle.read()
    return bson.decode_all(raw)[0]
def save_csv_naive(d):
    """Write the header row plus every reading via csv.writer; return the path.

    Fix: the csv module requires newline='' on the file handle so the
    writer controls line endings itself (otherwise rows are doubled on
    Windows). Also uses writerows for the bulk write.
    """
    path = '/tmp/naive_csv.csv'
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(d['fields'])
        writer.writerows(d['readings'])
    return path
def load_naive_csv(file):
    """Read every CSV row (header included) into a list of string lists.

    Bug fix: the original accumulated `readings` but never returned it,
    so the benchmark's "load" step always produced None. Also opens with
    newline='' as the csv module requires.
    """
    with open(file, 'r', newline='') as f:
        return list(csv.reader(f))
# (display name, save function, load function) triples benchmarked below.
methods = [
    ('naive csv (uncompressed)', save_csv_naive, load_naive_csv),
    ('json (uncompressed)', json_dump, json_load),
    ('bson', bson_dump, bson_load),
    ('pandas_data_frame_pickle', save_pd, load_pd),
    ('numpy_array_bin', save_np_array_bin, load_np_array_bin),
    ('numpy_array_text (gzip)', save_np_array_text, load_np_array_text),
]

# Results are emitted as CSV on stdout, one row per (method, size) pair.
# NOTE(review): the original also built a flattened per-method header list
# (`method_rows`) that was never used anywhere; removed as dead code.
out = csv.writer(sys.stdout)
out.writerow(['name', 'num data points', ' save time (s)', ' load time (s)', ' file size (bytes)'])
# Drive the benchmark: for each dataset size, time every format's save and
# load round-trip and report the on-disk file size, as CSV rows on stdout.
for num_datapoints in number_of_datapoints:
    data = generate_data(num_datapoints)
    for name, save, load in methods:
        # Time serialization and deserialization separately with wall-clock
        # timestamps; each save function returns the path it wrote to.
        save_start = time.time()
        file = save(data)
        save_stop = time.time()
        load_start = time.time()
        load(file)
        load_stop = time.time()
        # Columns match the header row written above.
        out.writerow([name, num_datapoints, save_stop - save_start, load_stop - load_start, os.path.getsize(file)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.