Comparing the length of a JSON-encoded DataFrame (see the `DataFrame.to_json` doc)
Conclusions with my sample dataset
CSV
- can be significantly more compact if your data has lots of
nan
s / strings, since their encoding is empty / not-quoted - but index is lost
- and careful with incompatible quoting if you embed csv in json
split
is the most compact json
encoding that keeps column and index names, use it over columns
(default) / records
- but index name is lost
- and MultiIndex does not round-trip
table
is the json mode with the best round-trip support, but is 3x bigger
- an additional 2x (csv) / 3x (split) can be gained with bz2 + base64 at the cost of CPU/complexity
- and an additional -7% size with base85 at a 13x cpu cost
# Dump each supported orient and eyeball the payload: size, head, tail.
for mode in 'records split index columns values table'.split():
    payload = o.to_json(orient=mode)
    print(mode, len(payload), payload[:500], payload[-100:], sep='\n')
# Size benchmark: serialize DataFrame `o` many ways, print the byte length of
# each encoding (raw, compressed, and base64/base85-wrapped-for-JSON variants).
import base64
import bz2
import json  # was missing — json.dumps is used throughout below
import pickle
import zlib

import pyarrow
import pyarrow.parquet

df = o

# --- raw to_json size per orient ---
for orient in 'records split index columns values table'.split():
    print(orient, len(df.to_json(orient=orient)))

# --- csv, plain / gzip / bz2, base64-wrapped so it can be embedded in JSON ---
dfcsv = df.to_csv().encode()
print('~csv', len(dfcsv))
print('csv.b64', len(json.dumps(base64.b64encode(dfcsv).decode())))
dfcsv_gz_64 = base64.b64encode(zlib.compress(dfcsv)).decode()
print('csv.gz.b64', len(json.dumps(dfcsv_gz_64)))
dfcsv_bz_64 = base64.b64encode(bz2.compress(dfcsv)).decode()
print('csv.bz.b64', len(json.dumps(dfcsv_bz_64)))

# --- pickle, base64 (and base85, which is ~13x slower to encode) ---
pik = pickle.dumps(df)
print('~pickle', len(pik))
print('pickle.b64', len(json.dumps(base64.b64encode(pik).decode())))
# renamed from *_gz_*: the compression applied here is bz2, not gzip/zlib
pickle_bz_64 = base64.b64encode(bz2.compress(pik)).decode()
pickle_bz_85 = base64.b85encode(bz2.compress(pik)).decode()
print('pickle.bz.b64 b85', len(json.dumps(pickle_bz_64)), len(json.dumps(pickle_bz_85)))

# --- parquet via pyarrow, base64-wrapped ---
table = pyarrow.Table.from_pandas(df)
buf = pyarrow.BufferOutputStream()
pyarrow.parquet.write_table(table, buf)
park = buf.getvalue().to_pybytes()
print('~parquet', len(park))
print('parquet.b64', len(json.dumps(base64.b64encode(park).decode())))
park_gz_64 = base64.b64encode(zlib.compress(park)).decode()
print('parquet.gz.b64', len(json.dumps(park_gz_64)))
park_bz_64 = base64.b64encode(bz2.compress(park)).decode()
print('parquet.bz.b64', len(json.dumps(park_bz_64)))

# --- orient='split' JSON, compressed then base64-wrapped ---
split = df.to_json(orient='split').encode()
print('split', len(split))
print('~split.gz', len(zlib.compress(split)))
splitgz64 = base64.b64encode(zlib.compress(split)).decode()
print('split.gz.b64', len(json.dumps(splitgz64)))
splitbz64 = base64.b64encode(bz2.compress(split)).decode()
print('split.bz.b64', len(json.dumps(splitbz64)))
Results:
records 68728
split 37894
index 79318
columns 78468
values 27227
table 87551
csv 21561
csv.gz.b64 12602
csv.bz.b64 11166
~pickle 46633
pickle.b64 62182
pickle.bz.b64 b85 16138 15129
~parquet 22832
parquet.b64 30446
parquet.gz.b64 21214
parquet.bz.b64 22994
split 37894
~split.gz 10353
split.gz.b64 13806
split.bz.b64 11550
✕ records -- 68728 -- no index!
[
{"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447},
{"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null},
{"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null},
...
{"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
]
✓ split -- 37894 -- smallest complete since no repetition of columns etc.
{
"columns":["column1","column2","column3","column4","column5"],
"index":[null,"index1","index2","index3",...
"data":
[
[587.295,null,192.5252,null,null],
[null,null,61.2949,129.5094,null],
...
[152.7775,null,null,null,null]
]
}
✓ index -- 79318
{
"nan": {"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447},
"index1":{"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null},
"index2":{"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null},
...
"index999":{"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
}
✓ columns -- 78468 -- verbose if index is verbose
{"column1":
{"nan":null,"index1":null,"ALAKC08AA":5.466,"ALAKC08EE":0.9689,...}
...
{"column5":
...
{..., "index999":null}
}
✕ values -- 27227 -- no index, no column names !
[
[null,null,null,null,145502.6447],
[null,null,null,6.5745,null],
...
[152.7775,null,null,null,null]
]
✓ table -- 87551 --
{"schema":
{
"fields":
[{"name":"office","type":"string"},{"name":"column1","type":"number"},{"name":"column2","type":"number"},{"name":"column3","type":"number"},{"name":"column4","type":"number"},{"name":"column5","type":"number"}],
"primaryKey":["office"],
"pandas_version":"0.20.0"
},
"data":[
{"office":null,"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447}
{"office":"index2" , ...}
...
{"office":"index999","column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
]
}