Comparing the length of JSON-encoded DataFrames (see the pandas to_json doc).

Conclusions with my sample dataset

  • CSV
    • can be significantly more compact if your data has lots of NaNs / strings, since NaNs encode as empty fields and strings are left unquoted
    • but the index is lost
    • and be careful with incompatible quoting if you embed CSV in JSON
  • split is the most compact JSON encoding that keeps column and index names; prefer it over columns (the default) / records
  • table is the JSON mode with the best round-trip support, but is 3x bigger
  • an additional 2x (CSV) / 3x (split) size reduction can be gained with bz2 + base64, at the cost of CPU time and complexity (a decode sketch follows the script below)
  • and base85 instead of base64 saves a further ~7% in size, at roughly 13x the CPU cost
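The snippets below assume a DataFrame o that is not included in this gist. A hypothetical stand-in with a similar shape (mostly-NaN float columns, a string index named 'office' as in the table sample at the bottom), so the code can be run end to end:

import numpy as np
import pandas as pd

# Hypothetical stand-in for the original DataFrame `o`:
# ~1000 rows, 5 sparse float columns, string index.
rng = np.random.default_rng(0)
values = rng.uniform(0, 200, size=(1000, 5))
values[rng.random((1000, 5)) < 0.8] = np.nan  # ~80% NaNs, as in the samples below
o = pd.DataFrame(
    values,
    columns=[f"column{i}" for i in range(1, 6)],
    index=[f"index{i}" for i in range(1000)],
)
o.index.name = "office"  # matches the primaryKey in the 'table' sample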
for orient in 'records split index columns values table'.split():
    j = o.to_json(orient=orient)
    print(orient, len(j), j[:500], j[-100:], sep='\n')

import pickle
print('pickle', len(pickle.dumps(o)))
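As a quick check of what each orient preserves, every encoding can be read back with pd.read_json and the matching orient; 'split' and 'table' restore the index, while 'records' and 'values' fall back to a default RangeIndex. A sketch:

import io
import pandas as pd

# Round-trip each orient and inspect what survives.
for orient in 'records split index columns values table'.split():
    j = o.to_json(orient=orient)
    back = pd.read_json(io.StringIO(j), orient=orient)
    print(orient, back.shape, back.index.name)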


import base64
import bz2
import json
import pickle
import zlib

import pyarrow
import pyarrow.parquet

df = o

for orient in 'records split index columns values table'.split():
    print(orient, len(df.to_json(orient=orient)))

# CSV: plain, then gzip/bz2-compressed and base64-encoded
dfcsv = df.to_csv().encode()
print('~csv', len(dfcsv))
print('csv.b64', len(json.dumps(base64.b64encode(dfcsv).decode())))
dfcsv_gz_64 = base64.b64encode(zlib.compress(dfcsv)).decode()
print('csv.gz.b64', len(json.dumps(dfcsv_gz_64)))
dfcsv_bz_64 = base64.b64encode(bz2.compress(dfcsv)).decode()
print('csv.bz.b64', len(json.dumps(dfcsv_bz_64)))

# pickle: base64 (and base85, which is ~13x slower to encode)
pik = pickle.dumps(df)
print('~pickle', len(pik))
print('pickle.b64', len(json.dumps(base64.b64encode(pik).decode())))
pickle_bz_64 = base64.b64encode(bz2.compress(pik)).decode()
pickle_bz_85 = base64.b85encode(bz2.compress(pik)).decode()
print('pickle.bz.b64 b85', len(json.dumps(pickle_bz_64)), len(json.dumps(pickle_bz_85)))

# Parquet: plain, then compressed and base64-encoded
table = pyarrow.Table.from_pandas(df)
buf = pyarrow.BufferOutputStream()
pyarrow.parquet.write_table(table, buf)
park = buf.getvalue().to_pybytes()

print('~parquet', len(park))
print('parquet.b64', len(json.dumps(base64.b64encode(park).decode())))
park_gz_64 = base64.b64encode(zlib.compress(park)).decode()
print('parquet.gz.b64', len(json.dumps(park_gz_64)))
park_bz_64 = base64.b64encode(bz2.compress(park)).decode()
print('parquet.bz.b64', len(json.dumps(park_bz_64)))

# split base64
split = df.to_json(orient='split').encode()
print('split', len(split))
print('~split.gz', len(zlib.compress(split)))
splitgz64 = base64.b64encode(zlib.compress(split)).decode()
print('split.gz.b64', len(json.dumps(splitgz64)))
splitbz64 = base64.b64encode(bz2.compress(split)).decode()
print('split.bz.b64', len(json.dumps(splitbz64)))
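Decoding just reverses the same steps. A minimal sketch for the split + bz2 + base64 pipeline above (pandas import assumed):

import io
import pandas as pd

# Reverse the split.bz.b64 pipeline: base64 -> bz2 -> JSON -> DataFrame.
back = pd.read_json(
    io.StringIO(bz2.decompress(base64.b64decode(splitbz64)).decode()),
    orient='split',
)
assert back.shape == df.shape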

Results (lengths in bytes; a ~ prefix marks raw byte sizes before base64 embedding):

records 68728
split 37894
index 79318
columns 78468
values 27227
table 87551
csv 21561
csv.gz.b64 12602
csv.bz.b64 11166
~pickle 46633
pickle.b64 62182
pickle.bz.b64 b85 16138 15129
~parquet 22832
parquet.b64 30446
parquet.gz.b64 21214
parquet.bz.b64 22994
split 37894
~split.gz 10353
split.gz.b64 13806
split.bz.b64 11550

✕ records -- 68728 -- no index!

[
    {"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447},
    {"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null},
    {"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null},
    ...
    {"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
]

✓ split -- 37894 -- smallest complete encoding, since column names and the index are not repeated per row

{
    "columns":["column1","column2","column3","column4","column5"],
    "index":[null,"index1","index2","index3",...
    "data":
    [
        [587.295,null,192.5252,null,null],
        [null,null,61.2949,129.5094,null],
        ...
        [152.7775,null,null,null,null]
    ]
}

✓ index -- 79318

{
    "nan":      {"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447},
    "index1":{"column1":null,"column2":null,"column3":null,"column4":9.3134,"column5":null},
    "index2":{"column1":null,"column2":null,"column3":16.7342,"column4":23.3783,"column5":null},
    ...
    "index999":{"column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
 }

✓ columns -- 78468 -- verbose if index is verbose

{"column1":
    {"nan":null,"index1":null,"ALAKC08AA":5.466,"ALAKC08EE":0.9689,...}
...
{"column5":
    ...
    {..., "index999":null}
}

✕ values -- 27227 -- no index, no column names!

[
    [null,null,null,null,145502.6447],
    [null,null,null,6.5745,null],
    ...
    [152.7775,null,null,null,null]
]

✓ table -- 87551 -- embeds the schema, best round-trip support

{"schema":
    {
        "fields":
            [{"name":"office","type":"string"},{"name":"column1","type":"number"},{"name":"column2","type":"number"},{"name":"column3","type":"number"},{"name":"column4","type":"number"},{"name":"column5","type":"number"}],
        "primaryKey":["office"],
        "pandas_version":"0.20.0"
    },
    "data":[
        {"office":null,"column1":null,"column2":null,"column3":null,"column4":null,"column5":145502.6447}
        {"office":"index2" , ...}
        ...
        {"office":"index999","column1":152.7775,"column2":null,"column3":null,"column4":null,"column5":null}
    ]
}
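Since 'table' embeds a JSON Table Schema (the "schema" block above), pd.read_json can use it to restore the named index and column dtypes, which is why it round-trips best. A quick sketch:

import io
import pandas as pd

t = df.to_json(orient='table')
back = pd.read_json(io.StringIO(t), orient='table')
print(back.index.name)  # 'office', recovered from "primaryKey"
print(back.dtypes)      # column dtypes recovered from the embedded schema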