Created
September 22, 2016 18:43
-
-
Save ExpandingMan/2ffd9d0240aa063b87806f33298666ca to your computer and use it in GitHub Desktop.
Julia deserialization performance (compared with Python pickle protocol 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# DataFrames supplies the `DataFrame` table type; PyCall supplies the Python bridge
# (`PyObject`, `@pyimport`, ...) used by the pickle/pandas helpers.
using DataFrames
using PyCall
# using DatasToolbox
# Imported by name so that the filename-based methods defined in this file extend
# the existing generic functions instead of shadowing them.
import Base.serialize
import Base.deserialize
# Row count used for each generated column of the benchmark table.
const NROWS = 2 * 10^6
# Path the Julia-serialized table is written to and read back from.
const FILENAME = "devtest1.jbin"
# Path the Python-pickled table is written to and read back from.
const PICKLE_FILENAME = "devtest1.pkl"
"""
    serialize(filename::AbstractString, object)

Write `object` to the file at `filename` using the stream-based `serialize`.

The file is opened in `"w"` mode and is closed even when serialization throws.
Returns `nothing`.
"""
function serialize(filename::AbstractString, object)
    # The do-block form of `open` closes the stream on both normal and error exit.
    open(filename, "w") do io
        serialize(io, object)
    end
    return nothing
end
"""
    pickle(filename::AbstractString, object)

Convert `object` to a `PyObject` and dump it to `filename` with Python's
`pickle.dump` (default protocol of the linked Python).

The Python file object is always closed, so the data is flushed to disk before
this function returns. Returns `nothing`.
"""
function pickle(filename::AbstractString, object::Any)
    @pyimport pickle as pypickle
    pyobject = PyObject(object)
    # Hand the path to Python's built-in `open` as an argument rather than splicing
    # it into evaluated Python source: quotes or backslashes in `filename` can then
    # neither break the call nor inject code.
    f = pycall(pybuiltin("open"), PyObject, filename, "wb")
    try
        pypickle.dump(pyobject, f)
    finally
        # Without an explicit close the buffered bytes are only written whenever
        # Python happens to finalize the file object.
        pycall(f[:close], PyObject)
    end
    return nothing
end
"""
    deserialize(filename::AbstractString)

Read and return one object from the file at `filename` using the stream-based
`deserialize`.

The file is closed even when deserialization throws.
"""
function deserialize(filename::AbstractString)
    # The do-block form of `open` closes the stream on both normal and error exit
    # and returns the value of the block.
    return open(filename) do io
        deserialize(io)
    end
end
"""
    unpickle(filename::AbstractString)

Load and return the object stored in `filename` with Python's `pickle.load`.

The Python file object is closed before returning, including when loading fails.
"""
function unpickle(filename::AbstractString)
    @pyimport pickle as pypickle
    # Hand the path to Python's built-in `open` as an argument rather than splicing
    # it into evaluated Python source, so special characters in `filename` are safe.
    f = pycall(pybuiltin("open"), PyObject, filename, "rb")
    try
        # NOTE(security): `pickle.load` can execute arbitrary code; only use this on
        # files produced by a trusted source (here: the `pickle` helper of this script).
        return pypickle.load(f)
    finally
        pycall(f[:close], PyObject)
    end
end
"""
    pandas(df::DataFrame)::PyObject

Build a pandas `DataFrame` holding the same columns as `df`.

An empty pandas frame is created and each column of `df` is assigned to it under
the column's name converted to a string, in the order given by `names(df)`.
"""
function pandas(df::DataFrame)::PyObject
    @pyimport pandas as pd
    pydf = pd.DataFrame()
    for col in names(df)
        # `set!` performs the Python item assignment `pydf[str(col)] = column`.
        set!(pydf, string(col), df[col])
    end
    return pydf
end
"""
    make_test_df(dtypes::DataType...; nrows::Integer=NROWS)

Build a `DataFrame` with one generated column per entry of `dtypes`.

Each column is named `<type><position>` (e.g. `:Float641` for `Float64` in first
position) and has `nrows` rows, filled according to the requested type:

- `<: Real`: random values from `rand(dtype, nrows)`
- `<: AbstractString`: the constant `"this is a string"`
- `<: Dates.TimeType`: the current time converted to `dtype`, plus `1, 2, ..., nrows` days
- `<: Symbol`: the constant `Symbol("this is a symbol")`

Types matching none of these branches are skipped and produce no column.
"""
function make_test_df(dtypes::DataType...; nrows::Integer=NROWS)
    df = DataFrame()
    # Sample the clock once so every date/time row is offset from the same instant
    # and the clock is not queried once per row.
    t0 = now()
    for (idx, dtype) in enumerate(dtypes)
        col = Symbol(string(dtype, idx))
        if dtype <: Real
            df[col] = rand(dtype, nrows)
        elseif dtype <: AbstractString
            df[col] = ["this is a string" for i in 1:nrows]
        elseif dtype <: Dates.TimeType
            first_instant = dtype(t0)
            df[col] = [first_instant + Dates.Day(i) for i in 1:nrows]
        elseif dtype <: Symbol
            df[col] = [Symbol("this is a symbol") for i in 1:nrows]
        end
    end
    return df
end
# Column layouts that can be benchmarked; keep exactly one assignment to `df` enabled.
# df = make_test_df(Float64, Int64, String, String, Date, DateTime)
# df = make_test_df(String, String, DateTime, Float64, String, String, String)
# df = make_test_df(Symbol, Symbol, DateTime, Float64, Symbol, Symbol, Symbol)
# df = make_test_df(Float64, Float64, Float64, Float64)
# df = make_test_df(Int64, Int64, Int64, Int64)
df = make_test_df(String, String, String, String)
# df = make_test_df(Symbol, Symbol, Symbol, Symbol)
# df = make_test_df(DateTime, DateTime, DateTime, DateTime)

# NOTE: each `@time` below measures the first call of its function, so the reported
# time includes compilation; run the script's timed calls a second time for
# steady-state numbers.
info("Serializing...")
@time serialize(FILENAME, df)

# The conversion to pandas happens outside the timed region so that only the
# pickling itself is measured.
pydf = pandas(df)
info("Pickling...")
@time pickle(PICKLE_FILENAME, pydf)

info("Deserializing...")
@time df_test = deserialize(FILENAME)

info("Unpickling...")
@time pydf_test = unpickle(PICKLE_FILENAME)

# info("Converting back to julia")
# @time df_unpickle = convertPyDF(pydf_test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment