Skip to content

Instantly share code, notes, and snippets.

@ExpandingMan
Created September 22, 2016 18:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ExpandingMan/2ffd9d0240aa063b87806f33298666ca to your computer and use it in GitHub Desktop.
Save ExpandingMan/2ffd9d0240aa063b87806f33298666ca to your computer and use it in GitHub Desktop.
Julia deserialization performance (compared with Python pickle protocol 3)
using DataFrames
using PyCall
# using DatasToolbox
import Base.serialize
import Base.deserialize
# Number of rows generated for every synthetic column built by `make_test_df`.
const NROWS = 2*10^6
# Path the Julia-serialized DataFrame is written to and read back from.
const FILENAME = "devtest1.jbin"
# Path the pickled pandas DataFrame is written to and read back from.
const PICKLE_FILENAME = "devtest1.pkl"
"""
    serialize(filename::AbstractString, object)

Serialize `object` into the file at `filename`.

The file is opened in write mode (`"w"`) and the open stream is passed to the
stream-based `serialize` method. The `do` block form of `open` closes the
handle when the block exits, including when serialization throws, so a failed
write does not leak an open file.

Returns `nothing`.
"""
function serialize(filename::AbstractString, object)
    open(filename, "w") do io
        serialize(io, object)
    end
    return nothing
end
"""
    pickle(filename::AbstractString, object::Any)

Write `object` to `filename` with Python's `pickle.dump`.

`object` is wrapped in a `PyObject` before being dumped. The destination is
opened in binary write mode (`'wb'`) through Python's builtin `open` and is
closed before this function returns, so the pickled bytes are flushed to disk
even when `pickle.dump` raises.

Returns `nothing`.
"""
function pickle(filename::AbstractString, object::Any)
    @pyimport pickle as pypickle
    pyobject = PyObject(object)
    # Call the builtin `open` with the path as an argument instead of splicing
    # it into evaluated Python source: a quote or backslash in `filename`
    # would otherwise break (or inject code into) the evaluated expression.
    f = pycall(pybuiltin("open"), PyObject, filename, "wb")
    try
        pypickle.dump(pyobject, f)
    finally
        # Without an explicit close the Python file object stays open until it
        # is garbage collected, and buffered data may not yet be on disk when
        # the file is read back.
        f[:close]()
    end
    return nothing
end
"""
    deserialize(filename::AbstractString)

Read one serialized object from the file at `filename` and return it.

The file is opened for reading and the open stream is passed to the
stream-based `deserialize` method. The `do` block form of `open` closes the
handle when the block exits, including when deserialization throws.
"""
function deserialize(filename::AbstractString)
    return open(filename) do io
        deserialize(io)
    end
end
"""
    unpickle(filename::AbstractString)

Load and return the object stored in `filename` using Python's `pickle.load`.

The file is opened in binary read mode (`'rb'`) through Python's builtin
`open` and is closed before this function returns, whether or not loading
succeeds.
"""
function unpickle(filename::AbstractString)
    @pyimport pickle as pypickle
    # Call the builtin `open` with the path as an argument instead of splicing
    # it into evaluated Python source: a quote or backslash in `filename`
    # would otherwise break (or inject code into) the evaluated expression.
    f = pycall(pybuiltin("open"), PyObject, filename, "rb")
    try
        # SECURITY: unpickling can execute arbitrary code embedded in the
        # file; only call this on files produced by a trusted source.
        return pypickle.load(f)
    finally
        f[:close]()
    end
end
"""
    pandas(df::DataFrame)::PyObject

Copy `df` into a newly created pandas `DataFrame`.

Starts from an empty pandas frame and assigns each column of `df` to it in
the order given by `names(df)`, using the stringified column name as the key.
"""
function pandas(df::DataFrame)::PyObject
    @pyimport pandas as pd
    frame = pd.DataFrame()
    for colname in names(df)
        key = string(colname)
        values = df[colname]
        set!(frame, key, values)
    end
    return frame
end
"""
    make_test_df(dtypes::DataType...; nrows::Integer=NROWS)

Build a `DataFrame` with one synthetic column per entry of `dtypes`.

Each column is named after its type followed by its 1-based position in
`dtypes`, and is filled according to the type:

- subtypes of `Real`: `rand(dtype, nrows)`
- subtypes of `AbstractString`: the constant `"this is a string"`
- subtypes of `Dates.TimeType`: the current time converted to `dtype`, plus
  `i` days for row `i`
- `Symbol`: the constant `Symbol("this is a symbol")`

# Keywords
- `nrows`: number of rows per column; defaults to the global `NROWS`.

# Throws
- `ArgumentError` if `nrows` is negative or if a type in `dtypes` matches none
  of the categories above (previously such a column was silently omitted).
"""
function make_test_df(dtypes::DataType...; nrows::Integer=NROWS)
    nrows >= 0 || throw(ArgumentError("nrows must be non-negative, got $nrows"))
    df = DataFrame()
    for (idx, dtype) in enumerate(dtypes)
        col = Symbol(string(dtype) * string(idx))
        if dtype <: Real
            df[col] = rand(dtype, nrows)
        elseif dtype <: AbstractString
            df[col] = ["this is a string" for i in 1:nrows]
        elseif dtype <: Dates.TimeType
            # Read the clock once per column so every row is offset from the
            # same instant instead of calling `now()` for each row.
            origin = dtype(now())
            df[col] = [origin + Dates.Day(i) for i in 1:nrows]
        elseif dtype <: Symbol
            df[col] = [Symbol("this is a symbol") for i in 1:nrows]
        else
            throw(ArgumentError("unsupported column type: $dtype"))
        end
    end
    return df
end
# Benchmark driver: build a synthetic DataFrame, then time writing and reading
# it with Julia serialization and with Python pickle.
#
# Exactly one `make_test_df` call below is active; the commented-out lines are
# alternative column-type mixes to benchmark instead.
# df = make_test_df(Float64, Int64, String, String, Date, DateTime)
# df = make_test_df(String, String, DateTime, Float64, String, String, String)
# df = make_test_df(Symbol, Symbol, DateTime, Float64, Symbol, Symbol, Symbol)
# df = make_test_df(Float64, Float64, Float64, Float64)
# df = make_test_df(Int64, Int64, Int64, Int64)
df = make_test_df(String, String, String, String)
# df = make_test_df(Symbol, Symbol, Symbol, Symbol)
# df = make_test_df(DateTime, DateTime, DateTime, DateTime)
# NOTE(review): each `@time` below measures a single call, so the first-call
# compilation cost of the timed function is included in the reported time.
info("Serializing...")
@time serialize(FILENAME, df)
# The conversion to a pandas DataFrame happens outside the timed regions.
pydf = pandas(df)
info("Pickling...")
@time pickle(PICKLE_FILENAME, pydf)
info("Deserializing...")
@time df_test = deserialize(FILENAME)
info("Unpickling...")
@time pydf_test = unpickle(PICKLE_FILENAME)
# Disabled step: `convertPyDF` is not defined in this file (presumably it comes
# from the commented-out `DatasToolbox` import; confirm before re-enabling).
# info("Converting back to julia")
# @time df_unpickle = convertPyDF(pydf_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment