Skip to content

Instantly share code, notes, and snippets.

@lungben
Created November 20, 2020 10:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lungben/7d967eb5058bbe5708bb08fb1aeb2815 to your computer and use it in GitHub Desktop.
Save lungben/7d967eb5058bbe5708bb08fb1aeb2815 to your computer and use it in GitHub Desktop.
### A Pluto.jl notebook ###
# v0.12.11
using Markdown
using InteractiveUtils
# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
quote
local el = $(esc(element))
global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : missing
el
end
end
# ╔═╡ f1da0f10-2b06-11eb-03dd-917c7f049b40
begin
using Pkg
Pkg.activate(mktempdir())
end
# ╔═╡ 032fc5c0-2b07-11eb-00e8-3f72a5bd1ad0
begin
Pkg.add(string.([:DataFrames, :TableIO, :XLSX, :Arrow, :JDF, :SQLite, :CSV, :JSONTables, :Parquet, :ZipFile, :Plots, :PlotlyJS, :PlutoUI]))
using DataFrames, TableIO
using XLSX, Arrow, JDF, SQLite, CSV, JSONTables, Parquet, ZipFile
using Plots, Dates, PlutoUI
plotlyjs()
end
# ╔═╡ 0d1fa0a0-2b1b-11eb-2256-b1b3cadc3db8
md"# Read-Write Performance of Various Formats for Tabular Data"
# ╔═╡ 415ed9c2-2b08-11eb-0f78-01fd83d120fc
@bind x Slider(1:6, show_value=true)
# ╔═╡ 03d708a0-2b0f-11eb-1dd2-c36ee9536ea8
n = round(Int, exp10(x))
# ╔═╡ 04aaf970-2b1a-11eb-245f-bd860f402540
md"""Dates are imported / exported in different formats depending on the capabilities of the package. Conversions between Date and String formats are not timed.
* native date format for CSV, Zipped CSV, XLSX, Arrow and JDF
* strings for Json, SQLite, Parquet
"""
# ╔═╡ 78b00e00-2b1a-11eb-0bb7-ff9eb9a41fe1
md"Test Data (shown for n=10):"
# ╔═╡ bb8eabe0-2b11-11eb-3f8d-f3d912fb027d
md"# Appendix"
# ╔═╡ 78257be0-2b07-11eb-254a-557bb7b304d0
extensions = ["csv", "zip", "json", "xlsx", "arrow", "jdf", "sqlite", "parquet"]
# ╔═╡ 2a5cce30-2b08-11eb-0b26-ef9a890a2fdd
make_df(n) = DataFrame(a=1:n, b=rand(n), c="hello".* string.(1:n), d=Bool.((1:n) .% 2), e=Date("2020-08-15") .+ Day.(1:n), f="world!" .* string.(1:n))
# ╔═╡ eda62430-2b18-11eb-0260-af53c53b6504
make_df(10)
# ╔═╡ 8dc94410-2b14-11eb-2549-47e6bd954963
function foldersize(dir=".")
size = 0
for (root, dirs, files) in walkdir(dir)
size += sum(map(filesize, joinpath.(root, files)))
end
return size
end
# ╔═╡ 4e3b6460-2b08-11eb-3843-5121a1d64405
begin
function time_all(n, extensions)
df = make_df(n)
df_date_as_string = df[!, Not(:e)]
df_date_as_string[!, :e] = string.(df.e)
timing_read = Dict{String, Float64}()
timing_write = Dict{String, Float64}()
filesizes = Dict{String, Float64}()
tmpdir = mktempdir()
for ext ∈ extensions
fname = joinpath(tmpdir, "test."*ext)
if ext == "sqlite"
timing_write[ext] = @elapsed write_table!(fname, "test_table", df_date_as_string)
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname, "test_table"); copycols=false)
@assert df_date_as_string == df_in "inconsistent dfs for $ext"
elseif ext == "parquet"
mapping = Dict(["c"] => (String, Parquet.logical_string),
["e"] => (String, Parquet.logical_string),
["f"] => (String, Parquet.logical_string))
# String field types must be mapped to appropriate data types
timing_write[ext] = @elapsed write_table!(fname, df_date_as_string)
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname; map_logical_types=mapping); copycols=false)
@assert df_date_as_string == df_in "inconsistent dfs for $ext"
elseif ext == "json"
timing_write[ext] = @elapsed write_table!(fname, df_date_as_string)
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname); copycols=false)
@assert df_date_as_string == df_in "inconsistent dfs for $ext"
else
timing_write[ext] = @elapsed write_table!(fname, df)
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname); copycols=false)
@assert df == df_in "inconsistent dfs for $ext"
end
if ext == "jdf"
filesizes[ext] = foldersize(fname) / (1024^2) # in MB
else
filesizes[ext] = filesize(fname) / (1024^2) # in MB
end
end
(;timing_read, timing_write, filesizes)
end
time_all(10, extensions) # warmup
end
# ╔═╡ 59b18bc0-2b09-11eb-0b6b-c7ffc3d6ef98
timings = time_all(n, extensions)
# ╔═╡ 77daa120-2b10-11eb-04f0-8b1f8b00303a
begin
plt_timing = scatter(timings.timing_write; label="writing", title="Timings for $n Lines", yscale=:log10, ylabel="time [s]")
scatter!(plt_timing, timings.timing_read; label="reading")
savefig(plt_timing, "timings_$n.png")
plt_timing
end
# ╔═╡ 382c74f0-2b14-11eb-05f2-6bede9afa601
begin
plt_size = bar(timings.filesizes, label=nothing, title="File sizes for $n Lines", ylabel="file size [MB]")
savefig(plt_size, "filesizes_$n.png")
plt_size
end
# ╔═╡ 1f98dba0-2b0e-11eb-1496-11ab64828a2c
begin
bar(timings.timing_write; label="writing", title="Timings for $n Lines", ylabel="time [s]")
bar!(timings.timing_read; label="reading")
end
# ╔═╡ Cell order:
# ╟─0d1fa0a0-2b1b-11eb-2256-b1b3cadc3db8
# ╟─03d708a0-2b0f-11eb-1dd2-c36ee9536ea8
# ╟─415ed9c2-2b08-11eb-0f78-01fd83d120fc
# ╟─77daa120-2b10-11eb-04f0-8b1f8b00303a
# ╟─382c74f0-2b14-11eb-05f2-6bede9afa601
# ╟─1f98dba0-2b0e-11eb-1496-11ab64828a2c
# ╟─04aaf970-2b1a-11eb-245f-bd860f402540
# ╟─78b00e00-2b1a-11eb-0bb7-ff9eb9a41fe1
# ╠═eda62430-2b18-11eb-0260-af53c53b6504
# ╟─bb8eabe0-2b11-11eb-3f8d-f3d912fb027d
# ╠═78257be0-2b07-11eb-254a-557bb7b304d0
# ╠═59b18bc0-2b09-11eb-0b6b-c7ffc3d6ef98
# ╠═2a5cce30-2b08-11eb-0b26-ef9a890a2fdd
# ╠═4e3b6460-2b08-11eb-3843-5121a1d64405
# ╠═8dc94410-2b14-11eb-2549-47e6bd954963
# ╠═f1da0f10-2b06-11eb-03dd-917c7f049b40
# ╠═032fc5c0-2b07-11eb-00e8-3f72a5bd1ad0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment