Created
November 20, 2020 10:41
-
-
Save lungben/7d967eb5058bbe5708bb08fb1aeb2815 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### A Pluto.jl notebook ### | |
# v0.12.11 | |
using Markdown | |
using InteractiveUtils | |
# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error). | |
macro bind(def, element) | |
quote | |
local el = $(esc(element)) | |
global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : missing | |
el | |
end | |
end | |
# ╔═╡ f1da0f10-2b06-11eb-03dd-917c7f049b40 | |
begin | |
using Pkg | |
Pkg.activate(mktempdir()) | |
end | |
# ╔═╡ 032fc5c0-2b07-11eb-00e8-3f72a5bd1ad0 | |
begin | |
Pkg.add(string.([:DataFrames, :TableIO, :XLSX, :Arrow, :JDF, :SQLite, :CSV, :JSONTables, :Parquet, :ZipFile, :Plots, :PlotlyJS, :PlutoUI])) | |
using DataFrames, TableIO | |
using XLSX, Arrow, JDF, SQLite, CSV, JSONTables, Parquet, ZipFile | |
using Plots, Dates, PlutoUI | |
plotlyjs() | |
end | |
# ╔═╡ 0d1fa0a0-2b1b-11eb-2256-b1b3cadc3db8 | |
md"# Read-Write Performance of Various Formats for Tabular Data" | |
# ╔═╡ 415ed9c2-2b08-11eb-0f78-01fd83d120fc | |
@bind x Slider(1:6, show_value=true) | |
# ╔═╡ 03d708a0-2b0f-11eb-1dd2-c36ee9536ea8 | |
n = round(Int, exp10(x)) | |
# ╔═╡ 04aaf970-2b1a-11eb-245f-bd860f402540 | |
md"""Dates are imported / exported in different formats depending on the capabilities of the package. Conversions between Date and String formats are not timed. | |
* native date format for CSV, Zipped CSV, XLSX, Arrow and JDF | |
* strings for Json, SQLite, Parquet | |
""" | |
# ╔═╡ 78b00e00-2b1a-11eb-0bb7-ff9eb9a41fe1 | |
md"Test Data (shown for n=10):" | |
# ╔═╡ bb8eabe0-2b11-11eb-3f8d-f3d912fb027d | |
md"# Appendix" | |
# ╔═╡ 78257be0-2b07-11eb-254a-557bb7b304d0 | |
extensions = ["csv", "zip", "json", "xlsx", "arrow", "jdf", "sqlite", "parquet"] | |
# ╔═╡ 2a5cce30-2b08-11eb-0b26-ef9a890a2fdd | |
make_df(n) = DataFrame(a=1:n, b=rand(n), c="hello".* string.(1:n), d=Bool.((1:n) .% 2), e=Date("2020-08-15") .+ Day.(1:n), f="world!" .* string.(1:n)) | |
# ╔═╡ eda62430-2b18-11eb-0260-af53c53b6504 | |
make_df(10) | |
# ╔═╡ 8dc94410-2b14-11eb-2549-47e6bd954963 | |
function foldersize(dir=".") | |
size = 0 | |
for (root, dirs, files) in walkdir(dir) | |
size += sum(map(filesize, joinpath.(root, files))) | |
end | |
return size | |
end | |
# ╔═╡ 4e3b6460-2b08-11eb-3843-5121a1d64405 | |
begin | |
function time_all(n, extensions) | |
df = make_df(n) | |
df_date_as_string = df[!, Not(:e)] | |
df_date_as_string[!, :e] = string.(df.e) | |
timing_read = Dict{String, Float64}() | |
timing_write = Dict{String, Float64}() | |
filesizes = Dict{String, Float64}() | |
tmpdir = mktempdir() | |
for ext ∈ extensions | |
fname = joinpath(tmpdir, "test."*ext) | |
if ext == "sqlite" | |
timing_write[ext] = @elapsed write_table!(fname, "test_table", df_date_as_string) | |
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname, "test_table"); copycols=false) | |
@assert df_date_as_string == df_in "inconsistent dfs for $ext" | |
elseif ext == "parquet" | |
mapping = Dict(["c"] => (String, Parquet.logical_string), | |
["e"] => (String, Parquet.logical_string), | |
["f"] => (String, Parquet.logical_string)) | |
# String field types must be mapped to appropriate data types | |
timing_write[ext] = @elapsed write_table!(fname, df_date_as_string) | |
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname; map_logical_types=mapping); copycols=false) | |
@assert df_date_as_string == df_in "inconsistent dfs for $ext" | |
elseif ext == "json" | |
timing_write[ext] = @elapsed write_table!(fname, df_date_as_string) | |
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname); copycols=false) | |
@assert df_date_as_string == df_in "inconsistent dfs for $ext" | |
else | |
timing_write[ext] = @elapsed write_table!(fname, df) | |
timing_read[ext] = @elapsed df_in = DataFrame(read_table(fname); copycols=false) | |
@assert df == df_in "inconsistent dfs for $ext" | |
end | |
if ext == "jdf" | |
filesizes[ext] = foldersize(fname) / (1024^2) # in MB | |
else | |
filesizes[ext] = filesize(fname) / (1024^2) # in MB | |
end | |
end | |
(;timing_read, timing_write, filesizes) | |
end | |
time_all(10, extensions) # warmup | |
end | |
# ╔═╡ 59b18bc0-2b09-11eb-0b6b-c7ffc3d6ef98 | |
timings = time_all(n, extensions) | |
# ╔═╡ 77daa120-2b10-11eb-04f0-8b1f8b00303a | |
begin | |
plt_timing = scatter(timings.timing_write; label="writing", title="Timings for $n Lines", yscale=:log10, ylabel="time [s]") | |
scatter!(plt_timing, timings.timing_read; label="reading") | |
savefig(plt_timing, "timings_$n.png") | |
plt_timing | |
end | |
# ╔═╡ 382c74f0-2b14-11eb-05f2-6bede9afa601 | |
begin | |
plt_size = bar(timings.filesizes, label=nothing, title="File sizes for $n Lines", ylabel="file size [MB]") | |
savefig(plt_size, "filesizes_$n.png") | |
plt_size | |
end | |
# ╔═╡ 1f98dba0-2b0e-11eb-1496-11ab64828a2c | |
begin | |
bar(timings.timing_write; label="writing", title="Timings for $n Lines", ylabel="time [s]") | |
bar!(timings.timing_read; label="reading") | |
end | |
# ╔═╡ Cell order: | |
# ╟─0d1fa0a0-2b1b-11eb-2256-b1b3cadc3db8 | |
# ╟─03d708a0-2b0f-11eb-1dd2-c36ee9536ea8 | |
# ╟─415ed9c2-2b08-11eb-0f78-01fd83d120fc | |
# ╟─77daa120-2b10-11eb-04f0-8b1f8b00303a | |
# ╟─382c74f0-2b14-11eb-05f2-6bede9afa601 | |
# ╟─1f98dba0-2b0e-11eb-1496-11ab64828a2c | |
# ╟─04aaf970-2b1a-11eb-245f-bd860f402540 | |
# ╟─78b00e00-2b1a-11eb-0bb7-ff9eb9a41fe1 | |
# ╠═eda62430-2b18-11eb-0260-af53c53b6504 | |
# ╟─bb8eabe0-2b11-11eb-3f8d-f3d912fb027d | |
# ╠═78257be0-2b07-11eb-254a-557bb7b304d0 | |
# ╠═59b18bc0-2b09-11eb-0b6b-c7ffc3d6ef98 | |
# ╠═2a5cce30-2b08-11eb-0b26-ef9a890a2fdd | |
# ╠═4e3b6460-2b08-11eb-3843-5121a1d64405 | |
# ╠═8dc94410-2b14-11eb-2549-47e6bd954963 | |
# ╠═f1da0f10-2b06-11eb-03dd-917c7f049b40 | |
# ╠═032fc5c0-2b07-11eb-00e8-3f72a5bd1ad0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment