memo for DataFrames.jl
# split-apply-combine
gx = groupby(x, [:id1, :id2])
parent(gx)
vcat(gx...) # back to df # or
DataFrame(gx, keepkeys=false)
groupcols(gx)
valuecols(gx)
groupindices(gx)
k = keys(gx)[1]
ntk = NamedTuple(k)
tk = Tuple(k)
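# a GroupedDataFrame can also be indexed by a key object (sketch, assuming gx from above)
gx[k] # first group as a SubDataFrame
gx[ntk] # same group selected by key value # or gx[tk]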
## handle missing
groupby(x, :id) # rows with missing :id form the last group (use skipmissing=true to drop them)
# combine, select, transform
x.id2 = axes(x, 1) # add a row-number col
@chain x begin
    groupby(:id)
    combine(:id2, :v => mean => :res) # :id2 kept from source
end
# or several aggregations at once:
combine(groupby(x, :id), :v => mean => :res1, :v => sum => :res2, nrow => :n) # src => func => target
combine(groupby(x, :id)) do sdf # sub df
    n = nrow(sdf)
    n < 25 ? DataFrame() : DataFrame(n=n)
end
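# the same result sketched with nrow + a Pair-predicate filter on the combined table
res25 = combine(groupby(x, :id), nrow => :n)
filter(:n => >=(25), res25)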
## produce multiple cols in 1 operation
combine(:val => (v -> (min=minimum(v), max=maximum(v))) => AsTable) # col names taken from the NamedTuple
combine(:val => (v -> (min=minimum(v), max=maximum(v))) => [:c1, :c2]) # custom col names
## unnest
df = DataFrame(a=[(p=1, q=2), (p=3, q=4)])
select(df, :a => AsTable)
## broadcast
combine([:x1, :x2] .=> [minimum maximum])
# aggregate using mapcols
mapcols(mean, x)
# map rows & cols using eachcol and eachrow
map(mean, eachcol(x))
foreach(c -> println(c[1], ": ", mean(c[2])), pairs(eachcol(x)))
map(r -> r.x1 / r.x2, eachrow(x))
er = eachrow(x) # & ec = eachcol(x)
er.x1
# transpose
df.names = ["a", "b", "c"]
permutedims(df, :names)
using FreqTables
df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000))
ft = freqtable(df, :a, :b)
prop(ft, margins=1) # proportions within rows; margins=2 for columns
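# overall and column-wise proportions (sketch using the same FreqTables API)
prop(ft) # whole-table proportions
prop(ft, margins=2) # proportions within columns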
using DataFramesMeta
df = DataFrame(x=1:8, y='a':'h', z=repeat([true, false], outer=4))
@with(df, :x + :z) # create hard scope so vars do not leak out
@subset(df, :x .< 4, :z .== true) # filtering
@select(df, :x, :y = 2 * :x, :z = :y) # create new df
@transform(df, :x = 2 * :x, :y = :x) # create new df adding cols based on the old ones
@orderby(df, :z, -:x) # sort into new df, less powerful than sort but lightweight
using Chain
@chain df begin
    @subset(:x .< 5)
    @orderby(:z)
    @transform(:x² = :x .^ 2)
    @select(:z, :x, :x²)
end
# grouped df
df = DataFrame(a=1:8, b=repeat(1:4, outer=2)) # example df with numeric :a and :b for the examples below
g = groupby(df, :b)
@by(df, :b, :first=first(:a), :last=last(:a), :mean=mean(:a)) # or
@combine(g, ...) # or
combine(g, :a .=> [first, last, mean])
@transform(g, :center = mean(:a), :centered = :a .- mean(:a))
DataFrame(g) # ordered by :b
@transform(g) # orig order
# rowwise operations
@transform(df, :x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no"))
myfun(a, b) = a > 6 && b == 4 ? "yes" : "no"
@transform(df, :x = myfun.(:a, :b)) # or
@eachrow df begin
    @newcol x::Vector{String}
    :x = :a > 6 && :b == 4 ? "yes" : "no"
end
# in df.jl:
transform(df, [:a, :b] => ByRow((a,b) -> ifelse(a > 6 && b == 4, "yes", "no")) => :x)
# constructors
DataFrame(A=1:3, C=randstring.([3, 3, 3]), fixed=1) # randstring is from the Random stdlib
x = Dict("A" => [1,2], "fixed" => Ref([1,1])) # Ref to protect a vector from being expanded into rows
DataFrame(x)
DataFrame(:B => [true, false]) # symbols preferred (faster than strings)
DataFrame("B" => [true, false])
DataFrame([:A => [1,2], :C => ['a','b'], :fixed => "const"]) # a vector of pairs also works
DataFrame([rand(3) for i in 1:3], [:x1, :x2, :x3]) # or ["x1", "x2", "x3"]
DataFrame(permutedims([1, 2, 3]), :auto) # auto generate col names
v = [(a=1, b=2), (a=3, b=4)] # NamedTuples
DataFrame(v)
# or
n = (a=1:3, b=11:13)
DataFrame(n)
DataFrame(rand(3,4), Symbol.('a':'d')) # from matrix, or string.('a':'d')
DataFrame(A=Int[], B=Float64[], C=String[]) # predefined cols & types
x = DataFrame(a=1:2, b='a':'b')
y = copy(x) # or y = DataFrame(x) # copycols=false -> (x.a === y.a) is true
(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a)
# (false, true, true, false) # === is false for distinct objects
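# with copycols=false the columns are shared rather than copied (sketch based on the note above)
y2 = DataFrame(x, copycols=false)
(x === y2), (x.a === y2.a) # (false, true)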
similar(x, 0) # 2nd arg: number of rows (here an empty df with the same col names & types)
sdf = view(x, [1,1], :) # SubDataFrame
DataFrame(sdf)
dfr = x[1,:] # DataFrameRow
DataFrame(dfr)
Matrix(x) # or Array(x)
# Matrix{Union{Missing, Int64}} contains `missing`
ct = Tables.columntable(x) # convert df to NamedTuples of vectors
rt = Tables.rowtable(x) # convert df to vector of NamedTuples
DataFrame(ct) # convert back
ec = eachcol(x) # vector-like but not an AbstractVector
ec[1] # or ec[:a], ec["a"]
er = eachrow(x) # is AbstractVector
er[end]
# lazy iterator is type stable
nti = Tables.namedtupleiterator(x)
for row in enumerate(nti)
    @show row
end
DataFrame(nti) # convert back
df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true)
# `nothing` not displayed in ipynb
empty(df) # or empty!() to remove all rows from df
x = DataFrame(A=[1, 2], B=[1.0, missing], C=["a", "b"])
# dims of df
size(x) # or
nrow(x), ncol(x) # (2,3)
size(x, 1), size(x, 2)
# 2, 3
describe(x, cols=1:2)
names(x)
# "A", "B", "C"
names(x, String) # get col name with given `eltype`
# "C"
propertynames(x)
# :A, :B, :C # vec of `Symbols`
eltype.(eachcol(x))
y = DataFrame(rand(1:10, 1000, 10), :auto)
first(y, 5) # first 5 rows
last(y, 3)
first(y) # or last(y) # only 1 row
# display certain nums of rows & cols
ENV["LINES"] = 10
ENV["COLUMNS"] = 200
x.A, x[!, 1], x[!, :A] # without copy
x."A", x[!,"A"]
x[:, 1] # `:` creates a copy, so it is safe & preferred except in performance-critical code
x[:, 1] === x[:, 1] # false
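# by contrast, `!` returns the stored column itself, so no copy is made
x[!, 1] === x[!, 1] # true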
x[1:1, :] # grab 1 row as a df
x[1, :] # produces a DataFrameRow, similar to a NamedTuple
x[Not(1), r"A"] # use Regex to select col and `Not` from InvertedIndices.jl
x[!, Not(1)] # `!` means no copy, so mutating the result can corrupt x
x[1:2, 1:2] .= 1 # can also broadcast a vector ([1, 2]) or assign a df: x[1:2, 1:2] = DataFrame([5 6; 7 8], [:A, :B])
x[:, Cols( "x1", Between(:x2, :x4) )]
@view x[1:2, 1:2]
df = DataFrame()
x = [1, 2, 3]
df.a = x # add new col to df
df.a === x # true # no copy
df[!, :b] = x # no copy
df[:, :c] = x # copy; with .= both `!` and `:` allocate a fresh vector for a new col
df.c === x # false # copy
df.a === df.b === x # true # no copy
# aliasing can lead to silent errors:
for (n, c) in pairs( eachcol(df) )
    println("$n: ", pop!(c))
end
# a: 3, b: 2, c: 3 # :a and :b share one vector, so pop! on :a already removed the 3 seen by :b
collect( pairs(eachcol(df)) )
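# to avoid such aliasing, let the constructor copy the columns (sketch)
df_safe = DataFrame(a=x, b=x) # the keyword constructor copies by default
df_safe.a === df_safe.b # false # independent vectors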
df2 = copy(df)
df == df2 # comparison true
df3 = df2 .+ eps()
df == df3 # false
isapprox(df, df3) # true
isapprox(df, df3, atol=eps()/2) # false
df = DataFrame(a=missing)
df == df # missing
df === df # or isequal(df, df) # true
ismissing(1), ismissing(missing)
missing == missing, missing != missing, missing < missing # or
1 == missing, 1 != missing, 1 < missing
# (missing, missing, missing)
isequal(missing, missing), missing === missing, isless(1, missing)
# true, true, true # in sorting order, missing is greater than any number
collect(skipmissing([1, missing, 2, missing]))
# [1, 2]
replace([1.0, missing, 2.0, missing], missing=>NaN) # or
coalesce.([1.0, missing, 2.0, missing], NaN)
# or
using CategoricalArrays
recode([1.0, missing, 2.0, missing], false, missing=>true)
# [0, 1, 0, 1]
df = DataFrame(a=[1,2,missing], b=["a","b",missing])
replace!(df.a, missing=>100)
df.b = coalesce.(df.b, 100) # type changed to Any
# get unique values with or without missings
unique([1, missing, 2, missing]), levels([1, missing, 2, missing])
x = [1, 2, 3]
y = allowmissing(x)
z = disallowmissing(y) # would fail if y contained missing
df = allowmissing(DataFrame(ones(2,3), :auto))
df[1, 1] = missing
disallowmissing(df) # throw error
disallowmissing(df, error=false)
# after using allowmissing!
# eltype.(eachcol(x)) = Union{Missing, Int64}
x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"])
completecases(x)
# Bool[1, 0, 0, 1]
y = dropmissing(x)
dropmissing!(x)
# or
dropmissing!(x, disallowmissing=false)
string(missing) # "missing"
lift_string = passmissing(string)
lift_string(missing) # missing
lift_string(missing, " ", missing) # missing
df = DataFrame(a=[1,missing,missing], b=[1,2,missing])
sum.(skipmissing.(eachrow(df))) # eachrow is type unstable, so use
sum.(skipmissing.(Tables.namedtupleiterator(df)))
x = DataFrame(A=[true, false, true], B=[1, 2, missing],
C=[missing, "b", "c"], D=['a', missing, 'c'])
eltype.(eachcol(x))
CSV.write("x1.csv", x)
print(read("x1.csv", String))
y = CSV.read("x1.csv", DataFrame)
# type of col :D changed to a string type
## Serialization
### Way 1
using Serialization
open("x.bin", "w") do io
    serialize(io, x)
end
y = open(deserialize, "x.bin")
### Way 2
JDF.save("x.jdf", x)
x_loaded = JDF.load("x.jdf") |> DataFrame
isequal(x_loaded, x) # true
# JDF can load certain cols to help work with large files
x_ondisk = jdf"x.jdf"
names(x_ondisk) # :A :B :C :D
xd = JDF.load(x_ondisk; cols=["A", "D"]) |> DataFrame
### Way 3
JLSO.save("x.jlso", :data => x)
y = JLSO.load("x.jlso")[:data]
# JSON (via JSONTables.jl)
open(io -> arraytable(io, x), "x1.json", "w")
open(io -> objecttable(io, x), "x2.json", "w")
print(read("x1.json", String))
y1 = open(jsontable, "x1.json") |> DataFrame
# Apache Arrow format for interchange with R/Py
Arrow.write("x.arrow", x)
y = Arrow.Table("x.arrow") |> DataFrame
y.A[1] = false # ReadOnlyMemoryError() # cols of y are immutable
y.A # Arrow.BoolVector{Bool} # custom vector type
y.B # Arrow.Primitive{Union{Missing, Int64}, Vector{Int64}}
y2 = copy(y) # get standard Julia Base vectors
# gzip compression
df = DataFrame(rand(1:10, 10, 1000), :auto)
# CodecZlib.jl
open("df_compress_test.csv.gz", "w") do io
stream = GzipCompressorStream(io)
CSV.write(stream, df)
close(stream)
end
df2 = CSV.File(transcode(GzipDecompressor, Mmap.mmap("df_compress_test.csv.gz"))) |> DataFrame
# ZipFile.jl
w = ZipFile.Writer("x.zip")
f1 = ZipFile.addfile(w, "x1.csv")
write(f1, sprint(show, "text/csv", x)) # write the csv content into the archive entry
close(w)
z = ZipFile.Reader("x.zip")
index_xcsv = findfirst(f -> f.name == "x1.csv", z.files)
df1_2 = CSV.read(read(z.files[index_xcsv]), DataFrame)
close(z)
# rename
x = DataFrame(rand(Bool, 3, 4), :auto)
rename(x, :x1 => :A)
rename!(c -> c^2, x) # apply a func to every col name (string^2 repeats it)
rename(x, 3 => :third) # change 3rd col
rename!(x, [:a, :b, :c, :d])
rename!(x, string.('a':'d'))
rename!(x, "a"=>"d", "d"=>"a") # allow circular renaming
rename(x, fill(:a, 4), makeunique=true)
# reorder cols
using Random
Random.seed!(1234)
x[:, shuffle(names(x))]
select!(x, 4:-1:1)
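# handy idiom: move one col to the front, keeping the rest in order (sketch)
select(x, :d, :) # `:` appends the remaining cols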
# merge/add
x = DataFrame([(i,j) for i in 1:3, j in 1:4], :auto)
hcat(x, x, makeunique=true)
y = hcat(x, [1,2,3], makeunique=true)
hcat([1,2,3], x, makeunique=true) # or
y = [x DataFrame(A=[1,2,3])]
insertcols!(y, 2, "newcol"=>[1,2,3])
insertcols!(y, 2, :newcol=>[1,2,3], makeunique=true)
insertcols!(x, :A => [1,2,3]) # index omitted: append as the last col
insertcols!(x, 1, :B => [1,2,3])
df = DataFrame(a = [1,2,3])
insertcols!(df, :b => "x", :c => 'a':'c', :d => Ref([1,2,3])) # scalars/Refs are broadcast if needed
df1 = DataFrame(a=[1,2])
df2 = DataFrame(b=[2,3], c=[3,4])
insertcols!(df1, pairs(eachcol(df2))...)
# subset/remove cols
x = DataFrame([(i, j) for i in 1:3, j in 1:5], :auto) # example df with cols :x1 ... :x5
x[:, [1,2,4,5]]
x[:, [:x1, :x4]] # by col name
x[:, [true, false, true, false, true]]
x[:, [:x1]] # single-col df
using InvertedIndices
x[!, r"[12]"]
x[!, Not(1)]
x[!, Between(:x2, :x4)]
x[!, Cols(:x1, Between(:x3, :x5))] # or
select(x, :x1, Between(:x3, :x5), copycols=false)
df2 = select(df, [1, 2])
select(df, Not([1,2]))
# views
@view x[:, [1,3,5]] # faster
# modify col by name
x[!, :x1] = x[:, :x2]
x[!, :A] = [1, 2, 3]
# find col name
hasproperty(x, :x1) # true
columnindex(x, :x2)
# advanced col selection
df."col 2"
for n in names(df) # interpolate col name by :()
    println(n, "\n", df.:($n), "\n")
end
# work on collection of cols
df = DataFrame(reshape(1:12, 3, 4), :auto)
ec_df = eachcol(df) # DataFrameColumns
ec_df[1] # or
ec_df.x1
# transform
transform(df, [:x1, :x2] => (+) => :res) # src_col => func_to_apply => target_col
using Statistics
combine(df, [:x1, :x2] => cor) # auto-named col, 1 row
select(df, [:x1, :x2] => cor) # same value broadcast to all 3 rows
select(df, :x1, :x2, [:x1, :x2] => ByRow(string)) # apply func on each row
select(df, :x1, :x2, AsTable([:x1, :x2]) => x -> x.x1 + x.x2)
select(df, :x1, nrow => "number_of_rows")
select(df, :x1, :x1 => ByRow(x -> [x ^ 2, x ^ 3]) => ["x1²", "x1³"]) # create multi cols with 1 func # or
select(df, :x1, :x1 => (x -> DataFrame("x1²" => x .^ 2, "x1³" => x .^ 3)) => AsTable)
# select rows
sdf = view(df, 1:3, 1:3) # SubDataFrame without copy
parent(sdf), parentindices(sdf)
dfr = df[3, :] # DataFrameRow
rownumber(dfr)
df[!, :Z] .= 1
df = DataFrame(a=1:4)
dfv = view(df, [3,2], :)
dfr = dfv[1, :]
# reorder
x = DataFrame(id=1:12, x=rand(12), y=[zeros(6); ones(6)])
issorted(x), issorted(x, :x) # true, false
sort!(x, :x) # in place
y = sort(x, :id) # create new df
sort(x, [:y, :x], rev=[true, false]) # sort by 2 cols, 1st decr, 2nd incr # or
sort(x, [order(:y, rev=true), :x])
sort(x, [order(:y, rev=true), order(:x, by=v -> -v)]) # `by` transforms values before comparison, so this is like rev=true on :x
x[shuffle(axes(x, 1)), :] # reorder rows randomly
sort!(x, :id)
x[[1,10], :] .= x[[10,1], :] # swap rows
# merge/add rows
x = DataFrame(rand(3, 5), :auto) # example df with cols :x1 ... :x5
[x; x] # merge by rows
reduce(vcat, [x, x, x]) # efficient
y = x[:, reverse(names(x))]
vcat(x, y)
vcat(x, y[:, 1:3], cols=:intersect) # or :union
vcat(x, y[:, 1:3], cols=[:x1, :x5])
append!(x, y) # modify in place
repeat(x, 2) # twice rows
push!(x, 1:5) # add 1 row to the end
push!(x, Dict(:x1=>11, :x2=>12, :x3=>13, :x4=>14, :x5=>15))
push!(x, (x2=2, x1=1, x4=4, x3=3, x5=5)) # NamedTuples via name matching
push!(x, x[1, :]) # DataFrameRow
source = [(a=1, b=2), (a=missing, b=10, c=20), (b="s", c=1, d=1)]
df = DataFrame()
for row in source
    push!(df, row, cols=:union)
end
# subset/remove rows
x = DataFrame(id=1:10, val='a':'j')
x[1,:] # a DataFrameRow
x[1:1, :] # a df
view(x, 1:2, :)
x[repeat([true, false], 5), :] # or
view(x, repeat([true, false], 5), :)
delete!(x, 6:7)
x[Not(1:2), :] # create new df
# filter
filter(r -> r.x1 > 2.5, x, view=true) # or
filter(:x1 => >(2.5), x)
filter!(x) do r
    if r.x1 > 2.5
        return r.x2 < 4.5
    end
    return false # the predicate must return a Bool for every row
end
df = DataFrame(x=1:12, y=mod1.(1:12, 4))
filter(row -> row.y in [1,4], df) # or
filter(:y => in([1,4]), df) # or
df[in.(df.y, Ref([1,4])), :]
subset(x, :x1 => x -> x .< mean(x), :x2 => ByRow(<(2.5))) # multi conditions
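# subset also accepts skipmissing=true to treat missing conditions as false (sketch)
subset(x, :x1 => ByRow(>(2.5)), skipmissing=true)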
# deduplicate
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x)
x.C = 1:4
unique(x, [1,2]) # first row for each unique combination of the given cols
unique(x) # whole rows
nonunique(x, :A) # Bool indicators of duplicate rows (based on col :A)
unique!(x, :B)
# extract one row from a df into standard collections
cols = [:y, :z]
Vector(x[1, cols]) # or
Array(x[1, cols])
[Vector(x[i, cols]) for i in axes(x, 1)]
copy(x[1, cols]) # convert a dfr to NamedTuple
Tuple(x[1, cols])
# work with a collection of rows of df
er_df = eachrow(df)
er_df[1]
last(er_df)
er_df[end]
er_df.x1
# flatten a df
df = DataFrame(a='a':'c', b=[[1,2,3], [4,5], 6])
flatten(df, :b)
# only 1 row
df = DataFrame(a=1)
only(df)
df2 = repeat(df, 2)
only(df2) # ArgumentError: df must contain exactly 1 row
innerjoin(x, y, on=:ID=>:id, matchmissing=:equal) # by default missing is not allowed in the `on` cols
leftjoin(x, y, on="ID"=>"id", matchmissing=:equal) # or rightjoin(), which keeps ids of the 2nd table
outerjoin() # keep ids from both tables
semijoin()
antijoin()
crossjoin(DataFrame(x=[1, 2]), DataFrame(y=["a","b","c"]))
innerjoin(x, y, on=[:id1, :id2], matchmissing=:equal)
outerjoin(x, y, on=:id1, makeunique=true, indicator=:source, matchmissing=:equal) # duplicate keys produce all combinations of matching rows
innerjoin(x, y, on=[:id1, :id2=>:ID2])
xs = [DataFrame("id" => 1:6, "v$i" => (1:6) .+ 10i) for i in 1:5] # join more than 2 dfs
innerjoin(xs..., on=:id)
# wide to long
stack(x, [:M1, :M2], :id, variable_name="key", value_name="observed")
stack(x, Not([:id1, :id2])) # or
stack(x, Not([1, 2])) # use indices instead of symbols
stack(DataFrame(rand(3,2), :auto))
# long to wide
unstack(y, :id2, :variable, :value)
unstack(y, renamecols=n -> string("unstacked_", n))
# unable to unstack when no key col present
import Downloads
using Chain
ENV["LINES"] = 20
# Data preprocessing
Downloads.download("http:...csv", "participation.csv")
readlines("participation")
df_raw = CSV.read("participation.csv", DataFrame)
df = select(df_raw,
    :lfp => x -> recode(x, "yes" => 1, "no" => 0), # Labor Force Participation
    :lnnlinc,
    :age,
    :age => ByRow(x -> x^2) => "age^2",
    Between(:educ, :noc),
    :foreign => categorical,
    renamecols=false)
# exploratory analysis
@chain df begin
    groupby(:lfp)
    combine(names(df, Real) .=> mean) # shorter than
    # combine([:lnnlinc, :age, :educ, :nyc, :noc] .=> mean)
end
@chain df begin
    groupby([:lfp, :foreign])
    combine(nrow)
    unstack(:lfp, :foreign, :nrow) # into a pivot table
    select(:lfp, [:no, :yes] => ByRow((x, y) -> y / (x + y)) => :foreign_yes)
end
# or, shorter:
@chain df begin
    groupby(:lfp)
    combine(:foreign => (x -> mean(x .== "yes")) => :foreign_yes)
end
gd = groupby(df, :lfp)
gd[(lfp=0,)] # select by val, not position
gd[Dict(:lfp => 0)]
gd[(0,)]
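# check for a group key before indexing (sketch)
haskey(gd, (lfp=0,)) # true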
using StatsPlots
@df df density(:age, group=:lfp)
# Build a predictive model
using GLM
probit = glm(Term(:lfp) ~ sum(Term.(propertynames(df)[2:end])),
             df, Binomial(), ProbitLink()) # or use a literal formula:
# @formula(lfp ~ lnnlinc + age + age^2 + educ + nyc + noc + foreign)
df_pred = DataFrame(lnnlinc=10.0, age=2.0:0.01:6.2, educ=9, nyc=0, noc=1, foreign="yes") # vary :age, keep others constant
insertcols!(df_pred, "age^2" => df_pred.age .^ 2) # the model also needs the squared-age col
probit_pred = predict(probit, df_pred, interval=:confidence)
plot(df_pred.age, Matrix(probit_pred)) # prediction with lower/upper confidence bounds
# advanced functionalities: bootstrapping
function boot_sample(df)
    df_boot = df[rand(1:nrow(df), nrow(df)), :]
    probit_boot = glm(Term(:lfp) ~ sum(Term.(propertynames(df_boot)[2:end])),
                      df_boot, Binomial(), ProbitLink())
    return (; (Symbol.(coefnames(probit_boot)) .=> coef(probit_boot))...)
end
function run_boot(df, reps)
    coef_boot = DataFrame()
    for _ in 1:reps
        push!(coef_boot, boot_sample(df))
    end
    return coef_boot
end
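# run the bootstrap before summarizing; the replication count here is an arbitrary example choice
coef_boot = run_boot(df, 1000)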
conf_boot = mapcols(x -> quantile(x, [0.025, 0.975]), coef_boot)
confint(probit)
conf_param = DataFrame(permutedims(confint(probit)), names(conf_boot))
append!(conf_boot, conf_param)
insertcols!(conf_boot, 1, :statistic => ["", "", ])
conf_boot_t = permutedims(conf_boot, :statistic)
select!(conf_boot_t, :statistic, :estimate, 3:6 .=> x -> abs(x .- conf_boot_t.estimate), renamecols=false)
scatter(0.05 .+ (1:8), conf_boot_t.estimate, yerror=(conf_boot_t."boot lo", conf_boot_t."boot hi"))