Last active
April 4, 2024 08:04
-
-
Save EuRoXy/4f25e88ab01acd28dcf0ed3530aaf38c to your computer and use it in GitHub Desktop.
memo for DataFrames.jl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# split-apply-combine | |
gx = groupby(x, [:id1, :id2]) | |
parent(gx) | |
vcat(gx...) # back to df # or | |
DataFrame(gx, keepkeys=false) | |
groupcols(gx) | |
valuecols(gx) | |
groupindices(gx) | |
k = keys(gx)[1] | |
ntk = NamedTuple(k) | |
tk = Tuple(k) | |
## handle missing | |
groupby(x, :id) # last group missing | |
# combine, select, transform | |
x.id2 = axes(x, 1) # keep col from src | |
@chain x begin | |
groupby(:id) | |
combine(:id2, :v => mean => :res) | |
combine(:v => mean => :res1, :v => sum => :res2, nrow => :n) # src => func => target | |
end | |
combine(groupby(x, :id)) do sdf # sub df | |
n = nrow(sdf) | |
n < 25 ? DataFrame() : DataFrame(n=n) | |
end | |
## produce multiple cols in 1 operation | |
combine(:val => (x -> [x]) => AsTable) # | |
combine(:val => () => [:c1, :c2]) # custom col name | |
## unnest | |
df = DataFrame(a=[(p=1, q=2), (p=3, q=4)]) | |
select(df, :a => AsTable) | |
## broadcast | |
combine([:x1, :x2] .=> [minimum maximum]) | |
# aggregate using mapcols | |
mapcols(mean, x) | |
# map rows & cols using eachcol and eachrow | |
map(mean, eachcol(x)) | |
foreach(c -> println(c[1], ": ", mean(c[2])), pairs(eachcol(x))) | |
map(r -> r.x1 / r.x2, eachrow(x)) | |
er = eachrow(x) # & ec = eachcol(x) | |
er.x1 | |
# transpose | |
df.names = ["a", "b", "c"] | |
permutedims(df, :names) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using FreqTables | |
df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) | |
ft = freqtable(df, :a, :b) | |
prop(ft, margins=1) # calc proportions in rows 2 in cols | |
using DataFramesMeta | |
df = DataFrame(x=1:8, y='a':'h', z=repeat([true, false], outer=4)) | |
@with(df, :x + :z) # create hard scope so vars do not leak out | |
@subset(df, :x .< 4, :z .== true) # filtering | |
@select(df, :x, y = 2*:x, z=:y) # create new df | |
@transform(df, :x = 2*:x, :y=:x) # create new df adding cols based on old one | |
@orderby(df, :z, -:x) # sort into new df, less powerful than sort but lightweight | |
using Chain | |
@chain df begin | |
@subset(:x .< 5) | |
@orderby(:z) | |
@transform(:x^2 = :x .^ 2) | |
@select(:z, :x, :x^2) | |
end | |
# grouped df | |
g = groupby(df, :b) | |
@by(df, :b, :first=first(:a), :last=last(:a), :mean=mean(:a)) # or | |
@combine(g, ...) # or | |
combine(g, .a .=> [first, last, mean]) | |
@transform(g, :center = mean(:a), :centered = :a .- mean(:a)) | |
DataFrame(g) # ordered by :b | |
@transform(g) # orig order | |
# rowwise operations | |
@transform(df, :x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) | |
myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" | |
@transform(df, :x = myfun.(:a, :b)) # or | |
@eachrow df begin | |
@newcol x::Vector{String} | |
:x = :a > 6 && :b == 4 ? "yes" : "no" | |
end | |
# in df.jl: | |
transform(df, [:a, :b] => ByRow((a,b) -> ifelse(a > 6 && b == 4, "yes", "no")) => :x) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DataFrame(A=1:3, C=randstring.(3,3,3), fixed=1) | |
x = Dict("A" => [1,2], fixed => Ref([1,1])) # Ref to protect vector | |
DataFrame(x) | |
DataFrame(:B => [true, false]) # symbols (preferred, coz faster) | |
DataFrame("B" => [true, false]) | |
DataFrame( [:A => [1,2], :C => ['a','b'], :fixed => "const"] ) # pass a vec of pairs also do | |
DataFrame([rand(3) for i in 1:3], [:x1, :x2, :x3]) # or ["x1", "x2", "x3"] | |
DataFrame(permutedims([1, 2, 3]), :auto) # auto generate col names | |
v = [(a=1, b=2), (a=3, b=4)] # NamedTuples | |
DataFrame(v) | |
# or | |
n = (a=1:3, b=11:13) | |
DataFrame(n) | |
DataFrame(rand(3,4), Symbol.('a':'d')) # from matrix, or string.('a':'d') | |
DataFrame(A=Int[], B=Float64[], C=String[]) # predefined cols & types | |
x = DataFrame(a=1:2, b='a':'b') | |
y = copy(x) # or y = DataFrame(x) # copycols=false -> (x.a === y.a) is true | |
(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a) | |
# (f, t, t, f) ## === f | |
similar(x, 0) # 2. arg: n_rows | |
sdf = view(x, [1,1], :) # SubDataFrame | |
DataFrame(sdf) | |
dfr = x[1,:] # DataFrameRow | |
DataFrame(dfr) | |
Matrix(x) # or Array(x) | |
# Matrix{Union{Missing, Int64}} contains `missing` | |
ct = Tables.columntable(x) # convert df to NamedTuples of vectors | |
rt = Tables.rowtable(x) # convert df to vector of NamedTuples | |
DataFrame(ct) # convert back | |
ec = eachcol(x) # vec but not AbstractVector | |
ec[1] # or ec["x"] | |
er = eachrow(x) # is AbstractVector | |
er[end] | |
# lazy iterator is type stable | |
nti = Tables.namedtupleiterator(x) | |
for row in enumerate(nti) | |
@show row | |
end | |
DataFrame(nti) # convert back | |
df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) | |
# `nothing` not displayed in ipynb | |
empty(df) # or empty!() to remove all rows from df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
x = DataFrame(A=[1, 2], B=[1.0, missing], C=["a", "b"]) | |
# dims of df | |
size(x) # or | |
nrow(x), ncol(x) # (2,3) | |
size(x, 1), size(x, 2) | |
# 2, 3 | |
describe(x, cols=1:2) | |
names(x) | |
# "A", "B", "C" | |
names(x, String) # get col name with given `eltype` | |
# "C" | |
propertynames(x) | |
# :A, :B, :C # vec of `Symbols` | |
eltype.(eachcol(x)) | |
y = DataFrame(rand(1:10, 1000, 10), :auto) | |
first(y, 5) # first 5 rows | |
last(y, 3) | |
first(y) # or last(y) # only 1 row | |
# display certain nums of rows & cols | |
ENV["LINES"] = 10 | |
ENV["COLUMNS"] = 200 | |
x.A, x[!, 1], x[!, :A] # without copy | |
x."A", x[!,"A"] | |
x[:, 1] # : create a copy thus safe & preferred except for critical performance | |
x[:, 1] === x[:, 1] # false | |
x[1:1, :] # grob 1 row | |
x[1, :] # produce 1 DataFrameRow similar to NamedTuple | |
x[Not(1), r"A"] # use Regex to select col and `Not` from InvertedIndices.jl | |
x[!, Not(1)] # ! indicate not copy thus unsafe as might corrupt | |
x[1:2, 1:2] .= 1 # or [1, 2] # DataFrame([5 6; 7 8], [:A, :B]) | |
x[:, Cols( "x1", Between(:x2, :x4) )] | |
@view x[1:2, 1:2] | |
df = DataFrame() | |
x = [1, 2, 3] | |
df.a = x # add new col to df | |
df.a === x # true # no copy | |
df[!, :b] = x # or df[:, :c] = x | |
# or .= # copy, in this case `!` same effect as `:` | |
df.c === x # false # copy | |
df.a === df.b === x # true # no copy | |
# could lead to silent error | |
for (n, c) in pairs( eachcol(df) ) | |
println("$n: ", pop!(c)) | |
end | |
# b: 2 , others: 3 # pop! removed 3 on col:a | |
collect( pairs(eachcol(df)) ) | |
df2 = copy(df) | |
df == df2 # comparison true | |
df3 = df2 .+ eps() | |
df == df3 # false | |
isapprox(df, df3) # true | |
isapprox(df, df3, atol=eps()/2) # false | |
df = DataFrame(a=missing) | |
df == df # missing | |
df === df # or isequal(df, df) # true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ismissing(1), ismissing(missing) | |
missing == missing, missing != missing, missing < missing # or | |
1 == missing, 1 != missing, 1 < missing | |
# (missing, missing, missing) | |
isequal(missing, missing), missing === missing, isless(1, missing) | |
# true, true, true # missing considered > any numeric | |
collect(skipmissing([1, missing, 2, missing])) | |
# [1, 2] | |
replace([1.0, missing, 2.0, missing], missing=>NaN) # or | |
coalesce.([1.0, missing, 2.0, missing], NaN) | |
# or | |
using CategoricalArrays | |
recode([1.0, missing, 2.0, missing], false, missing=>true) | |
# [0, 1, 0, 1] | |
df = DataFrame(a=[1,2,missing], b=["a","b",missing]) | |
replace!(df.a, missing=>100) | |
df.b = coalesce.(df.b, 100) # type changed to Any | |
# get unique values with or without missings | |
unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) | |
x = [1, 2, 3] | |
y = allowmissing(x) | |
z = disallowmissing(y) # would fail if y contain missing | |
df = allowmissing(DataFrame(ones(2,3), :auto)) | |
df[1, 1] = missing | |
disallowmissing(df) # throw error | |
disallowmissing(df, error=false) | |
# after using allowmissing! | |
# eltype.(eachcol(x)) = Union{Missing, Int64} | |
x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) | |
completecases(x) | |
# Bool[1, 0, 0, 1] | |
y = dropmissing(x) | |
dropmissing!(x) | |
# or | |
dropmissing!(x, disallowmissing=false) | |
string(missing) # "missing" | |
lift_string = passmissing(string) | |
lift_string(missing) # missing | |
lift_string(missing, " ", missing) # missing | |
df = DataFrame(a=[1,missing,missing], b=[1,2,missing]) | |
sum.(skipmissing.(eachrow(df))) # eachrow is type unstable, so use | |
sum.(skipmissing.(Tables.namedtupleiterator(df))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
x = DataFrame(A=[true, false, true], B=[1, 2, missing], | |
C=[missing, "b", "c"], D=['a', missing, 'c']) | |
eltype.(eachcol(x)) | |
CSV.write("x1.csv", x) | |
print(read("x1.csv", String)) | |
y = CSV.read("x1.csv", DataFrame) | |
# type pf col :D changed to string | |
## Serialization | |
### Way 1 | |
open("x.bin", "w") do io | |
serialize(io, x) | |
end | |
y = open(deserialize, "x.bin") | |
### Way 2 | |
JDF.save("x.jdf", x) | |
x_loaded = JDF.load("x.jdf") |> DataFrame | |
isequal(x_loaded, x) # true | |
# JDF can load certain cols to help work with large files | |
x_ondisk = jdf"x.jdf" | |
names(x_ondisk) # :A :B :C :D | |
xd = JDF.load(x_ondisk; cols=["A", "D"]) |> DataFrame | |
### Way 3 | |
JLSO.save("x.jlso", :data => x) | |
y = JLSO.load("x.jlso")[:data] | |
# JSON | |
open(io -> arraytable(io, x), "x1.json", "w") | |
open(io -> objecttable(io, x), "x2.json", "w") | |
print(read("x1.json", String)) | |
y1 = open(jsontable, "x1.json") |> DataFrame | |
# Apache Arrow fmt for interchange with R/Py | |
Arrow.write("x.arrow", x) | |
y = Arrow.Table("arrow") |> DataFrame | |
y.A[1] = false # ReadOnlyMemoryError() # cols of y immutable | |
y.A # Arrow.BoolVector{Bool} # custom vector type | |
y.B # Arrow.Primitive{Union{Missing, Int64}, Vector{Int64}} | |
y2 = copy(y) # get standard Julia Base vector | |
# gzip compression | |
df = DataFrame(rand(1:10, 10, 1000), :auto) | |
# CodecZlib.jl | |
open("df_compress_test.csv.gz", "w") do io | |
stream = GzipCompressorStream(io) | |
CSV.write(stream, df) | |
close(stream) | |
end | |
df2 = CSV.File(transcode(GzipDecompressor, Mmap.mmap("df_compress_test.csv.gz"))) |> Dataframe | |
# ZipFile.jl | |
w = ZipFile.Writer("x.zip") | |
f1 = ZipFile.addfile(w, "x1.csv") | |
close(w) | |
z = ZiipFile.Reader("x.zip") | |
index_xcsv = findfirst(x->x.name == "x1.csv", z.files) | |
df1_2 = CSV.read((read(z.files[index_xcsv]), DataFrame) | |
close(z) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# rename | |
x = DtatFrame(rand(Bool, 3, 4), :auto) | |
rename(x, :x => :A) | |
rename!(c -> c^2, x) # apply func to every colnam | |
rename(x, 3 => :third) # change 3rd col | |
rename!(x, [:a, :b, :c, :d]) | |
rename!(x, string.('a':'d')) | |
rename!(x, "a"=>"d", "d"=>"a") # allow circular renaming | |
rename(x, fill(:a, 4), makeunique=true) | |
# reorder cols | |
Random.seed!(1234) | |
x[:, shuffle(names(x))] | |
select!(x, 4:-1:1) | |
# merge/add | |
x = DataFrame([(i,j) for i in 1:3, j in 1:4], :auto) | |
hcat(x, x, makeunique=true) | |
y = hcat(x, [1,2,3], makeunique=true) | |
hcat([1,2,3], x, makeunique=true) # or | |
y = [x DataFrame(A=[1,2,3])] | |
insertcols!(y, 2, "newcol"=>[1,2,3]) | |
insertcols!(y, 2, :newcol=>[1,2,3], makeunique=true) | |
insertcols!(x, :A => [1,2,3]) # append # drop index | |
insertcols!(x, 1, :B => [1,2,3]) | |
df = DataFrame(a = [1,2,3]) | |
insertcols!(df, :b => "x", :c => 'a':'c', :d => Ref([1,2,3]) # broadcast if needed | |
df1 = DataFrame(a=[1,2]) | |
df2 = DataFrame(b=[2,3], c=[3,4]) | |
insertcols!(df1, pairs(eachcol(df2))...) | |
# subset/remove | |
x[:, [1,2,4,5]] | |
x[:, [:x1, :x4]] # by col name | |
x[:, [true, false, true, false, true]] | |
x[:, [:x1]] # single-col df | |
using InvertedIndices | |
x[!, r"[12]"] | |
x[!, Not(1)] | |
x[!, Between(:x2, :x4)] | |
x[!, Cols(:x1, Between(:x3, :x5))] # or | |
select(x, :x1, Between(:x3, :x5), copycols=false) | |
df2 = select(df, [1, 2]) | |
select(df, Not([1,2])) | |
# views | |
@view x[:, [1,3,5]] # faster | |
# modify col by name | |
x[!, :x1] = x[:, :x2] | |
x[!, :A] = [1, 2, 3] | |
# find col name | |
hasproperty(x, :x1) # true | |
columnindex(x, :x2) | |
# advanced col selection | |
df."col 2" | |
for n in names(df) # interpolate col name by :() | |
println(n, "\n", df.:($n), "\n") | |
end | |
# work on collection of cols | |
df = DataFrame(reshape(1:12, 3, 4), :auto) | |
ec_df = eachcol(df) # DataFrameColumns | |
ec_df[1] # or | |
ec_df.x1 | |
# transform | |
transform(df, [:x1, :x2] => (+) => :res) # src_col => func_to_apply => target_col | |
using Statistics | |
combine(df, [:x1, :x2] => cor) # automatically generate 1 col of 1 row | |
select(df, [:x1, :x2] => cor) # 1 col of 3 row | |
select(df, :x1, :x2, [:x1, :x2] => ByRow(string)) # apply func on each row | |
select(df, :x1, :x2, AsTable([:x1, :x2]) => x -> x.x1 + x.x2) | |
select(df, :x1, nrows => "number_of_rows") | |
select(df, :x1, :x1 => ByRow(x -> [x ^ 2, x ^ 3]) => ["x1²", "x1³"]) # create multi cols with 1 func # or | |
select(df, :x1, :x1 => (x -> DataFrame("x1²" => x .^ 2, "x1³" => x .^ 3)) => AsTable) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# select rows | |
sdf = view(df, 1:3, 1:3) # SubDataFrame without copy | |
parent(sdf), parentindices(sdf) | |
dfr = df[3, :] # DataFrameRow | |
rownumber(dfr) | |
df[!, :Z] .= 1 | |
df = DataFrame(a=1:4) | |
dfv = view(df, [3,2], :) | |
dfr = dfv[1, :] | |
# reorder | |
x = DataFrame(id=1:12, x=rand(12), y=[zeros(6); ones(6)]) | |
issorted(x), issorted(x, :x) # true, false | |
sort!(x, :x) # in place | |
y = sort(x, :id) # create new df | |
sort(x, [:y, :x], rev=[true, false]) # sort by 2 cols, 1st decr, 2nd incr # or | |
sort(x, [order(:y, rev=true), :x]) | |
sort(x, [order(:y, rev=true), order(:x, by=v->-v)]) # ?? | |
x[shuffle(1:10), :] # reorder randomly | |
sort!(x, :id) | |
x[[1,10], :] .= x[[10,1], :] # swap rows | |
# merge/add rows | |
[x; x] # merge by rows | |
reduce(vcat, [x, x, x]) # efficient | |
y = x[:, reverse(names(x))] | |
vcat(x, y) | |
vcat(x, y[:, 1:3], cols=:intersect) # or :union | |
vcat(x, y[:, 1:3], cols=[:x1, :x5]) | |
append!(x, y) # modify in place | |
repeat(x, 2) # twice rows | |
push!(x, 1:5) # add 1 row to the end | |
push!(x, Dict(:x1=>11, :x2=>12, :x3=>13, :x4=>14, :x5=>15)) | |
push!(x, (x2=2, x1=1, x4=4, x3=3, x5=5)) # NamedTuples via name matching | |
push!(x, x[1, :]) # DataFrameRow | |
source = [(a=1, b=2), (a=missing, b=10, c=20), (b="s", c=1, d=1)] | |
df = DataFrame() | |
for row in source | |
push!(df, row, cols=:union) | |
end | |
# subset/remove | |
x = DataFrame(id=1:10, val='a':'j') | |
x[1,:] # a DataFrameRow | |
x[1:1, :] # a df | |
view(x, 1:2, :) | |
x[repeat([true, false], 5), :] # or | |
view(x, repeat([true, false]), :) | |
delete!(x, 6:7) | |
x[Not(1:2), :] # create new df | |
# filter | |
filter(r -> r.x1 > 2.5, x, view=true) # or | |
filter(:x1 => >(2.5), x) | |
filter!(x) do r | |
if r.x1 > 2.5 | |
return r.x2 < 4.5 | |
end | |
end | |
df = DataFrame(x=1:12, y=modl.(1:12, 4)) | |
filter(row -> row.y in [1,4], df) # or | |
filter(:y => in([1,4]), df) # or | |
df[in.(df.y, Ref([1,4])), :] | |
subset(x, :x1 => x -> x .< mean(x), :x2 => ByRow(<(2.5))) # multi conditions | |
# deduplicate | |
x = DataFrame(A=[1,2], B=["x","y"]) | |
append!(x, x) | |
x.C = 1:4 | |
unique(x, [1,2]) # get first unique row for given index | |
unique(x) # whole row | |
nonunique(x, :A) # get indicators of non-unique rows | |
unique!(x, :B) | |
# extract one row from a df into standard collections | |
cols = [:y, :z] | |
Vector(x[1, cols]) # or | |
Array(x[1, cols]) | |
[Vector(x[i, cols]) for i in axes(x, 1)] | |
copy(x[1, cols]) # convert a dfr to NamedTuple | |
Tuple(x[1, cols]) | |
# work with a collection of rows of df | |
er_df = eachrow(df) | |
er_df[1] | |
last(er_df) | |
er_df[end] | |
er_df.x1 | |
# flatten a df | |
df = DataFrame(a='a':'c', b=[[1,2,3], [4,5], 6]) | |
flatten(df, :b) | |
# only 1 row | |
df = DataFrame(a=1) | |
only(df) | |
df2 = repeat(df, 2) | |
only(df2) # ArgumentError: df must contain exactly 1 row |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
innerjoin(x, y, on=:ID=>:id, matchmissing=:equal) # missing not allowed to join-on be default | |
leftjoin(x, y, on="ID"=>"id", matchmissing=:equal) # or rightjoin() based on 2nd tab ids | |
outerjoin() # include ids from both tabs | |
semijoin() | |
antijoin() | |
crossjoin(DataFrame(x=[1, 2]), DataFrame(y=["a","b","c"])) | |
innerjoin(x, y, on=[:id1, :id2], matchmissing=:equal) | |
outerjoin(x, y, on=:id1, makeunique=true, indicator=:source, matchmissing=:equal) # produce all combis | |
innerjoin(x, y, on=[:id1, :id2=>:ID2]) | |
xs = [DataFrame("id"=>1:6, "v$i"=>((1:6) .+ 10i) for i in 1:5] # join >2 dfs | |
innerjoin(xs..., on=:id) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wide to long | |
stack(x, [:M1, :M2], :id, variable_name="key", value_name="observed") | |
stack(x, Not([:id1, :id2])) # or | |
stack(x, Not([1, 2]) # use idx instead of symbol | |
stack(DataFrame(rand(3,2), :auto)) | |
# long to wide | |
unstack(y, :id2, :variable, :value) | |
unstack(y, renamecols=n -> string("unstacked_", n)) | |
# unable to unstack when no key col present |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Downloads | |
using Chain | |
ENV["LINES"] = 20 | |
# Data preprocessing | |
Downloads.download("http:...csv", "participation.csv") | |
readlines("participation") | |
df_raw = CSV.read("participation.csv", DataFrame) | |
de = select(df_raw, | |
:lfp => x -> recode(x, "yes" => 1, "no" => 0), # Labor Force Participation | |
:lnnlinc, | |
:age, | |
:age => ByRow(x -> x^2) => :age^2, | |
Between(:educ, :noc), | |
:foreign => categorical, | |
renamecols=false) | |
# exploratory analysis | |
@chain df begin | |
groupby(:lfp) | |
combine(names(df, Real) .=> mean) # shorter than | |
# combine([:lnnlinc, :age, :educ, :nyc, :noc] .=> mean) | |
end | |
@chain df begin | |
groupby([:lfp, :foreign]) | |
combine(nrow) | |
unstack(:lfp, :foreighn, :nrow) # into pivot table | |
select(:lfp, [:no, :yes] => ByRow((x,y) -> y / (x+y)) => :foreign_yes) # or shorter: | |
combine(:foreign => (x -> mean(x .== "yes")) => :foreign_yes) | |
end | |
gd = groupby(df, :lfp) | |
gd[(lfp=0,)] # select by val, not position | |
gd[Dict(:lfp => 0)] | |
gd[(0,)] | |
using StatsPlots | |
@df df density(:age, group=:lfp) | |
# Build a predictive model | |
using GLM | |
probit = glm(Term(:lfp) ~ sum(Term.(propertynames(df)[2:end])), | |
df, Binomial(), ProbitLink()) # or | |
# @formula(lfp ~ linnlinc + age + age^2 + educ + nyc + noc + foreign) | |
df_pred = DataFrame(linnlinc=10.0, age=2.0:0.01:6.2, educ=9, nyc=0, noc=1, foreign="yes") # modify :age while keep other const | |
probit_pred = predict(probit, df_pred, interval=:confidence) | |
plot(df_pred.age, Matrix(probit_pred)...) | |
# advanced functionalities: bootstrapping | |
function boot_sample(df) | |
df_boot = df[rand(1:nrow(df), nrow(df)), :] | |
probit_boot = glm(@formula(lfp ~), | |
df_boot, Binomial()...) | |
return (; (Symbol.(coefnames(probit_boot)) .=> coef(probit_boot))...) | |
end | |
function run_boot(df, reps) | |
coef_boot = DataFrame() | |
for _ in 1:reps | |
push!(coef_boot, boot_sample(df)) | |
end | |
return coef_boot | |
end | |
conf_boot = mapcols(x -> quantile(x, [0.025, 0.975]), coef_boot) | |
confint(probit) | |
conf_param = DataFrame(permutedims(confint(probint)), names(conf_boot)) | |
append!(conf_boot, conf_param) | |
insertcols!(conf_boot, 1, :statistic => ["", "", ]) | |
conf_boot_t = permutedims(conf_boot, :statistic) | |
select!(conf_boot_t, :statistic, :estimate, 3:6 .=> x -> abs(x .- conf_boot_t.estimate), renamecols=false) | |
scatter(0.05 .+ (1:8), conf_boot_t.estimate, yerror=(conf_boot_t."boot lo", conf_boot_t."boot hi")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment