memo for DataFrames.jl
# split-apply-combine
gx = groupby(x, [:id1, :id2])
parent(gx)
vcat(gx...) # back to df # or
DataFrame(gx, keepkeys=false)
groupcols(gx)
valuecols(gx)
groupindices(gx)
k = keys(gx)[1]
ntk = NamedTuple(k)
tk = Tuple(k)
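# a GroupedDataFrame can also be indexed by a key object (sketch, assuming gx from above)
gx[k] # first group as a SubDataFrame
gx[ntk] # same group selected by key value # or gx[tk]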
## handle missing
groupby(x, :id) # rows with missing :id form the last group (use skipmissing=true to drop them)
# combine, select, transform
x.id2 = axes(x, 1) # add a row-number col
@chain x begin
    groupby(:id)
    combine(:id2, :v => mean => :res) # :id2 kept from source
end
# or several aggregations at once:
combine(groupby(x, :id), :v => mean => :res1, :v => sum => :res2, nrow => :n) # src => func => target
combine(groupby(x, :id)) do sdf # sub df
    n = nrow(sdf)
    n < 25 ? DataFrame() : DataFrame(n=n)
end
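# the same result sketched with nrow + a Pair-predicate filter on the combined table
res25 = combine(groupby(x, :id), nrow => :n)
filter(:n => >=(25), res25)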
## produce multiple cols in 1 operation
combine(:val => (v -> (min=minimum(v), max=maximum(v))) => AsTable) # col names taken from the NamedTuple
combine(:val => (v -> (min=minimum(v), max=maximum(v))) => [:c1, :c2]) # custom col names
## unnest
df = DataFrame(a=[(p=1, q=2), (p=3, q=4)])
select(df, :a => AsTable)
## broadcast
combine([:x1, :x2] .=> [minimum maximum])
# aggregate using mapcols
mapcols(mean, x)
# map rows & cols using eachcol and eachrow
map(mean, eachcol(x))
foreach(c -> println(c[1], ": ", mean(c[2])), pairs(eachcol(x)))
map(r -> r.x1 / r.x2, eachrow(x))
er = eachrow(x) # & ec = eachcol(x)
er.x1
# transpose
df.names = ["a", "b", "c"]
permutedims(df, :names)
using FreqTables
df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000))
ft = freqtable(df, :a, :b)
prop(ft, margins=1) # proportions within rows; margins=2 for columns
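# overall and column-wise proportions (sketch using the same FreqTables API)
prop(ft) # whole-table proportions
prop(ft, margins=2) # proportions within columns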
using DataFramesMeta
df = DataFrame(x=1:8, y='a':'h', z=repeat([true, false], outer=4))
@with(df, :x + :z) # create hard scope so vars do not leak out
@subset(df, :x .< 4, :z .== true) # filtering
@select(df, :x, :y = 2 * :x, :z = :y) # create new df
@transform(df, :x = 2 * :x, :y = :x) # create new df adding cols based on the old ones
@orderby(df, :z, -:x) # sort into new df, less powerful than sort but lightweight
using Chain
@chain df begin
    @subset(:x .< 5)
    @orderby(:z)
    @transform(:x² = :x .^ 2)
    @select(:z, :x, :x²)
end
# grouped df
df = DataFrame(a=1:8, b=repeat(1:4, outer=2)) # example df with numeric :a and :b for the examples below
g = groupby(df, :b)
@by(df, :b, :first=first(:a), :last=last(:a), :mean=mean(:a)) # or
@combine(g, ...) # or
combine(g, :a .=> [first, last, mean])
@transform(g, :center = mean(:a), :centered = :a .- mean(:a))
DataFrame(g) # ordered by :b
@transform(g) # orig order
# rowwise operations
@transform(df, :x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no"))
myfun(a, b) = a > 6 && b == 4 ? "yes" : "no"
@transform(df, :x = myfun.(:a, :b)) # or
@eachrow df begin
    @newcol x::Vector{String}
    :x = :a > 6 && :b == 4 ? "yes" : "no"
end
# in df.jl:
transform(df, [:a, :b] => ByRow((a,b) -> ifelse(a > 6 && b == 4, "yes", "no")) => :x)
# constructors
DataFrame(A=1:3, C=randstring.([3, 3, 3]), fixed=1) # randstring is from the Random stdlib
x = Dict("A" => [1,2], "fixed" => Ref([1,1])) # Ref to protect a vector from being expanded into rows
DataFrame(x)
DataFrame(:B => [true, false]) # symbols preferred (faster than strings)
DataFrame("B" => [true, false])
DataFrame([:A => [1,2], :C => ['a','b'], :fixed => "const"]) # a vector of pairs also works
DataFrame([rand(3) for i in 1:3], [:x1, :x2, :x3]) # or ["x1", "x2", "x3"]
DataFrame(permutedims([1, 2, 3]), :auto) # auto generate col names
v = [(a=1, b=2), (a=3, b=4)] # NamedTuples
DataFrame(v)
# or
n = (a=1:3, b=11:13)
DataFrame(n)
DataFrame(rand(3,4), Symbol.('a':'d')) # from matrix, or string.('a':'d')
DataFrame(A=Int[], B=Float64[], C=String[]) # predefined cols & types
x = DataFrame(a=1:2, b='a':'b')
y = copy(x) # or y = DataFrame(x) # copycols=false -> (x.a === y.a) is true
(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a)
# (false, true, true, false) # === is false for distinct objects
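# with copycols=false the columns are shared rather than copied (sketch based on the note above)
y2 = DataFrame(x, copycols=false)
(x === y2), (x.a === y2.a) # (false, true)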
similar(x, 0) # 2nd arg: number of rows (here an empty df with the same col names & types)
sdf = view(x, [1,1], :) # SubDataFrame
DataFrame(sdf)
dfr = x[1,:] # DataFrameRow
DataFrame(dfr)
Matrix(x) # or Array(x)
# Matrix{Union{Missing, Int64}} contains `missing`
ct = Tables.columntable(x) # convert df to NamedTuples of vectors
rt = Tables.rowtable(x) # convert df to vector of NamedTuples
DataFrame(ct) # convert back
ec = eachcol(x) # vector-like but not an AbstractVector
ec[1] # or ec[:a], ec["a"]
er = eachrow(x) # is AbstractVector
er[end]
# lazy iterator is type stable
nti = Tables.namedtupleiterator(x)
for row in enumerate(nti)
    @show row
end
DataFrame(nti) # convert back
df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true)
# `nothing` not displayed in ipynb
empty(df) # or empty!() to remove all rows from df
x = DataFrame(A=[1, 2], B=[1.0, missing], C=["a", "b"])
# dims of df
size(x) # or
nrow(x), ncol(x) # (2,3)
size(x, 1), size(x, 2)
# 2, 3
describe(x, cols=1:2)
names(x)
# "A", "B", "C"
names(x, String) # get col name with given `eltype`
# "C"
propertynames(x)
# :A, :B, :C # vec of `Symbols`
eltype.(eachcol(x))
y = DataFrame(rand(1:10, 1000, 10), :auto)
first(y, 5) # first 5 rows
last(y, 3)
first(y) # or last(y) # only 1 row
# display certain nums of rows & cols
ENV["LINES"] = 10
ENV["COLUMNS"] = 200
x.A, x[!, 1], x[!, :A] # without copy
x."A", x[!,"A"]
x[:, 1] # `:` creates a copy, so it is safe & preferred except in performance-critical code
x[:, 1] === x[:, 1] # false
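# by contrast, `!` returns the stored column itself, so no copy is made
x[!, 1] === x[!, 1] # true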
x[1:1, :] # grab 1 row as a df
x[1, :] # produces a DataFrameRow, similar to a NamedTuple
x[Not(1), r"A"] # use Regex to select col and `Not` from InvertedIndices.jl
x[!, Not(1)] # `!` means no copy, so mutating the result can corrupt x
x[1:2, 1:2] .= 1 # can also broadcast a vector ([1, 2]) or assign a df: x[1:2, 1:2] = DataFrame([5 6; 7 8], [:A, :B])
x[:, Cols( "x1", Between(:x2, :x4) )]
@view x[1:2, 1:2]
df = DataFrame()
x = [1, 2, 3]
df.a = x # add new col to df
df.a === x # true # no copy
df[!, :b] = x # no copy
df[:, :c] = x # copy; with .= both `!` and `:` allocate a fresh vector for a new col
df.c === x # false # copy
df.a === df.b === x # true # no copy
# aliasing can lead to silent errors:
for (n, c) in pairs( eachcol(df) )
    println("$n: ", pop!(c))
end
# a: 3, b: 2, c: 3 # :a and :b share one vector, so pop! on :a already removed the 3 seen by :b
collect( pairs(eachcol(df)) )
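# to avoid such aliasing, let the constructor copy the columns (sketch)
df_safe = DataFrame(a=x, b=x) # the keyword constructor copies by default
df_safe.a === df_safe.b # false # independent vectors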
df2 = copy(df)
df == df2 # comparison true
df3 = df2 .+ eps()
df == df3 # false
isapprox(df, df3) # true
isapprox(df, df3, atol=eps()/2) # false
df = DataFrame(a=missing)
df == df # missing
df === df # or isequal(df, df) # true
ismissing(1), ismissing(missing)
missing == missing, missing != missing, missing < missing # or
1 == missing, 1 != missing, 1 < missing
# (missing, missing, missing)
isequal(missing, missing), missing === missing, isless(1, missing)
# true, true, true # in sorting order, missing is greater than any number
collect(skipmissing([1, missing, 2, missing]))
# [1, 2]
replace([1.0, missing, 2.0, missing], missing=>NaN) # or
coalesce.([1.0, missing, 2.0, missing], NaN)
# or
using CategoricalArrays
recode([1.0, missing, 2.0, missing], false, missing=>true)
# [0, 1, 0, 1]
df = DataFrame(a=[1,2,missing], b=["a","b",missing])
replace!(df.a, missing=>100)
df.b = coalesce.(df.b, 100) # type changed to Any
# get unique values with or without missings
unique([1, missing, 2, missing]), levels([1, missing, 2, missing])
x = [1, 2, 3]
y = allowmissing(x)
z = disallowmissing(y) # would fail if y contained missing
df = allowmissing(DataFrame(ones(2,3), :auto))
df[1, 1] = missing
disallowmissing(df) # throw error
disallowmissing(df, error=false)
# after using allowmissing!
# eltype.(eachcol(x)) = Union{Missing, Int64}
x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"])
completecases(x)
# Bool[1, 0, 0, 1]
y = dropmissing(x)
dropmissing!(x)
# or
dropmissing!(x, disallowmissing=false)
string(missing) # "missing"
lift_string = passmissing(string)
lift_string(missing) # missing
lift_string(missing, " ", missing) # missing
df = DataFrame(a=[1,missing,missing], b=[1,2,missing])
sum.(skipmissing.(eachrow(df))) # eachrow is type unstable, so use
sum.(skipmissing.(Tables.namedtupleiterator(df)))
x = DataFrame(A=[true, false, true], B=[1, 2, missing],
C=[missing, "b", "c"], D=['a', missing, 'c'])
eltype.(eachcol(x))
CSV.write("x1.csv", x)
print(read("x1.csv", String))
y = CSV.read("x1.csv", DataFrame)
# type of col :D changed to a string type
## Serialization
### Way 1
using Serialization
open("x.bin", "w") do io
    serialize(io, x)
end
y = open(deserialize, "x.bin")
### Way 2
JDF.save("x.jdf", x)
x_loaded = JDF.load("x.jdf") |> DataFrame
isequal(x_loaded, x) # true
# JDF can load certain cols to help work with large files
x_ondisk = jdf"x.jdf"
names(x_ondisk) # :A :B :C :D
xd = JDF.load(x_ondisk; cols=["A", "D"]) |> DataFrame
### Way 3
JLSO.save("x.jlso", :data => x)
y = JLSO.load("x.jlso")[:data]
# JSON (via JSONTables.jl)
open(io -> arraytable(io, x), "x1.json", "w")
open(io -> objecttable(io, x), "x2.json", "w")
print(read("x1.json", String))
y1 = open(jsontable, "x1.json") |> DataFrame
# Apache Arrow format for interchange with R/Py
Arrow.write("x.arrow", x)
y = Arrow.Table("x.arrow") |> DataFrame
y.A[1] = false # ReadOnlyMemoryError() # cols of y are immutable
y.A # Arrow.BoolVector{Bool} # custom vector type
y.B # Arrow.Primitive{Union{Missing, Int64}, Vector{Int64}}
y2 = copy(y) # get standard Julia Base vectors
# gzip compression
df = DataFrame(rand(1:10, 10, 1000), :auto)
# CodecZlib.jl
open("df_compress_test.csv.gz", "w") do io
stream = GzipCompressorStream(io)
CSV.write(stream, df)
close(stream)
end
df2 = CSV.File(transcode(GzipDecompressor, Mmap.mmap("df_compress_test.csv.gz"))) |> DataFrame
# ZipFile.jl
w = ZipFile.Writer("x.zip")
f1 = ZipFile.addfile(w, "x1.csv")
write(f1, sprint(show, "text/csv", x)) # write the csv content into the archive entry
close(w)
z = ZipFile.Reader("x.zip")
index_xcsv = findfirst(f -> f.name == "x1.csv", z.files)
df1_2 = CSV.read(read(z.files[index_xcsv]), DataFrame)
close(z)
# rename
x = DataFrame(rand(Bool, 3, 4), :auto)
rename(x, :x1 => :A)
rename!(c -> c^2, x) # apply a func to every col name (string^2 repeats it)
rename(x, 3 => :third) # change 3rd col
rename!(x, [:a, :b, :c, :d])
rename!(x, string.('a':'d'))
rename!(x, "a"=>"d", "d"=>"a") # allow circular renaming
rename(x, fill(:a, 4), makeunique=true)
# reorder cols
using Random
Random.seed!(1234)
x[:, shuffle(names(x))]
select!(x, 4:-1:1)
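# handy idiom: move one col to the front, keeping the rest in order (sketch)
select(x, :d, :) # `:` appends the remaining cols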
# merge/add
x = DataFrame([(i,j) for i in 1:3, j in 1:4], :auto)
hcat(x, x, makeunique=true)
y = hcat(x, [1,2,3], makeunique=true)
hcat([1,2,3], x, makeunique=true) # or
y = [x DataFrame(A=[1,2,3])]
insertcols!(y, 2, "newcol"=>[1,2,3])
insertcols!(y, 2, :newcol=>[1,2,3], makeunique=true)
insertcols!(x, :A => [1,2,3]) # index omitted: append as the last col
insertcols!(x, 1, :B => [1,2,3])
df = DataFrame(a = [1,2,3])
insertcols!(df, :b => "x", :c => 'a':'c', :d => Ref([1,2,3])) # scalars/Refs are broadcast if needed
df1 = DataFrame(a=[1,2])
df2 = DataFrame(b=[2,3], c=[3,4])
insertcols!(df1, pairs(eachcol(df2))...)
# subset/remove cols
x = DataFrame([(i, j) for i in 1:3, j in 1:5], :auto) # example df with cols :x1 ... :x5
x[:, [1,2,4,5]]
x[:, [:x1, :x4]] # by col name
x[:, [true, false, true, false, true]]
x[:, [:x1]] # single-col df
using InvertedIndices
x[!, r"[12]"]
x[!, Not(1)]
x[!, Between(:x2, :x4)]
x[!, Cols(:x1, Between(:x3, :x5))] # or
select(x, :x1, Between(:x3, :x5), copycols=false)
df2 = select(df, [1, 2])
select(df, Not([1,2]))
# views
@view x[:, [1,3,5]] # faster
# modify col by name
x[!, :x1] = x[:, :x2]
x[!, :A] = [1, 2, 3]
# find col name
hasproperty(x, :x1) # true
columnindex(x, :x2)
# advanced col selection
df."col 2"
for n in names(df) # interpolate col name by :()
    println(n, "\n", df.:($n), "\n")
end
# work on collection of cols
df = DataFrame(reshape(1:12, 3, 4), :auto)
ec_df = eachcol(df) # DataFrameColumns
ec_df[1] # or
ec_df.x1
# transform
transform(df, [:x1, :x2] => (+) => :res) # src_col => func_to_apply => target_col
using Statistics
combine(df, [:x1, :x2] => cor) # auto-named col, 1 row
select(df, [:x1, :x2] => cor) # same value broadcast to all 3 rows
select(df, :x1, :x2, [:x1, :x2] => ByRow(string)) # apply func on each row
select(df, :x1, :x2, AsTable([:x1, :x2]) => x -> x.x1 + x.x2)
select(df, :x1, nrow => "number_of_rows")
select(df, :x1, :x1 => ByRow(x -> [x ^ 2, x ^ 3]) => ["x1²", "x1³"]) # create multi cols with 1 func # or
select(df, :x1, :x1 => (x -> DataFrame("x1²" => x .^ 2, "x1³" => x .^ 3)) => AsTable)
# select rows
sdf = view(df, 1:3, 1:3) # SubDataFrame without copy
parent(sdf), parentindices(sdf)
dfr = df[3, :] # DataFrameRow
rownumber(dfr)
df[!, :Z] .= 1
df = DataFrame(a=1:4)
dfv = view(df, [3,2], :)
dfr = dfv[1, :]
# reorder
x = DataFrame(id=1:12, x=rand(12), y=[zeros(6); ones(6)])
issorted(x), issorted(x, :x) # true, false
sort!(x, :x) # in place
y = sort(x, :id) # create new df
sort(x, [:y, :x], rev=[true, false]) # sort by 2 cols, 1st decr, 2nd incr # or
sort(x, [order(:y, rev=true), :x])
sort(x, [order(:y, rev=true), order(:x, by=v -> -v)]) # `by` transforms values before comparison, so this is like rev=true on :x
x[shuffle(axes(x, 1)), :] # reorder rows randomly
sort!(x, :id)
x[[1,10], :] .= x[[10,1], :] # swap rows
# merge/add rows
x = DataFrame(rand(3, 5), :auto) # example df with cols :x1 ... :x5
[x; x] # merge by rows
reduce(vcat, [x, x, x]) # efficient
y = x[:, reverse(names(x))]
vcat(x, y)
vcat(x, y[:, 1:3], cols=:intersect) # or :union
vcat(x, y[:, 1:3], cols=[:x1, :x5])
append!(x, y) # modify in place
repeat(x, 2) # twice rows
push!(x, 1:5) # add 1 row to the end
push!(x, Dict(:x1=>11, :x2=>12, :x3=>13, :x4=>14, :x5=>15))
push!(x, (x2=2, x1=1, x4=4, x3=3, x5=5)) # NamedTuples via name matching
push!(x, x[1, :]) # DataFrameRow
source = [(a=1, b=2), (a=missing, b=10, c=20), (b="s", c=1, d=1)]
df = DataFrame()
for row in source
    push!(df, row, cols=:union)
end
# subset/remove rows
x = DataFrame(id=1:10, val='a':'j')
x[1,:] # a DataFrameRow
x[1:1, :] # a df
view(x, 1:2, :)
x[repeat([true, false], 5), :] # or
view(x, repeat([true, false], 5), :)
delete!(x, 6:7)
x[Not(1:2), :] # create new df
# filter
filter(r -> r.x1 > 2.5, x, view=true) # or
filter(:x1 => >(2.5), x)
filter!(x) do r
    if r.x1 > 2.5
        return r.x2 < 4.5
    end
    return false # the predicate must return a Bool for every row
end
df = DataFrame(x=1:12, y=mod1.(1:12, 4))
filter(row -> row.y in [1,4], df) # or
filter(:y => in([1,4]), df) # or
df[in.(df.y, Ref([1,4])), :]
subset(x, :x1 => x -> x .< mean(x), :x2 => ByRow(<(2.5))) # multi conditions
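# subset also accepts skipmissing=true to treat missing conditions as false (sketch)
subset(x, :x1 => ByRow(>(2.5)), skipmissing=true)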
# deduplicate
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x)
x.C = 1:4
unique(x, [1,2]) # first row for each unique combination of the given cols
unique(x) # whole rows
nonunique(x, :A) # Bool indicators of duplicate rows (based on col :A)
unique!(x, :B)
# extract one row from a df into standard collections
cols = [:y, :z]
Vector(x[1, cols]) # or
Array(x[1, cols])
[Vector(x[i, cols]) for i in axes(x, 1)]
copy(x[1, cols]) # convert a dfr to NamedTuple
Tuple(x[1, cols])
# work with a collection of rows of df
er_df = eachrow(df)
er_df[1]
last(er_df)
er_df[end]
er_df.x1
# flatten a df
df = DataFrame(a='a':'c', b=[[1,2,3], [4,5], 6])
flatten(df, :b)
# only 1 row
df = DataFrame(a=1)
only(df)
df2 = repeat(df, 2)
only(df2) # ArgumentError: df must contain exactly 1 row
innerjoin(x, y, on=:ID=>:id, matchmissing=:equal) # by default missing is not allowed in the `on` cols
leftjoin(x, y, on="ID"=>"id", matchmissing=:equal) # or rightjoin(), which keeps ids of the 2nd table
outerjoin() # keep ids from both tables
semijoin()
antijoin()
crossjoin(DataFrame(x=[1, 2]), DataFrame(y=["a","b","c"]))
innerjoin(x, y, on=[:id1, :id2], matchmissing=:equal)
outerjoin(x, y, on=:id1, makeunique=true, indicator=:source, matchmissing=:equal) # duplicate keys produce all combinations of matching rows
innerjoin(x, y, on=[:id1, :id2=>:ID2])
xs = [DataFrame("id" => 1:6, "v$i" => (1:6) .+ 10i) for i in 1:5] # join more than 2 dfs
innerjoin(xs..., on=:id)
# wide to long
stack(x, [:M1, :M2], :id, variable_name="key", value_name="observed")
stack(x, Not([:id1, :id2])) # or
stack(x, Not([1, 2])) # use indices instead of symbols
stack(DataFrame(rand(3,2), :auto))
# long to wide
unstack(y, :id2, :variable, :value)
unstack(y, renamecols=n -> string("unstacked_", n))
# unable to unstack when no key col present
import Downloads
using Chain
ENV["LINES"] = 20
# Data preprocessing
Downloads.download("http:...csv", "participation.csv")
readlines("participation")
df_raw = CSV.read("participation.csv", DataFrame)
df = select(df_raw,
    :lfp => x -> recode(x, "yes" => 1, "no" => 0), # Labor Force Participation
    :lnnlinc,
    :age,
    :age => ByRow(x -> x^2) => "age^2",
    Between(:educ, :noc),
    :foreign => categorical,
    renamecols=false)
# exploratory analysis
@chain df begin
    groupby(:lfp)
    combine(names(df, Real) .=> mean) # shorter than
    # combine([:lnnlinc, :age, :educ, :nyc, :noc] .=> mean)
end
@chain df begin
    groupby([:lfp, :foreign])
    combine(nrow)
    unstack(:lfp, :foreign, :nrow) # into a pivot table
    select(:lfp, [:no, :yes] => ByRow((x, y) -> y / (x + y)) => :foreign_yes)
end
# or, shorter:
@chain df begin
    groupby(:lfp)
    combine(:foreign => (x -> mean(x .== "yes")) => :foreign_yes)
end
gd = groupby(df, :lfp)
gd[(lfp=0,)] # select by val, not position
gd[Dict(:lfp => 0)]
gd[(0,)]
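# check for a group key before indexing (sketch)
haskey(gd, (lfp=0,)) # true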
using StatsPlots
@df df density(:age, group=:lfp)
# Build a predictive model
using GLM
probit = glm(Term(:lfp) ~ sum(Term.(propertynames(df)[2:end])),
             df, Binomial(), ProbitLink()) # or use a literal formula:
# @formula(lfp ~ lnnlinc + age + age^2 + educ + nyc + noc + foreign)
df_pred = DataFrame(lnnlinc=10.0, age=2.0:0.01:6.2, educ=9, nyc=0, noc=1, foreign="yes") # vary :age, keep others constant
insertcols!(df_pred, "age^2" => df_pred.age .^ 2) # the model also needs the squared-age col
probit_pred = predict(probit, df_pred, interval=:confidence)
plot(df_pred.age, Matrix(probit_pred)) # prediction with lower/upper confidence bounds
# advanced functionalities: bootstrapping
function boot_sample(df)
    df_boot = df[rand(1:nrow(df), nrow(df)), :]
    probit_boot = glm(Term(:lfp) ~ sum(Term.(propertynames(df_boot)[2:end])),
                      df_boot, Binomial(), ProbitLink())
    return (; (Symbol.(coefnames(probit_boot)) .=> coef(probit_boot))...)
end
function run_boot(df, reps)
    coef_boot = DataFrame()
    for _ in 1:reps
        push!(coef_boot, boot_sample(df))
    end
    return coef_boot
end
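# run the bootstrap before summarizing; the replication count here is an arbitrary example choice
coef_boot = run_boot(df, 1000)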
conf_boot = mapcols(x -> quantile(x, [0.025, 0.975]), coef_boot)
confint(probit)
conf_param = DataFrame(permutedims(confint(probit)), names(conf_boot))
append!(conf_boot, conf_param)
insertcols!(conf_boot, 1, :statistic => ["", "", ])
conf_boot_t = permutedims(conf_boot, :statistic)
select!(conf_boot_t, :statistic, :estimate, 3:6 .=> x -> abs(x .- conf_boot_t.estimate), renamecols=false)
scatter(0.05 .+ (1:8), conf_boot_t.estimate, yerror=(conf_boot_t."boot lo", conf_boot_t."boot hi"))