Created
September 13, 2022 14:49
-
-
Save Patrikios/f2803c02e4ebd1a5df0f6e4cf8bc0e40 to your computer and use it in GitHub Desktop.
A short excursion into the DataFrames.jl API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using DataFrames

# create a data frame (column-wise constructor)
df = DataFrame(A=1:4, B=["M", "N", "O", "P"])
df[!, :A] # no copy - direct reference to the column
df[:, :A] # a copy of the column is returned

# create a data frame row by row - not performant, since DataFrames
# (like data.frames in R) are stored column-wise
df2 = DataFrame(A=Int[], B=String[])
push!(df2, (1, "M"));
push!(df2, [2, "N"])
push!(df2, Dict(:B => "F", :A => 3))

# column names as strings
names(df)
names(df, r"A")            # names matching a regex
names(df, Int)             # names of columns whose eltype is Int
names(df, Int32) |> length # 0 matches

# column names as Symbols
propertynames(df)

# summarise data
describe(df)

# first & last rows
first(df)    # first row (a DataFrameRow)
first(df, 2) # first two rows (a DataFrame)
last(df)
# using factor-like (categorical) arrays as columns
using CategoricalArrays
DataFrame(a=1:2,
    b=[1.0, missing],
    c=categorical('a':'b'),
    d=[1 // 2, missing])

# typeof
DataFrame == typeof(df) # true

# copy a data frame (shallow - columns are copied but not recursively)
df2 = copy(df2)

# DataFrames edits happen in place
x = [3, 1, 2];
df = DataFrame(x=x) # copies x, no direct reference to the object x
sort!(df)
df.x[1] = 100
df
x # untouched - df holds its own copy of the data
# subsetting
# sample data for the indexing examples (6 rows, columns :A and :B)
df = DataFrame(A=1:6, B=repeat(["M", "N"], 3))
df[1:2, :] # NOTE(review): the gist had `df[1:2, !]`, but `!` is not a valid column selector
df[1:3, :]
eachcol(df)[1]          # first column
df[[1, 3, 5], [:B, :A]] # selected rows and columns (reordered)
df[[1, 3, 5], [:A]]     # DataFrame with a single column
df[[1, 3, 5], :A]       # DataFrame type is dropped, a vector is returned
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
df[df.A .> 500, :]
df[(df.A .> 500) .& (300 .< df.C .< 400), :]
# where a specific subset of values needs to be matched, the in() function can
# be applied; Ref treats the referenced value as a scalar during broadcasting
df[in.(df.A, Ref([1, 5, 601])), :]
df[in([1, 5, 601]).(df.A), :] # same as above
# selecting cols
df = DataFrame(x1=1, x2=2, y=3)
df[!, r"x"]             # columns matching a regex
df[!, Not(:x1)]         # drop the :x1 column
df[:, Between(:x1, :y)] # keep columns between :x1 and :y
df[:, All()]            # keep all columns
df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x"
df = DataFrame(r=1, x1=2, x2=3, y=4)
df[:, Cols(r"x", :)]      # matching columns first, the remaining ones after
df[:, Cols(Not(r"x"), :)]

# views into a data frame
# sample data for the view examples (the previous df has no :A/:B columns)
df = DataFrame(A=1:4, B=["M", "N", "O", "P"])
@view df[1:3, :A]
# the following return a direct reference (no copy):
df[!, :A]         # when ! is placed in the first indexing position
df.A              # when using . (getproperty)
df[!, [:A, :B]]
df[1, [:A, :B]]   # when a single row is selected using an integer (DataFrameRow)
@view df[1:3, :A] # when view or @view is used
# function 'subset'
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
subset(df, :A => a -> a .< 10, :C => c -> iseven.(c))
df = DataFrame(x=[1, 2, missing, 4])
subset(df, :x => i -> [false, true, true, false])
subset(df, :x => v -> coalesce.(iseven.(v), false))  # treat missing as false
subset(df, :x => x -> iseven.(x), skipmissing=true)  # same as above

# function 'select'
# select always returns a data frame
df = DataFrame(x1=[1, 2], x2=[3, 4], y=[5, 6])
select(df, :x1)
select(df, Not(:x1))
select(df, r"x")                   # select columns whose name contains an 'x'
select(df, :x1 => :a1, :x2 => :a2) # rename columns
select(df, :x1, :x2 => (x -> x .- minimum(x)) => :x2_new) # transform a column
select(df, :x2, :x2 => ByRow(sqrt))                       # transform a column row by row
select(df, AsTable(:) => ByRow(extrema) => [:lo, :hi])    # spread a result over two columns
df2 = select(df, :x1, copycols=false) # direct reference, no copy
df2.x1 === df.x1                      # same object
select!(df, Not(:x1));                # in-place selection
print(df)
# functions 'transform'/'transform!'
import Random
import FreqTables # freqtable & prop
using Statistics # mean
Random.seed!(1)
df = DataFrame(rand(100000, 3), [:a, :b, :c])
transform!(df, AsTable(:) => ByRow(argmax) => :prediction)
tbl = df.prediction |> FreqTables.freqtable
tbl |> FreqTables.prop
df = DataFrame(x=[1, 2, missing], y=[1, missing, missing])
# broadcasting the pair operators builds three transformations at once
transform(df, AsTable(:) .=>
              ByRow.([sum ∘ skipmissing,
                      x -> count(!ismissing, x),
                      mean ∘ skipmissing]) .=> [:sum, :n, :mean])

# function 'combine'
# apply a function to each column
df = DataFrame(A=1:4, B=4.0:-1.0:1.0)
combine(df, names(df) .=> sum)
combine(df, names(df) .=> sum, names(df) .=> prod)
select(df, names(df) .=> sum, names(df) .=> prod) # select retains the number of rows
# replacing data
df = DataFrame(a=["a", "None", "b", "None"], b=1:4,
    c=["None", "j", "k", "h"], d=["x", "y", "None", "z"])
replace!(df.a, "None" => "c") # in-place replacement within a single column
# replacement on a subset of columns [:c, :d]
df[:, [:c, :d]] .= ifelse.(df[!, [:c, :d]] .== "None", "c", df[!, [:c, :d]])
# replacement on the entire data frame
# note: changing .= to just = would allocate new column vectors instead of
# applying the operation in-place
df .= ifelse.(df .== "c", "None", df)
# when replacing values with missing and the columns do not already allow
# missing values, either avoid the in-place operation (use = instead of .=) ...
df2 = ifelse.(df .== "None", missing, df)
# ... or call allowmissing! beforehand and then operate in-place
allowmissing!(df)
df .= ifelse.(df .== "None", missing, df)
# reading data into a data frame from CSV
# package CSV # load latency might be high, hence alternatives:
# stdlib module DelimitedFiles
# other utilities: Arrow.jl, Feather.jl, Avro.jl, JSONTables.jl, Parquet.jl, StatFiles.jl, XLSX.jl, ClipData.jl
using CSV
df = DataFrame(x=1, y=2)
output = "dummy_data.csv"
input = output
CSV.write(output, df)
@assert "dummy_data.csv" in readdir()
DataFrame(CSV.File(input))
using DelimitedFiles
# read the iris sample data shipped with DataFrames.jl
data, header = readdlm(joinpath(dirname(pathof(DataFrames)),
        "..", "docs", "src", "assets", "iris.csv"),
    ',', header=true)
iris_raw = DataFrame(data, vec(header))
iris = identity.(iris_raw) # broadcasting identity narrows each column's eltype
eltype.(eachcol(iris_raw))
eltype.(eachcol(iris))
writedlm("test.csv", Iterators.flatten(([names(iris)], eachrow(iris))), ',')
# joins
people = DataFrame(ID=[20, 40], Name=["John Doe", "Jane Doe"])
jobs = DataFrame(ID=[20, 60], Job=["Lawyer", "Astronaut"])
innerjoin(people, jobs, on=:ID)
leftjoin(people, jobs, on=:ID)
rightjoin(people, jobs, on=:ID)
outerjoin(people, jobs, on=:ID)
semijoin(people, jobs, on=:ID)
antijoin(people, jobs, on=:ID)
crossjoin(people, jobs, makeunique=true) # cartesian product (https://en.wikipedia.org/wiki/Join_(SQL))
a = DataFrame(City=["Amsterdam", "London", "London", "New York", "New York"],
    Job=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
    Category=[1, 2, 3, 4, 5])
b = DataFrame(Location=["Amsterdam", "London", "London", "New York", "New York"],
    Work=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
    Name=["a", "b", "c", "d", "e"])
innerjoin(a, b, on=[:City => :Location, :Job => :Work]) # join on differently named key columns
a = DataFrame(ID=[20, 40], Name=["John", "Jane"])
b = DataFrame(ID=[20, 60], Job=["Lawyer", "Doctor"])
# validate=(left, right) checks that the merge keys are unique on each side
# (the gist joined on :City/:Work here, which no longer exist after the
# redefinition of a and b above - join on :ID instead)
innerjoin(a, b, on=:ID, validate=(true, true))
# the source keyword adds a column indicating whether the given row appeared
# only in the left, only in the right, or in both data frames
outerjoin(a, b, on=:ID, validate=(true, true), source=:source)
# split-apply-combine
using DataFrames, CSV, Statistics
iris = CSV.read(joinpath(dirname(pathof(DataFrames)),
        "..", "docs", "src", "assets", "iris.csv"),
    DataFrame)
gdf = groupby(iris, :Species) # GroupedDataFrame
combine(gdf, :PetalLength => mean)
combine(gdf, nrow)
combine(gdf, nrow, :PetalLength => mean => :mean)
# multiple source columns are passed as separate positional arguments
combine(gdf,
    [:PetalLength, :SepalLength] => ((p, s) -> (a = mean(p) / mean(s), b = sum(p))) =>
        AsTable)
# AsTable passes the selected columns as a single NamedTuple argument
combine(gdf,
    AsTable([:PetalLength, :SepalLength]) =>
        x -> std(x.PetalLength) / std(x.SepalLength))
combine(x -> std(x.PetalLength) / std(x.SepalLength), gdf)
combine(gdf, 1:2 => cor, nrow)
combine(gdf, :PetalLength => (x -> [extrema(x)]) => [:min, :max])
# compare to select, which keeps the original number and order of rows
select(gdf, 1:2 => cor)
transform(gdf, :Species => (x -> chop.(x, head=5, tail=0)) => :New)
# do block syntax (not performant)
combine(gdf) do df
    (m = mean(df.PetalLength), s² = var(df.PetalLength))
end
# iterating over groups
for subdf in groupby(iris, :Species)
    println(size(subdf, 1))
end
for (key, subdf) in pairs(groupby(iris, :Species))
    println("Number of data points for $(key.Species): $(nrow(subdf))")
end
df = DataFrame(g=repeat(1:1000, inner=5), x=1:5000)
gdf = groupby(df, :g)
gdf[(g = 500,)]       # look up a single group by its key value
gdf[[(500,), (501,)]] # look up several groups at once
# apply a function to every non-grouping variable
gd = groupby(iris, :Species)
combine(gd, valuecols(gd) .=> mean)
# a GroupedDataFrame is a view and its parent is expected to stay unchanged:
# pushing to the parent data frame and then accessing the groups raises an error
df = DataFrame(id=1:2)
gd = groupby(df, :id)
push!(df, [3])
gd[1] # NOTE: this line is expected to error (demonstration)
# grouping on a view of the data frame allows appending rows to the source
# data frame of a GroupedDataFrame
df = DataFrame(id=1:2)
gd = groupby(view(df, :, :), :id)
push!(df, [3])
gd[1]
# working with a data frame @view
df = DataFrame(a=1:5)
sdf = @view df[2:3, :]
transform(sdf, :a => ByRow(string))  # creates a new data frame
transform!(sdf, :a => ByRow(string)) # updates the source df in-place
show(df)
select!(sdf, :a => -, renamecols=false)
show(df) # the column replaced an existing one; previously stored values are re-used in filtered-out rows
# the same applies to a GroupedDataFrame
df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=1:6)
sdf = @view df[2:4, :]
gsdf = groupby(sdf, :a)
transform(gsdf, nrow) # creates a new data frame
transform!(gsdf, nrow, :b => :b_copy)
show(df)
select!(gsdf, :b_copy, :b => sum, renamecols=false)
show(df)
# reshape
# stack: wide to long format
iris = CSV.read(joinpath(dirname(pathof(DataFrames)), "..", "docs", "src", "assets", "iris.csv"), DataFrame)
stack(iris, 1:4)
stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
stack(iris, [:SepalLength, :SepalWidth], :Species)
stack(iris, Not(:Species))
iris.id = 1:size(iris, 1)
longdf = stack(iris, Not([:Species, :id]))
# unstack: long back to wide format
unstack(longdf, :id, :variable, :value)
unstack(longdf, [:id, :Species], :variable, :value)
unstack(longdf, :variable, :value)
unstack(longdf) # defaults for the variable and value columns are determined automatically
# passing view=true returns a data frame whose columns are views into the
# original wide data frame; this saves memory
stack(iris, view=true)
# stack and aggregation
using Statistics
d = stack(iris, Not(:Species))
x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum)
first(unstack(x, :Species, :vsum), 6)
# transposing a data frame
df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
permutedims(df1, 1) # performs type promotion across the transposed columns
# sorting
iris = CSV.read(joinpath(dirname(pathof(DataFrames)), "..", "docs", "src", "assets", "iris.csv"), DataFrame)
sort!(iris)           # lexicographic sort on all columns
sort!(iris, rev=true) # descending
sort!(iris, [:Species, :SepalWidth])
# order() allows per-column sort settings (custom key, direction)
sort!(iris, [order(:Species, by=length), order(:SepalLength, rev=true)])
sort!(iris, [:Species, :PetalLength], rev=[true, false])
sort!(iris, [order(:Species, rev=true), :PetalLength])
# working with categorical data in DataFrames
using CategoricalArrays
v = ["Group A", "Group A", "Group A", "Group B", "Group B", "Group B"]
cv = categorical(v)
cv = categorical(["Group A", missing, # CategoricalVectors support missing values
    "Group A", "Group B",
    "Group B", missing])
levels(cv)
sort(cv)
# change the order of the levels - useful for display purposes
# or when working with ordered variables
levels!(cv, ["Group B", "Group A"])
sort(cv)
cv = compress(cv) # use the smallest reference type able to hold the levels (saves memory)
cv1 = categorical(["A", "B"], compress=true) # compress already at construction time
cv2 = categorical(["A", "B"], ordered=true)  # ordered factor
cv1[1] < cv1[2] # unordered values are not comparable, but:
cv2[1] < cv2[2] # ordered values are
isordered(cv1)
ordered!(cv1, true) # make an existing vector ordered
isordered(cv1)
# missingness
missing             # the singleton value representing a statistically missing value
# missings constructs Vectors and Arrays supporting missing values, using the
# optional first argument to specify the element type
missings(Int, 2, 3)
missings(3)
typeof(missing)     # Missing
x = [1, 2, missing]
eltype(x)
Union{Missing, Int}
skipmissing(x)          # lazy iterator over the non-missing values
collect(skipmissing(x))
coalesce.(x, 0)         # replace missing by 0 element-wise
df = DataFrame(i=1:5,
    x=[missing, 4, missing, 2, 1],
    y=[missing, missing, "c", "d", "e"])
dropmissing(df)     # drop rows with missing in any column
dropmissing(df, :x) # drop rows with missing in :x only
dropmissing(df, disallowmissing=true) # additionally narrow the column eltypes
df = DataFrame(x=1:3, y=4:6)
allowmissing!(df)
df[1, 1] = missing
disallowmissing(df, :y)
disallowmissing(df, error=false) # leave columns still containing missing untouched
using Missings
# passmissing is a higher-order function: the wrapper returns missing whenever
# any positional argument is missing, and otherwise applies the wrapped function
passmissing(uppercase)("a")
passmissing(uppercase)(missing)
# Missings.replace returns an iterator replacing missing elements with another value
Missings.replace(x, 1) |> collect
collect(Missings.replace(x, 1)) == coalesce.(x, 1)
eltype(x)
nonmissingtype(eltype(x)) # strip Missing from a type union
parentmodule(nonmissingtype)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment