A short excursion through the DataFrames.jl API
using DataFrames
# create a data frame (simple constructor)
df = DataFrame(A=1:4, B=["M", "N", "O", "P"])
df[!, :A] # no copy
df[:, :A] # copy is returned
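# a quick check of the difference above (using the df just defined):
# the ! form returns the stored vector itself, the : form returns a fresh copy
df[!, :A] === df.A # true, same underlying vector
df[:, :A] === df.A # false, a copy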
# create a data frame row by row (not performant: like R data frames, DataFrames are stored columnwise; a columnwise equivalent follows below)
df2 = DataFrame(A=Int[], B=String[])
push!(df2, (1, "M"));
push!(df2, [2, "N"])
push!(df2, Dict(:B => "F", :A => 3))
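# the columnwise equivalent of the push! loop above, usually the preferred and
# faster way when the data is known up front (a minimal sketch; df2_columnwise is an illustrative name)
df2_columnwise = DataFrame(A=[1, 2, 3], B=["M", "N", "F"])
df2_columnwise == df2 # true, same content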
# colnames
names(df)
names(df, r"A")
names(df, Int)
names(df, Int32) |> length # 0 matches
# colnames as symbols
propertynames(df)
# summarise data
describe(df)
# first & last row
first(df)
first(df, 2)
last(df)
# factor-like columns via CategoricalArrays
using CategoricalArrays
DataFrame(a=1:2,
          b=[1.0, missing],
          c=categorical('a':'b'),
          d=[1 // 2, missing])
# typeof
DataFrame == typeof(df) # true
# copy a data frame (the columns are copied, but not the elements they contain)
df2 = copy(df2)
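# a small check of the above (df2_cp is an illustrative name):
df2_cp = copy(df2)
df2_cp.A === df2.A # false, the column vector was copied
df2_cp == df2 # true, identical content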
# in-place edits (sort!, assignment) on a data frame; the constructor copied x, so x stays untouched
x = [3, 1, 2];
df = DataFrame(x=x) # copy of x, not direct reference to object x
sort!(df)
df.x[1] = 100
df
x # untouched
# subsetting
df = DataFrame(A=1:6, B=["M", "N", "O", "P", "Q", "R"]) # rebuild an example frame with the rows and columns used below
df[1:2, :] # rows 1-2, all columns (note: ! is only valid in the row position, not as a column selector)
df[1:3, :]
eachcol(df)[1]
df[[1, 3, 5], [:B, :A]]
df[[1, 3, 5], [:A]]
df[[1, 3, 5], :A] # the DataFrame wrapper is dropped, a plain vector is returned
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
df[df.A .> 500, :]
df[(df.A .> 500) .& (300 .< df.C .< 400), :]
df[in.(df.A, Ref([1, 5, 601])), :] # match a specific set of values with in.(); wrapping the collection in Ref makes broadcasting treat it as a single scalar argument
df[in([1, 5, 601]).(df.A), :] # same as above
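# a minimal illustration of the Ref remark above (toy values, not part of the original):
in.([1, 2, 3], Ref([1, 3])) # true, false, true
# for large lookup sets, wrapping a Set is a common speed-up
df[in.(df.A, Ref(Set([1, 5, 601]))), :]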
# selecting cols
df = DataFrame(x1=1, x2=2, y=3)
df[!, r"x"]
df[!, Not(:x1)] # drop :x1 column
df[:, Between(:x1, :y)] # keep columns between :x1 and :y
df[:, All()] # keep all columns
df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x"
df = DataFrame(r=1, x1=2, x2=3, y=4)
df[:, Cols(r"x", :)]
df[:, Cols(Not(r"x"), :)]
# views into df
@view df[1:3, :A]
# views do not copy; the following forms likewise return direct references to the underlying data:
df[!, :A] # when a ! is placed in the first indexing position
df.A # when using . (getpropery)
df[!, [:A, :B]]
df[1, [:A, :B]] # when a single row is selected using an integer
@view df[1:3, :A] # when view or @view is used
# function 'subset'
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
subset(df, :A => a -> a .< 10, :C => c -> iseven.(c))
df = DataFrame(x=[1, 2, missing, 4])
subset(df, :x => i -> [false, true, true, false])
subset(df, :x => rrr -> coalesce.(iseven.(rrr), false))
subset(df, :x => x -> iseven.(x), skipmissing=true) # same as above
# function 'select'
# select always returns a data frame
df = DataFrame(x1=[1, 2], x2=[3, 4], y=[5, 6])
select(df, :x1)
select(df, Not(:x1))
select(df, r"x") # select columns containing 'x' character
select(df, :x1 => :a1, :x2 => :a2) # rename columns
select(df, :x1, :x2 => (x -> x .- minimum(x)) => :x2_new) # transform columns
select(df, :x2, :x2 => ByRow(sqrt)) # transform columns by row
select(df, AsTable(:) => ByRow(extrema) => [:lo, :hi])
df2 = select(df, :x1, copycols=false) # direct reference
df2.x1 === df.x1 # same object
select!(df, Not(:x1)); # in-place selection
print(df)
# function 'transform'/'transform!'
import Random
import FreqTables # freqtable & prop
using Statistics # mean
Random.seed!(1)
df = DataFrame(rand(100000, 3), [:a, :b, :c])
transform!(df, AsTable(:) => ByRow(argmax) => :prediction)
tbl = df.prediction |> FreqTables.freqtable
tbl |> FreqTables.prop
df = DataFrame(x=[1, 2, missing], y=[1, missing, missing])
transform(df, AsTable(:) .=>
              ByRow.([sum ∘ skipmissing,
                      x -> count(!ismissing, x),
                      mean ∘ skipmissing]) .=> [:sum, :n, :mean])
# function 'combine'
# apply function to each column
df = DataFrame(A=1:4, B=4.0:-1.0:1.0)
combine(df, names(df) .=> sum)
combine(df, names(df) .=> sum, names(df) .=> prod)
select(df, names(df) .=> sum, names(df) .=> prod) # works with select which retains the number of rows
# replacing data
df = DataFrame(a=["a", "None", "b", "None"], b=1:4,
c=["None", "j", "k", "h"], d=["x", "y", "None", "z"])
replace!(df.a, "None" => "c")
# replacement on a subset of columns [:c, :d]
df[:, [:c, :d]] .= ifelse.(df[!, [:c, :d]] .== "None", "c", df[!, [:c, :d]])
# replacement on entire data frame
# Do note that in the above examples, changing .= to just = will allocate new column vectors instead of applying the operation in-place.
df .= ifelse.(df .== "c", "None", df)
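# a small check of the in-place remark above: .= reuses the existing column vectors,
# while a plain assignment allocates new ones (col_a, df3 and the "zzz" placeholder are illustrative only)
col_a = df.a
df .= ifelse.(df .== "zzz", "zzz", df) # a no-op replacement, applied in place
df.a === col_a # true, the same vector was updated
df3 = ifelse.(df .== "zzz", "zzz", df) # allocates a new data frame
df3.a === df.a # false, freshly allocated column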
# When replacing values with missing, if the columns do not already allow for missing values, one has to either avoid in-place operation and use = instead of .=, or call allowmissing! beforehand:
df2 = ifelse.(df .== "None", missing, df)
# or first allow missing values
allowmissing!(df) # operate in-place after allowing for missing
df .= ifelse.(df .== "None", missing, df)
# reading data into a data frame from CSV
# package CSV (compile latency might be high), hence alternatively:
# std module DelimitedFiles
# other utilities: Arrow.jl, Feather.jl, Avro.jl, JSONTables.jl, Parquet.jl, StatFiles.jl, XLSX.jl, ClipData.jl
using CSV
df = DataFrame(x=1, y=2)
output = "dummy_data.csv"
input = output
CSV.write(output, df)
@assert "dummy_data.csv" in readdir()
DataFrame(CSV.File(input))
using DelimitedFiles
data, header = readdlm(joinpath(dirname(pathof(DataFrames)),
"..", "docs", "src", "assets", "iris.csv"),
',', header=true)
iris_raw = DataFrame(data, vec(header))
iris = identity.(iris_raw) # broadcasting identity narrows the element types
eltype.(eachcol(iris_raw))
eltype.(eachcol(iris))
writedlm("test.csv", Iterators.flatten(([names(iris)], eachrow(iris))), ',')
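# reading the file written above back in, to confirm the round trip (data2, header2, iris_back are illustrative names)
data2, header2 = readdlm("test.csv", ',', header=true)
iris_back = identity.(DataFrame(data2, vec(header2)))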
# joins
people = DataFrame(ID=[20, 40], Name=["John Doe", "Jane Doe"])
jobs = DataFrame(ID=[20, 60], Job=["Lawyer", "Astronaut"])
innerjoin(people, jobs, on=:ID)
leftjoin(people, jobs, on=:ID)
rightjoin(people, jobs, on=:ID)
outerjoin(people, jobs, on=:ID)
semijoin(people, jobs, on=:ID)
antijoin(people, jobs, on=:ID)
crossjoin(people, jobs, makeunique=true) # cartesian product (https://en.wikipedia.org/wiki/Join_(SQL))
a = DataFrame(City=["Amsterdam", "London", "London", "New York", "New York"],
Job=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Category=[1, 2, 3, 4, 5])
b = DataFrame(Location=["Amsterdam", "London", "London", "New York", "New York"],
Work=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Name=["a", "b", "c", "d", "e"])
innerjoin(a, b, on=[:City => :Location, :Job => :Work])
a = DataFrame(ID=[20, 40], Name=["John", "Jane"])
b = DataFrame(ID=[20, 60], Job=["Lawyer", "Doctor"])
innerjoin(a, b, on=:ID, validate=(true, true)) # validate that the join keys are unique in the left and right tables
outerjoin(a, b, on=:ID, validate=(true, true), source=:source) # using the source keyword argument you can add a column to the resulting data frame indicating whether the given row appeared only in the left, the right or both data frames
# split-apply-combine
using DataFrames, CSV, Statistics
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),
"..", "docs", "src", "assets", "iris.csv")),
DataFrame)
gdf = groupby(iris, :Species) # GroupedDataFrame
combine(gdf, :PetalLength => mean)
combine(gdf, nrow)
combine(gdf, nrow, :PetalLength => mean => :mean)
combine(gdf,
[:PetalLength, :SepalLength] => ((p, s) -> (a = mean(p) / mean(s), b = sum(p))) =>
AsTable) # multiple columns are passed as arguments
combine(gdf,
AsTable([:PetalLength, :SepalLength]) =>
x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple
combine(x -> std(x.PetalLength) / std(x.SepalLength), gdf)
combine(gdf, 1:2 => cor, nrow)
combine(gdf, :PetalLength => (x -> [extrema(x)]) => [:min, :max])
# compare to select
select(gdf, 1:2 => cor)
transform(gdf, :Species => (x -> chop.(x, head=5, tail=0)) => :New)
# do block (not performant; a faster equivalent follows below)
combine(gdf) do df
(m = mean(df.PetalLength), s² = var(df.PetalLength))
end
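# a (usually faster) equivalent written in the source => function => destination
# mini-language instead of the do block above
combine(gdf, :PetalLength => (x -> (m = mean(x), s² = var(x))) => AsTable)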
# iterating over groups
for subdf in groupby(iris, :Species)
    println(size(subdf, 1))
end
for (key, subdf) in pairs(groupby(iris, :Species))
    println("Number of data points for $(key.Species): $(nrow(subdf))")
end
df = DataFrame(g=repeat(1:1000, inner=5), x=1:5000)
gdf = groupby(df, :g)
gdf[(g = 500,)]
gdf[[(500,), (501,)]]
# apply a function to every non-grouping column
gd = groupby(iris, :Species)
combine(gd, valuecols(gd) .=> mean)
# a GroupedDataFrame is a view and is meant to stay read-only: pushing rows to the parent data frame and then accessing the groups throws an error
df = DataFrame(id=1:2)
gd = groupby(df, :id)
push!(df, [3])
gd[1] # error: the parent data frame was modified after grouping
# grouping on a view of df allows appending rows to the source data frame of a GroupedDataFrame
df = DataFrame(id=1:2)
gd = groupby(view(df, :, :), :id)
push!(df, [3])
gd[1]
# working with a view into a df (@view)
df = DataFrame(a=1:5)
sdf = @view df[2:3, :]
transform(sdf, :a => ByRow(string)) # create a new data frame
transform!(sdf, :a => ByRow(string)) # update the source df in-place
show(df)
select!(sdf, :a => -, renamecols=false)
show(df) # the result replaced an existing column; rows outside the view keep their previously stored values
# the same applies to GroupedDataFrame
df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=1:6)
sdf = @view df[2:4, :]
gsdf = groupby(sdf, :a)
transform(gsdf, nrow) # creates a new data frame
transform!(gsdf, nrow, :b => :b_copy)
show(df)
select!(gsdf, :b_copy, :b => sum, renamecols=false)
show(df)
# reshape
# stack
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),"..", "docs", "src", "assets", "iris.csv")),DataFrame)
stack(iris, 1:4)
stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
stack(iris, [:SepalLength, :SepalWidth], :Species)
stack(iris, Not(:Species))
iris.id = 1:size(iris, 1)
longdf = stack(iris, Not([:Species, :id]))
# unstack
unstack(longdf, :id, :variable, :value)
unstack(longdf, [:id, :Species], :variable, :value)
unstack(longdf, :variable, :value)
unstack(longdf) # default for variable and value will be determined
stack(iris, view=true) # Passing view=true to stack returns a data frame whose columns are views into the original wide data frame. This saves memory.
# stack and aggregation
using Statistics
d = stack(iris, Not(:Species))
x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum)
first(unstack(x, :Species, :vsum), 6)
# transposing a DataFrame
df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
permutedims(df1, 1) # performs type promotion across the transposed rows (checked below)
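# checking the promotion (df1_t is just an illustrative name):
df1_t = permutedims(df1, 1)
eltype.(eachcol(df1_t)) # the value columns share a common promoted type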
# sorting
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),"..", "docs", "src", "assets", "iris.csv")),DataFrame)
sort!(iris) # lexicographic sort on all columns
sort!(iris, rev=true)
sort!(iris, [:Species, :SepalWidth])
sort!(iris, [order(:Species, by=length), order(:SepalLength, rev=true)])
sort!(iris, [:Species, :PetalLength], rev=[true, false])
sort!(iris, [order(:Species, rev=true), :PetalLength])
# working with Categorical Data in DataFrames
using CategoricalArrays
v = ["Group A", "Group A", "Group A", "Group B", "Group B", "Group B"]
cv = categorical(v)
cv = categorical(["Group A", missing, # CategoricalVectors support missing values
                  "Group A", "Group B",
                  "Group B", missing])
levels(cv)
sort(cv)
levels!(cv, ["Group B", "Group A"]) # changing the order of appearance of the levels - useful for display purposes or when working with ordered variables
sort(cv)
cv = compress(cv) # compress to the smallest reference type that can hold the current levels (saves memory; the default UInt32 supports about 4 billion levels)
cv1 = categorical(["A", "B"], compress=true) # same compressing as above
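# the effect of compression shows up in the reference-type parameter of the array type:
typeof(categorical(["A", "B"])) # contains UInt32 (the default reference type)
typeof(cv1) # contains UInt8 (compressed)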
cv2 = categorical(["A", "B"], ordered=true) # ordered factor
cv1[1] < cv1[2] # error: cv1 is not ordered
cv2[1] < cv2[2] # works: cv2 is ordered
isordered(cv1)
ordered!(cv1, true)
isordered(cv1)
# missingness
missing
missings(Int, 2, 3) # The missings function constructs Vectors and Arrays supporting missing values, using the optional first argument to specify the element-type
missings(3)
typeof(missing)
x = [1, 2, missing]
eltype(x)
Union{Missing, Int} # the element type returned by eltype(x)
skipmissing(x)
collect(skipmissing(x))
coalesce.(x, 0)
df = DataFrame(i=1:5,
x=[missing, 4, missing, 2, 1],
y=[missing, missing, "c", "d", "e"])
dropmissing(df)
dropmissing(df, :x)
dropmissing(df, disallowmissing=true)
df = DataFrame(x=1:3, y=4:6)
allowmissing!(df)
df[1, 1] = missing
disallowmissing(df, :y)
disallowmissing(df, error=false)
using Missings
passmissing(uppercase)("a")
passmissing(uppercase)(missing) # passmissing is a higher-order function: it takes a function f and returns a new function that returns missing if any positional argument is missing and otherwise applies f
Missings.replace(x, 1) |> collect # function Missings.replace returns an iterator which replaces missing elements with another value
collect(Missings.replace(x, 1)) == coalesce.(x, 1)
eltype(x)
nonmissingtype(eltype(x))
parentmodule(nonmissingtype)