A short excursion through the DataFrames.jl API
using DataFrames
# create a data frame (simple constructor)
df = DataFrame(A=1:4, B=["M", "N", "O", "P"])
df[!, :A] # no copy
df[:, :A] # copy is returned
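# a quick check of the difference above (using the df just defined):
# the ! form returns the stored vector itself, the : form returns a fresh copy
df[!, :A] === df.A # true, same underlying vector
df[:, :A] === df.A # false, a copy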
# create a data frame row by row (not performant: like R data frames, DataFrames are stored columnwise; a columnwise equivalent follows below)
df2 = DataFrame(A=Int[], B=String[])
push!(df2, (1, "M"));
push!(df2, [2, "N"])
push!(df2, Dict(:B => "F", :A => 3))
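# the columnwise equivalent of the push! loop above, usually the preferred and
# faster way when the data is known up front (a minimal sketch; df2_columnwise is an illustrative name)
df2_columnwise = DataFrame(A=[1, 2, 3], B=["M", "N", "F"])
df2_columnwise == df2 # true, same content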
# colnames
names(df)
names(df, r"A")
names(df, Int)
names(df, Int32) |> length # 0 matches
# colnames as symbols
propertynames(df)
# summarise data
describe(df)
# first & last row
first(df)
first(df, 2)
last(df)
# factor-like columns via CategoricalArrays
using CategoricalArrays
DataFrame(a=1:2,
          b=[1.0, missing],
          c=categorical('a':'b'),
          d=[1 // 2, missing])
# typeof
DataFrame == typeof(df) # true
# copy a data frame (the columns are copied, but not the elements they contain)
df2 = copy(df2)
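# a small check of the above (df2_cp is an illustrative name):
df2_cp = copy(df2)
df2_cp.A === df2.A # false, the column vector was copied
df2_cp == df2 # true, identical content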
# in-place edits (sort!, assignment) on a data frame; the constructor copied x, so x stays untouched
x = [3, 1, 2];
df = DataFrame(x=x) # copy of x, not direct reference to object x
sort!(df)
df.x[1] = 100
df
x # untouched
# subsetting
df = DataFrame(A=1:6, B=["M", "N", "O", "P", "Q", "R"]) # rebuild an example frame with the rows and columns used below
df[1:2, :] # rows 1-2, all columns (note: ! is only valid in the row position, not as a column selector)
df[1:3, :]
eachcol(df)[1]
df[[1, 3, 5], [:B, :A]]
df[[1, 3, 5], [:A]]
df[[1, 3, 5], :A] # the DataFrame wrapper is dropped, a plain vector is returned
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
df[df.A .> 500, :]
df[(df.A .> 500) .& (300 .< df.C .< 400), :]
df[in.(df.A, Ref([1, 5, 601])), :] # match a specific set of values with in.(); wrapping the collection in Ref makes broadcasting treat it as a single scalar argument
df[in([1, 5, 601]).(df.A), :] # same as above
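# a minimal illustration of the Ref remark above (toy values, not part of the original):
in.([1, 2, 3], Ref([1, 3])) # true, false, true
# for large lookup sets, wrapping a Set is a common speed-up
df[in.(df.A, Ref(Set([1, 5, 601]))), :]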
# selecting cols
df = DataFrame(x1=1, x2=2, y=3)
df[!, r"x"]
df[!, Not(:x1)] # drop :x1 column
df[:, Between(:x1, :y)] # keep columns between :x1 and :y
df[:, All()] # keep all columns
df[:, Cols(x -> startswith(x, "x"))] # keep columns whose name starts with "x"
df = DataFrame(r=1, x1=2, x2=3, y=4)
df[:, Cols(r"x", :)]
df[:, Cols(Not(r"x"), :)]
# views into df
@view df[1:3, :A]
# views do not copy; the following forms likewise return direct references to the underlying data:
df[!, :A] # when a ! is placed in the first indexing position
df.A # when using . (getpropery)
df[!, [:A, :B]]
df[1, [:A, :B]] # when a single row is selected using an integer
@view df[1:3, :A] # when view or @view is used
# function 'subset'
df = DataFrame(A=1:2:1000, B=repeat(1:10, inner=50), C=1:500)
subset(df, :A => a -> a .< 10, :C => c -> iseven.(c))
df = DataFrame(x=[1, 2, missing, 4])
subset(df, :x => i -> [false, true, true, false])
subset(df, :x => rrr -> coalesce.(iseven.(rrr), false))
subset(df, :x => x -> iseven.(x), skipmissing=true) # same as above
# function 'select'
# select always returns a data frame
df = DataFrame(x1=[1, 2], x2=[3, 4], y=[5, 6])
select(df, :x1)
select(df, Not(:x1))
select(df, r"x") # select columns containing 'x' character
select(df, :x1 => :a1, :x2 => :a2) # rename columns
select(df, :x1, :x2 => (x -> x .- minimum(x)) => :x2_new) # transform columns
select(df, :x2, :x2 => ByRow(sqrt)) # transform columns by row
select(df, AsTable(:) => ByRow(extrema) => [:lo, :hi])
df2 = select(df, :x1, copycols=false) # direct reference
df2.x1 === df.x1 # same object
select!(df, Not(:x1)); # in-place selection
print(df)
# function 'transform'/'transform!'
import Random
import FreqTables # freqtable & prop
using Statistics # mean
Random.seed!(1)
df = DataFrame(rand(100000, 3), [:a, :b, :c])
transform!(df, AsTable(:) => ByRow(argmax) => :prediction)
tbl = df.prediction |> FreqTables.freqtable
tbl |> FreqTables.prop
df = DataFrame(x=[1, 2, missing], y=[1, missing, missing])
transform(df, AsTable(:) .=>
              ByRow.([sum ∘ skipmissing,
                      x -> count(!ismissing, x),
                      mean ∘ skipmissing]) .=> [:sum, :n, :mean])
# function 'combine'
# apply function to each column
df = DataFrame(A=1:4, B=4.0:-1.0:1.0)
combine(df, names(df) .=> sum)
combine(df, names(df) .=> sum, names(df) .=> prod)
select(df, names(df) .=> sum, names(df) .=> prod) # works with select which retains the number of rows
# replacing data
df = DataFrame(a=["a", "None", "b", "None"], b=1:4,
c=["None", "j", "k", "h"], d=["x", "y", "None", "z"])
replace!(df.a, "None" => "c")
# replacement on a subset of columns [:c, :d]
df[:, [:c, :d]] .= ifelse.(df[!, [:c, :d]] .== "None", "c", df[!, [:c, :d]])
# replacement on entire data frame
# Do note that in the above examples, changing .= to just = will allocate new column vectors instead of applying the operation in-place.
df .= ifelse.(df .== "c", "None", df)
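# a small check of the in-place remark above: .= reuses the existing column vectors,
# while a plain assignment allocates new ones (col_a, df3 and the "zzz" placeholder are illustrative only)
col_a = df.a
df .= ifelse.(df .== "zzz", "zzz", df) # a no-op replacement, applied in place
df.a === col_a # true, the same vector was updated
df3 = ifelse.(df .== "zzz", "zzz", df) # allocates a new data frame
df3.a === df.a # false, freshly allocated column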
# When replacing values with missing, if the columns do not already allow for missing values, one has to either avoid in-place operation and use = instead of .=, or call allowmissing! beforehand:
df2 = ifelse.(df .== "None", missing, df)
# or first allow missing values
allowmissing!(df) # operate in-place after allowing for missing
df .= ifelse.(df .== "None", missing, df)
# reading data into a data frame from CSV
# package CSV (compile latency might be high), hence alternatively:
# std module DelimitedFiles
# other utilities: Arrow.jl, Feather.jl, Avro.jl, JSONTables.jl, Parquet.jl, StatFiles.jl, XLSX.jl, ClipData.jl
using CSV
df = DataFrame(x=1, y=2)
output = "dummy_data.csv"
input = output
CSV.write(output, df)
@assert "dummy_data.csv" in readdir()
DataFrame(CSV.File(input))
using DelimitedFiles
data, header = readdlm(joinpath(dirname(pathof(DataFrames)),
"..", "docs", "src", "assets", "iris.csv"),
',', header=true)
iris_raw = DataFrame(data, vec(header))
iris = identity.(iris_raw) # broadcasting identity narrows the element types
eltype.(eachcol(iris_raw))
eltype.(eachcol(iris))
writedlm("test.csv", Iterators.flatten(([names(iris)], eachrow(iris))), ',')
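# reading the file written above back in, to confirm the round trip (data2, header2, iris_back are illustrative names)
data2, header2 = readdlm("test.csv", ',', header=true)
iris_back = identity.(DataFrame(data2, vec(header2)))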
# joins
people = DataFrame(ID=[20, 40], Name=["John Doe", "Jane Doe"])
jobs = DataFrame(ID=[20, 60], Job=["Lawyer", "Astronaut"])
innerjoin(people, jobs, on=:ID)
leftjoin(people, jobs, on=:ID)
rightjoin(people, jobs, on=:ID)
outerjoin(people, jobs, on=:ID)
semijoin(people, jobs, on=:ID)
antijoin(people, jobs, on=:ID)
crossjoin(people, jobs, makeunique=true) # cartesian product (https://en.wikipedia.org/wiki/Join_(SQL))
a = DataFrame(City=["Amsterdam", "London", "London", "New York", "New York"],
Job=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Category=[1, 2, 3, 4, 5])
b = DataFrame(Location=["Amsterdam", "London", "London", "New York", "New York"],
Work=["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
Name=["a", "b", "c", "d", "e"])
innerjoin(a, b, on=[:City => :Location, :Job => :Work])
a = DataFrame(ID=[20, 40], Name=["John", "Jane"])
b = DataFrame(ID=[20, 60], Job=["Lawyer", "Doctor"])
innerjoin(a, b, on=:ID, validate=(true, true)) # validate that the join keys are unique in the left and right tables
outerjoin(a, b, on=:ID, validate=(true, true), source=:source) # using the source keyword argument you can add a column to the resulting data frame indicating whether the given row appeared only in the left, the right or both data frames
# split-apply-combine
using DataFrames, CSV, Statistics
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),
"..", "docs", "src", "assets", "iris.csv")),
DataFrame)
gdf = groupby(iris, :Species) # GroupedDataFrame
combine(gdf, :PetalLength => mean)
combine(gdf, nrow)
combine(gdf, nrow, :PetalLength => mean => :mean)
combine(gdf,
[:PetalLength, :SepalLength] => ((p, s) -> (a = mean(p) / mean(s), b = sum(p))) =>
AsTable) # multiple columns are passed as arguments
combine(gdf,
AsTable([:PetalLength, :SepalLength]) =>
x -> std(x.PetalLength) / std(x.SepalLength)) # passing a NamedTuple
combine(x -> std(x.PetalLength) / std(x.SepalLength), gdf)
combine(gdf, 1:2 => cor, nrow)
combine(gdf, :PetalLength => (x -> [extrema(x)]) => [:min, :max])
# compare to select
select(gdf, 1:2 => cor)
transform(gdf, :Species => (x -> chop.(x, head=5, tail=0)) => :New)
# do block (not performant; a faster equivalent follows below)
combine(gdf) do df
(m = mean(df.PetalLength), s² = var(df.PetalLength))
end
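# a (usually faster) equivalent written in the source => function => destination
# mini-language instead of the do block above
combine(gdf, :PetalLength => (x -> (m = mean(x), s² = var(x))) => AsTable)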
# iterating over groups
for subdf in groupby(iris, :Species)
    println(size(subdf, 1))
end
for (key, subdf) in pairs(groupby(iris, :Species))
    println("Number of data points for $(key.Species): $(nrow(subdf))")
end
df = DataFrame(g=repeat(1:1000, inner=5), x=1:5000)
gdf = groupby(df, :g)
gdf[(g = 500,)]
gdf[[(500,), (501,)]]
# apply a function to every non-grouping column
gd = groupby(iris, :Species)
combine(gd, valuecols(gd) .=> mean)
# a GroupedDataFrame is a view and is meant to stay read-only: pushing rows to the parent data frame and then accessing the groups throws an error
df = DataFrame(id=1:2)
gd = groupby(df, :id)
push!(df, [3])
gd[1] # error: the parent data frame was modified after grouping
# grouping on a view of df allows appending rows to the source data frame of a GroupedDataFrame
df = DataFrame(id=1:2)
gd = groupby(view(df, :, :), :id)
push!(df, [3])
gd[1]
# working with a view into a df (@view)
df = DataFrame(a=1:5)
sdf = @view df[2:3, :]
transform(sdf, :a => ByRow(string)) # create a new data frame
transform!(sdf, :a => ByRow(string)) # update the source df in-place
show(df)
select!(sdf, :a => -, renamecols=false)
show(df) # the result replaced an existing column; rows outside the view keep their previously stored values
# the same applies to GroupedDataFrame
df = DataFrame(a=[1, 1, 1, 2, 2, 3], b=1:6)
sdf = @view df[2:4, :]
gsdf = groupby(sdf, :a)
transform(gsdf, nrow) # creates a new data frame
transform!(gsdf, nrow, :b => :b_copy)
show(df)
select!(gsdf, :b_copy, :b => sum, renamecols=false)
show(df)
# reshape
# stack
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),"..", "docs", "src", "assets", "iris.csv")),DataFrame)
stack(iris, 1:4)
stack(iris, [:SepalLength, :SepalWidth, :PetalLength, :PetalWidth])
stack(iris, [:SepalLength, :SepalWidth], :Species)
stack(iris, Not(:Species))
iris.id = 1:size(iris, 1)
longdf = stack(iris, Not([:Species, :id]))
# unstack
unstack(longdf, :id, :variable, :value)
unstack(longdf, [:id, :Species], :variable, :value)
unstack(longdf, :variable, :value)
unstack(longdf) # default for variable and value will be determined
stack(iris, view=true) # Passing view=true to stack returns a data frame whose columns are views into the original wide data frame. This saves memory.
# stack and aggregation
using Statistics
d = stack(iris, Not(:Species))
x = combine(groupby(d, [:variable, :Species]), :value => mean => :vsum)
first(unstack(x, :Species, :vsum), 6)
# transposing a DataFrame
df1 = DataFrame(a=["x", "y"], b=[1.0, 2.0], c=[3, 4], d=[true, false])
permutedims(df1, 1) # performs type promotion across the transposed rows (checked below)
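# checking the promotion (df1_t is just an illustrative name):
df1_t = permutedims(df1, 1)
eltype.(eachcol(df1_t)) # the value columns share a common promoted type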
# sorting
iris = CSV.read((joinpath(dirname(pathof(DataFrames)),"..", "docs", "src", "assets", "iris.csv")),DataFrame)
sort!(iris) # lexicographic sort on all columns
sort!(iris, rev=true)
sort!(iris, [:Species, :SepalWidth])
sort!(iris, [order(:Species, by=length), order(:SepalLength, rev=true)])
sort!(iris, [:Species, :PetalLength], rev=[true, false])
sort!(iris, [order(:Species, rev=true), :PetalLength])
# working with Categorical Data in DataFrames
using CategoricalArrays
v = ["Group A", "Group A", "Group A", "Group B", "Group B", "Group B"]
cv = categorical(v)
cv = categorical(["Group A", missing, # CategoricalVectors support missing values
                  "Group A", "Group B",
                  "Group B", missing])
levels(cv)
sort(cv)
levels!(cv, ["Group B", "Group A"]) # changing the order of appearance of the levels - useful for display purposes or when working with ordered variables
sort(cv)
cv = compress(cv) # compress to the smallest reference type that can hold the current levels (saves memory; the default UInt32 supports about 4 billion levels)
cv1 = categorical(["A", "B"], compress=true) # same compressing as above
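# the effect of compression shows up in the reference-type parameter of the array type:
typeof(categorical(["A", "B"])) # contains UInt32 (the default reference type)
typeof(cv1) # contains UInt8 (compressed)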
cv2 = categorical(["A", "B"], ordered=true) # ordered factor
cv1[1] < cv1[2] # error: cv1 is not ordered
cv2[1] < cv2[2] # works: cv2 is ordered
isordered(cv1)
ordered!(cv1, true)
isordered(cv1)
# missingness
missing
missings(Int, 2, 3) # The missings function constructs Vectors and Arrays supporting missing values, using the optional first argument to specify the element-type
missings(3)
typeof(missing)
x = [1, 2, missing]
eltype(x)
Union{Missing, Int} # the element type returned by eltype(x)
skipmissing(x)
collect(skipmissing(x))
coalesce.(x, 0)
df = DataFrame(i=1:5,
x=[missing, 4, missing, 2, 1],
y=[missing, missing, "c", "d", "e"])
dropmissing(df)
dropmissing(df, :x)
dropmissing(df, disallowmissing=true)
df = DataFrame(x=1:3, y=4:6)
allowmissing!(df)
df[1, 1] = missing
disallowmissing(df, :y)
disallowmissing(df, error=false)
using Missings
passmissing(uppercase)("a")
passmissing(uppercase)(missing) # passmissing is a higher-order function: it takes a function f and returns a new function that returns missing if any positional argument is missing and otherwise applies f
Missings.replace(x, 1) |> collect # function Missings.replace returns an iterator which replaces missing elements with another value
collect(Missings.replace(x, 1)) == coalesce.(x, 1)
eltype(x)
nonmissingtype(eltype(x))
parentmodule(nonmissingtype)