Skip to content

Instantly share code, notes, and snippets.

@vrld
Last active August 29, 2015 14:03
Show Gist options
  • Save vrld/1e4bd79921edac55d0d9 to your computer and use it in GitHub Desktop.
Save vrld/1e4bd79921edac55d0d9 to your computer and use it in GitHub Desktop.
module Crossvalidation
using Random: shuffle!

export KFold, StratifiedKFold, LOO, RandomSplit
#==
K-Fold cross validation
The callback receives the fold number followed by the training and test data.
Example (5-Fold CV):
KFold(X, y, 5) do fold, X_tr, y_tr, X_te, y_te
    train!(model, X_tr, y_tr)
    confusion = zeros(2,2)
    for i = 1:length(y_te)
        pred = predict(model, X_te[i,:])
        confusion[y_te[i]+1, pred+1] += 1
    end
    confusion
end
==#
# Pick the observations `rows` of `X`: rows of a matrix, elements of a vector.
_kfold_select(X::AbstractMatrix, rows) = X[rows, :]
_kfold_select(X::AbstractVector, rows) = X[rows]

"""
    KFold(f, X, y, K)

Run `K`-fold cross validation and return a `Vector{Any}` holding the value of `f`
for each fold, in fold order.

The observations are shuffled and partitioned into `K` disjoint test folds whose
sizes differ by at most one, so every observation is used for testing exactly once.
For fold `i` the callback is invoked as `f(i, X_tr, y_tr, X_te, y_te)`, where the
test part is fold `i` and the training part is every other observation.

# Arguments
- `f::Function`: callback, usually supplied with `do`-block syntax.
- `X`: observations; rows of a matrix or elements of a vector (1-based indexing).
- `y::AbstractVector`: one label per observation.
- `K::Integer`: number of folds, with `1 <= K <= length(y)`.

# Throws
- `DimensionMismatch` if `size(X, 1) != length(y)`.
- `ArgumentError` if `K` is outside `1:length(y)`.
"""
function KFold(f::Function, X::Union{AbstractVector,AbstractMatrix}, y::AbstractVector,
               K::Integer)
    nelem = length(y)
    size(X, 1) == nelem || throw(DimensionMismatch(
        "X has $(size(X, 1)) observations but y has $nelem labels"))
    1 <= K <= nelem || throw(ArgumentError(
        "K must satisfy 1 <= K <= length(y) = $nelem, got $K"))

    perm = shuffle!(collect(1:nelem))
    # The first `extra` folds get one additional element so that the folds tile
    # `perm` exactly, without the overlap a rounded-up block size would cause.
    base, extra = divrem(nelem, K)
    ret = Any[]
    stop = 0
    for i in 1:K
        start = stop + 1
        stop += base + (i <= extra ? 1 : 0)
        te = perm[start:stop]
        tr = vcat(perm[1:start-1], perm[stop+1:end])
        push!(ret, f(i, _kfold_select(X, tr), y[tr], _kfold_select(X, te), y[te]))
    end
    return ret
end
#==
Stratified K-Fold cross validation:
Each fold has approximately the same distribution of labels
==#
# Pick the observations `rows` of `X`: rows of a matrix, elements of a vector.
_skfold_select(X::AbstractMatrix, rows) = X[rows, :]
_skfold_select(X::AbstractVector, rows) = X[rows]

"""
    StratifiedKFold(f, X, y, K)

Run stratified `K`-fold cross validation and return a `Vector{Any}` holding the
value of `f` for each fold, in fold order.

The members of every label are shuffled and dealt round-robin to the `K` test
folds. The folds are therefore disjoint, each observation is tested exactly once,
the per-label counts of any two folds differ by at most one, and so do the total
fold sizes. A label with fewer than `K` members is absent from some test folds.
For fold `i` the callback is invoked as `f(i, X_tr, y_tr, X_te, y_te)`.

# Arguments
- `f::Function`: callback, usually supplied with `do`-block syntax.
- `X`: observations; rows of a matrix or elements of a vector (1-based indexing).
- `y::AbstractVector`: one label per observation; labels are compared with `isequal`.
- `K::Integer`: number of folds, with `1 <= K <= length(y)`.

# Throws
- `DimensionMismatch` if `size(X, 1) != length(y)`.
- `ArgumentError` if `K` is outside `1:length(y)`.
"""
function StratifiedKFold(f::Function, X::Union{AbstractVector,AbstractMatrix},
                         y::AbstractVector, K::Integer)
    nelem = length(y)
    size(X, 1) == nelem || throw(DimensionMismatch(
        "X has $(size(X, 1)) observations but y has $nelem labels"))
    1 <= K <= nelem || throw(ArgumentError(
        "K must satisfy 1 <= K <= length(y) = $nelem, got $K"))

    nfolds = Int(K)
    folds = [Int[] for _ in 1:nfolds]
    # The cursor keeps advancing across labels so the leftover members of one label
    # do not always land in the first folds.
    cursor = 0
    for l in unique(y)
        for j in shuffle!(findall(isequal(l), y))
            cursor = cursor % nfolds + 1
            push!(folds[cursor], j)
        end
    end

    ret = Any[]
    for i in 1:K
        te = folds[i]
        tr = reduce(vcat, folds[[1:i-1; i+1:nfolds]]; init=Int[])
        push!(ret, f(i, _skfold_select(X, tr), y[tr], _skfold_select(X, te), y[te]))
    end
    return ret
end
#==
Leave-one-out cross validation
==#
# Pick the observations `rows` of `X`: rows of a matrix, elements of a vector.
_loo_select(X::AbstractMatrix, rows) = X[rows, :]
_loo_select(X::AbstractVector, rows) = X[rows]

"""
    LOO(f, X, y[, K])

Run leave-one-out cross validation and return a `Vector{Any}` holding the value of
`f` for each observation, in order.

For `i = 1, ..., length(y)` the callback is invoked as `f(X_tr, y_tr, X_te, y_te)`.
Note that no fold index is passed. The test part contains only observation `i`
(a one-row matrix or one-element vector) and the training part contains all other
observations in their original order. Empty input yields an empty result.

# Arguments
- `f::Function`: callback, usually supplied with `do`-block syntax.
- `X`: observations; rows of a matrix or elements of a vector (1-based indexing).
- `y::AbstractVector`: one label per observation.
- `K::Integer`: ignored; accepted only so existing four-argument calls keep working.

# Throws
- `DimensionMismatch` if `size(X, 1) != length(y)`.
"""
function LOO(f::Function, X::Union{AbstractVector,AbstractMatrix}, y::AbstractVector,
             K::Integer = 0)
    n = length(y)
    size(X, 1) == n || throw(DimensionMismatch(
        "X has $(size(X, 1)) observations but y has $n labels"))

    ret = Any[]
    for i in 1:n
        te = i:i                  # a range keeps the test part array-shaped
        tr = [1:i-1; i+1:n]
        push!(ret, f(_loo_select(X, tr), y[tr], _loo_select(X, te), y[te]))
    end
    return ret
end
#==
Random split into two sets
==#
"""
    RandomSplit(X, y, fraction = 0.5)

Randomly split the observations into two sets and return
`(X_a, y_a, X_b, y_b)`.

Set `a` receives as many randomly chosen observations as there are indices
`i in 1:length(y)` with `i <= fraction * length(y)`; set `b` receives the rest.
Within each set the observations keep their original relative order.

# Arguments
- `X::AbstractMatrix`: one observation per row.
- `y::AbstractVector`: one label per observation.
- `fraction::Real`: share of observations assigned to set `a`, within `[0, 1]`.

# Throws
- `DimensionMismatch` if `size(X, 1) != length(y)`.
- `ArgumentError` if `fraction` is not within `[0, 1]`.
"""
function RandomSplit(X::AbstractMatrix, y::AbstractVector, fraction::Real = 0.5)
    n = length(y)
    size(X, 1) == n || throw(DimensionMismatch(
        "X has $(size(X, 1)) observations but y has $n labels"))
    0 <= fraction <= 1 || throw(ArgumentError(
        "fraction must be within [0, 1], got $fraction"))

    # Boolean membership mask: the right number of `true`s, moved to random places.
    idx = shuffle!((1:n) .<= fraction * n)
    rest = .!idx
    return X[idx, :], y[idx], X[rest, :], y[rest]
end
"""
    RandomSplit(f, X, y, N, fraction = 0.5)

Draw `N` independent random splits of the observations and return a `Vector{Any}`
holding the value of `f` for each split, in order.

For split `i` the callback is invoked as `f(i, X_a, y_a, X_b, y_b)`. Set `a`
receives as many randomly chosen observations as there are indices
`j in 1:length(y)` with `j <= fraction * length(y)`; set `b` receives the rest.
Within each set the observations keep their original relative order.

# Arguments
- `f::Function`: callback, usually supplied with `do`-block syntax.
- `X::AbstractMatrix`: one observation per row.
- `y::AbstractVector`: one label per observation.
- `N::Integer`: number of splits to draw; a value below one yields an empty result.
- `fraction::Real`: share of observations assigned to set `a`, within `[0, 1]`.

# Throws
- `DimensionMismatch` if `size(X, 1) != length(y)`.
- `ArgumentError` if `fraction` is not within `[0, 1]`.
"""
function RandomSplit(f::Function, X::AbstractMatrix, y::AbstractVector, N::Integer,
                     fraction::Real = 0.5)
    n = length(y)
    size(X, 1) == n || throw(DimensionMismatch(
        "X has $(size(X, 1)) observations but y has $n labels"))
    0 <= fraction <= 1 || throw(ArgumentError(
        "fraction must be within [0, 1], got $fraction"))

    ret = Any[]
    for i in 1:N
        # A fresh membership mask per split, so the splits are independent.
        idx = shuffle!((1:n) .<= fraction * n)
        rest = .!idx
        push!(ret, f(i, X[idx, :], y[idx], X[rest, :], y[rest]))
    end
    return ret
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment