public
Last active

splitdf.randomize.r

  • Download Gist
splitdf.randomize.r
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
# splitdf splits a data frame into a training set and a testing set.
#
# Arguments:
#   dataframe: the data frame whose rows are to be divided.
#   seed:      optional random seed; when not NULL it is passed to set.seed()
#              before sampling, so the split is reproducible.
#   trainfrac: fraction of the rows assigned to the training set; a single
#              number strictly between 0 and 1.
#
# Returns a list of two data frames, `trainset` and `testset`. The training
# set receives trunc(nrow(dataframe) / (1 / trainfrac)) randomly chosen rows
# (in sampled order); the testing set receives all remaining rows in their
# original order.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
  valid_frac <- is.numeric(trainfrac) && length(trainfrac) == 1L &&
    !is.na(trainfrac) && trainfrac > 0 && trainfrac < 1
  if (!valid_frac) {
    stop("Training fraction must be between 0 and 1, not inclusive")
  }
  if (!is.null(seed)) set.seed(seed)

  n <- nrow(dataframe)
  # Same size formula as before, so existing splits keep their sizes.
  ntrain <- trunc(n / (1 / trainfrac))
  # sample.int() draws from seq_len(n); building the index as 1:n would
  # yield c(1, 0) for a data frame without rows.
  trainindex <- sample.int(n, ntrain)

  # Pick the test rows with a logical mask rather than negative indexing:
  # dataframe[-integer(0), ] returns no rows at all when the training set is
  # empty. drop = FALSE keeps one-column inputs as data frames.
  in_train <- seq_len(n) %in% trainindex
  trainset <- dataframe[trainindex, , drop = FALSE]
  testset <- dataframe[!in_train, , drop = FALSE]
  list(trainset = trainset, testset = testset)
}
 
# splitdf.randomize repeatedly calls splitdf() until the chosen columns look
# evenly distributed between the training and testing sets.
#
# Arguments:
#   dataframe:     the data frame to split.
#   ttestcolnames: character vector naming the columns to compare between the
#                  two sets. They are compared with t.test(), so they must be
#                  continuous variables (chi2 for categorical columns is not
#                  yet implemented).
#   ...:           further arguments forwarded to splitdf(), e.g. trainfrac.
#                  A named `seed` is applied once, before the first split,
#                  instead of being forwarded: re-seeding on every attempt
#                  would reproduce the same split, so a rejected split could
#                  never be replaced.
#
# A split is accepted once no column's t-test p-value is below 0.5. The
# p-values of every attempt are printed. There is no limit on the number of
# attempts.
#
# Returns a list of two data frames: trainset and testset.
splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ...) {
  d <- dataframe
  # Report only the columns that are actually absent.
  missingcols <- setdiff(ttestcolnames, names(d))
  if (length(missingcols) > 0) {
    stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
  }

  splitargs <- list(...)
  if ("seed" %in% names(splitargs)) {
    seed <- splitargs[["seed"]]
    if (!is.null(seed)) set.seed(seed)
    splitargs[["seed"]] <- NULL
  }

  repeat {
    sets <- do.call(splitdf, c(list(d), splitargs))
    trainset <- sets$trainset
    testset <- sets$testset
    # Iterate over ttestcolnames itself so each p-value lines up with the
    # name it is printed next to, whatever the column order in d.
    ps <- vapply(ttestcolnames, function(col) {
      t.test(trainset[[col]], testset[[col]])$p.value
    }, numeric(1), USE.NAMES = FALSE)
    print(paste(ttestcolnames, " t-test p-value =", ps))
    cat("\n")
    if (!any(ps < .5)) break
  }
  list(trainset = trainset, testset = testset)
}
 
# A plain random split may leave the training and testing sets with
# significant differences in a variable of interest.
data("iris")
s44 <- splitdf(dataframe = iris, seed = 44)
train <- s44[["trainset"]]
test <- s44[["testset"]]
t.test(train$Sepal.Length, test$Sepal.Length)

# Step 1: name the columns that should come out "even" across the two sets.
cols <- c(
  "Sepal.Length", "Sepal.Width",
  "Petal.Length", "Petal.Width"
)

# Step 2: split the dataset again, keeping those variables evenly distributed.
set.seed(80842)
evensplit <- splitdf.randomize(dataframe = iris, ttestcolnames = cols)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.