Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@stephenturner
Created February 19, 2011 02:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stephenturner/834760 to your computer and use it in GitHub Desktop.
Save stephenturner/834760 to your computer and use it in GitHub Desktop.
splitdf.randomize.r
# splitdf splits a data frame into a training and a testing set.
#
# Arguments:
#   dataframe  data frame whose rows are divided between the two sets.
#   seed       optional value handed to set.seed() before sampling, so the
#              split can be reproduced. NULL (the default) leaves the random
#              number generator untouched.
#   trainfrac  fraction of the rows that goes into the training set; a single
#              number strictly between 0 and 1.
#
# Returns a list of two data frames: trainset and testset. trainset holds
# trunc(nrow * trainfrac) randomly drawn rows (in drawn order), testset holds
# every remaining row in its original order.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
  if (!is.numeric(trainfrac) || length(trainfrac) != 1L || is.na(trainfrac) ||
      trainfrac <= 0 || trainfrac >= 1) {
    stop("Training fraction must be between 0 and 1, not inclusive")
  }
  if (!is.null(seed)) set.seed(seed)
  nrows <- nrow(dataframe)
  # trunc() rounds the training size down, so small inputs may get 0 rows.
  ntrain <- trunc(nrows / (1 / trainfrac))
  # sample.int() also copes with zero rows, where 1:nrow() would give c(1, 0).
  trainindex <- sample.int(nrows, ntrain)
  # Pick the test rows with a logical mask instead of negative indices:
  # dataframe[-integer(0), ] selects *no* rows, which would leave the test
  # set empty whenever no training row was drawn.
  intrain <- seq_len(nrows) %in% trainindex
  # drop = FALSE keeps one-column inputs as data frames.
  trainset <- dataframe[trainindex, , drop = FALSE]
  testset <- dataframe[!intrain, , drop = FALSE]
  list(trainset = trainset, testset = testset)
}
# splitdf.randomize repeatedly splits a data frame with splitdf() until the
# chosen columns are evenly distributed between the two sets.
#
# Arguments:
#   dataframe      data frame you want to randomize.
#   ttestcolnames  character vector with the names of the columns you want to
#                  be sure are equally distributed among the two sets. These
#                  columns must be continuous variables (chi2 for categorical
#                  columns is not yet implemented).
#   ...            further arguments forwarded to splitdf(); pass them by
#                  name, e.g. trainfrac=0.7. A named seed is applied once
#                  before the first split and not forwarded, because
#                  re-seeding on every attempt would reproduce the same
#                  split forever.
#   pthreshold     a split is accepted once the t-test p-value of every
#                  listed column is at least this value (default 0.5).
#
# Prints the p-values of every attempt. Returns a list of two data frames:
# trainset and testset.
splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ...,
                              pthreshold=0.5) {
  # Report only the names that are actually missing, in a single message.
  missingcols <- setdiff(ttestcolnames, names(dataframe))
  if (length(missingcols) > 0) {
    stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
  }
  if (!is.numeric(pthreshold) || length(pthreshold) != 1L ||
      is.na(pthreshold) || pthreshold < 0 || pthreshold > 1) {
    stop("pthreshold must be a single number between 0 and 1")
  }
  splitargs <- list(...)
  if (!is.null(splitargs[["seed"]])) {
    set.seed(splitargs[["seed"]])
    splitargs[["seed"]] <- NULL
  }
  repeat {
    sets <- do.call(splitdf, c(list(dataframe), splitargs))
    trainset <- sets$trainset
    testset <- sets$testset
    # Iterate over the requested names themselves so that each p-value lines
    # up with the label printed next to it, whatever the column order of the
    # data frame is.
    ps <- vapply(ttestcolnames, function(colname) {
      t.test(trainset[[colname]], testset[[colname]])$p.value
    }, numeric(1), USE.NAMES = FALSE)
    print(paste(ttestcolnames," t-test p-value =",ps))
    cat("\n")
    if (anyNA(ps)) {
      stop("t-test returned a missing p-value; cannot compare the sets")
    }
    if (all(ps >= pthreshold)) break
  }
  list(trainset = trainset, testset = testset)
}
# A single random split can leave significant differences in variables of
# interest between the training and testing sets.
data(iris)
s44 <- splitdf(dataframe = iris, seed = 44)
train <- s44[["trainset"]]
test <- s44[["testset"]]
t.test(train$Sepal.Length, test$Sepal.Length)
# First, name the columns that should be "even" between the two sets.
cols <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length",
  "Petal.Width"
)
# Now split the dataset again, keeping those variables evenly distributed.
set.seed(80842)
evensplit <- splitdf.randomize(dataframe = iris, ttestcolnames = cols)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment