# stephenturner/splitdf.randomize.r

## splitdf.randomize.r
# splitdf splits a data frame into a training and a testing set.
#
# Arguments:
#   dataframe: data frame whose rows are divided between the two sets.
#   seed:      optional value handed to set.seed() before sampling so the
#              split is reproducible; NULL (default) leaves the RNG untouched.
#   trainfrac: fraction of rows for the training set; must lie strictly
#              between 0 and 1. The training set gets
#              trunc(nrow / (1 / trainfrac)) rows, the testing set the rest.
#
# Returns a list of two data frames: trainset and testset. Testing rows keep
# their original order; training rows are in sampled order.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	if (trainfrac<=0 || trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
	if (!is.null(seed)) set.seed(seed)
	# seq_len() rather than 1:n, so a zero-row data frame yields no indices
	# instead of c(1, 0).
	index <- seq_len(nrow(dataframe))
	trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
	# The testing rows are picked with a logical mask: negative indexing with an
	# empty trainindex would select nothing and silently lose every row.
	intrain <- index %in% trainindex
	# drop=FALSE keeps one-column inputs as data frames instead of vectors.
	trainset <- dataframe[trainindex, , drop=FALSE]
	testset <- dataframe[!intrain, , drop=FALSE]
	list(trainset=trainset,testset=testset)
}

# splitdf.randomize keeps re-splitting a data frame with splitdf() until the
# chosen columns look evenly distributed between the two sets.
#
# Arguments:
#   dataframe:     data frame to split.
#   ttestcolnames: character vector of column names to balance. Each column
#                  is compared between the sets with t.test(), so the columns
#                  must be continuous variables. chi2 not yet implemented.
#   ...:           further arguments forwarded to splitdf() (e.g. trainfrac).
#                  A named seed is applied once, before the first split, and
#                  is not forwarded: re-seeding on every attempt would repeat
#                  the same split forever whenever that split is rejected.
#
# A split is accepted when no t-test p-value is below 0.5. The p-values of
# every attempt are printed. There is no cap on the number of attempts.
#
# Returns a list of two data frames: trainset and testset.
splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ...) {
	d <- dataframe
	ttestcolnames <- as.character(ttestcolnames)
	# Name only the columns that are actually missing, in a single message.
	missingcols <- setdiff(ttestcolnames, names(d))
	if (length(missingcols) > 0) stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
	splitargs <- list(...)
	isseed <- names(splitargs) %in% "seed"
	if (any(isseed)) {
		seedvalue <- splitargs[[which(isseed)[1]]]
		if (!is.null(seedvalue)) set.seed(seedvalue)
		splitargs <- splitargs[!isseed]
	}
	repeat {
		sets <- do.call(splitdf, c(list(d), splitargs))
		trainset <- sets$trainset
		testset <- sets$testset
		# Look columns up by name, in the order given, so each printed label
		# is paired with its own p-value.
		ps <- vapply(ttestcolnames, function(colname) {
			t.test(trainset[[colname]], testset[[colname]])$p.value
		}, numeric(1), USE.NAMES=FALSE)
		print(paste(ttestcolnames," t-test p-value =",ps))
		cat("\n")
		if (!any(ps<.5)) break
	}
	list(trainset=trainset,testset=testset)
}

# Example usage on the built-in iris data set.
# sometimes you might have significant differences in variables of interest
# between training and testing sets.
data(iris)
# split with a fixed seed so the example is reproducible
s44 <- splitdf(iris, seed=44)
train <- s44$trainset
test <- s44$testset
# compare Sepal.Length between the two sets with a two-sample t-test
t.test(train$Sepal.Length, test$Sepal.Length)

#first, specify which columns you want to ensure are "even" between the sets
cols <- c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")

#Now, split up the dataset again, keeping even distribution of those variables.
# the RNG is seeded here, once, before the repeated random splits begin
set.seed(80842)
evensplit <- splitdf.randomize(iris,cols)
	# splitdf splits a data frame into a training and a testing set.
	#
	# Arguments:
	#   dataframe: data frame whose rows are divided between the two sets.
	#   seed:      optional value handed to set.seed() before sampling so the
	#              split is reproducible; NULL (default) leaves the RNG untouched.
	#   trainfrac: fraction of rows for the training set; must lie strictly
	#              between 0 and 1. The training set gets
	#              trunc(nrow / (1 / trainfrac)) rows, the testing set the rest.
	#
	# Returns a list of two data frames: trainset and testset. Testing rows keep
	# their original order; training rows are in sampled order.
	splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
		if (trainfrac<=0 || trainfrac>=1) stop("Training fraction must be between 0 and 1, not inclusive")
		if (!is.null(seed)) set.seed(seed)
		# seq_len() rather than 1:n, so a zero-row data frame yields no indices
		# instead of c(1, 0).
		index <- seq_len(nrow(dataframe))
		trainindex <- sample(index, trunc(length(index)/(1/trainfrac)))
		# The testing rows are picked with a logical mask: negative indexing with
		# an empty trainindex would select nothing and silently lose every row.
		intrain <- index %in% trainindex
		# drop=FALSE keeps one-column inputs as data frames instead of vectors.
		trainset <- dataframe[trainindex, , drop=FALSE]
		testset <- dataframe[!intrain, , drop=FALSE]
		list(trainset=trainset,testset=testset)
	}

	# splitdf.randomize keeps re-splitting a data frame with splitdf() until the
	# chosen columns look evenly distributed between the two sets.
	#
	# Arguments:
	#   dataframe:     data frame to split.
	#   ttestcolnames: character vector of column names to balance. Each column
	#                  is compared between the sets with t.test(), so the columns
	#                  must be continuous variables. chi2 not yet implemented.
	#   ...:           further arguments forwarded to splitdf() (e.g. trainfrac).
	#                  A named seed is applied once, before the first split, and
	#                  is not forwarded: re-seeding on every attempt would repeat
	#                  the same split forever whenever that split is rejected.
	#
	# A split is accepted when no t-test p-value is below 0.5. The p-values of
	# every attempt are printed. There is no cap on the number of attempts.
	#
	# Returns a list of two data frames: trainset and testset.
	splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ...) {
		d <- dataframe
		ttestcolnames <- as.character(ttestcolnames)
		# Name only the columns that are actually missing, in a single message.
		missingcols <- setdiff(ttestcolnames, names(d))
		if (length(missingcols) > 0) stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
		splitargs <- list(...)
		isseed <- names(splitargs) %in% "seed"
		if (any(isseed)) {
			seedvalue <- splitargs[[which(isseed)[1]]]
			if (!is.null(seedvalue)) set.seed(seedvalue)
			splitargs <- splitargs[!isseed]
		}
		repeat {
			sets <- do.call(splitdf, c(list(d), splitargs))
			trainset <- sets$trainset
			testset <- sets$testset
			# Look columns up by name, in the order given, so each printed label
			# is paired with its own p-value.
			ps <- vapply(ttestcolnames, function(colname) {
				t.test(trainset[[colname]], testset[[colname]])$p.value
			}, numeric(1), USE.NAMES=FALSE)
			print(paste(ttestcolnames," t-test p-value =",ps))
			cat("\n")
			if (!any(ps<.5)) break
		}
		list(trainset=trainset,testset=testset)
	}

	# Example usage on the built-in iris data set.
	# sometimes you might have significant differences in variables of interest
	# between training and testing sets.
	data(iris)
	# split with a fixed seed so the example is reproducible
	s44 <- splitdf(iris, seed=44)
	train <- s44$trainset
	test <- s44$testset
	# compare Sepal.Length between the two sets with a two-sample t-test
	t.test(train$Sepal.Length, test$Sepal.Length)

	#first, specify which columns you want to ensure are "even" between the sets
	cols <- c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")

	#Now, split up the dataset again, keeping even distribution of those variables.
	# the RNG is seeded here, once, before the repeated random splits begin
	set.seed(80842)
	evensplit <- splitdf.randomize(iris,cols)