Skip to content

Instantly share code, notes, and snippets.

@mattsigal
Last active August 29, 2015 14:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattsigal/c17650d8a9b0f5b018af to your computer and use it in GitHub Desktop.
Save mattsigal/c17650d8a9b0f5b018af to your computer and use it in GitHub Desktop.
Create Stratified Norm Sample from Strata and Count Vector
#' Function to draw observations from stratified norm sample
#'
#' This function creates a stratified random sample, given an arbitrary number of
#' factors and levels within those factors.
#'
#' @param dat data.frame object
#' @param strata character; a named vector indicating the strata variables
#' @param observations numeric; number of cases to sample from each strata
#' @param return.grid logical; return grid of factor combintions?
#' @param full.data logical; return the original dataset with a new logical
#' variable (named from 'full.data.id') indicating whether the row was selected
#' @param full.data.id; a string pertaining the variable name used when 'full.data=TRUE'
#'
#' @example
#'
#' dat <- data.frame(Gender=sample(c("Male", "Female"), size = 1500, replace = TRUE),
#' AgeGroup=sample(c("18-39", "40-49", "50+"), size = 1500, replace = TRUE),
#' Relationship=sample(c("Direct", "Manager", "Coworker", "Friend"), size = 1500, replace = TRUE),
#' X=rnorm(n=1500, mean=0, sd=1),
#' Y=rnorm(n=1500, mean=0, sd=1),
#' Z=rnorm(n=1500, mean=0, sd=1))
#'
#' strata = c("Gender", "AgeGroup", "Relationship")
#'
#' stratifiedNorm(dat, strata, return.grid=TRUE)
#' test <- stratifiedNorm(dat, strata, counts = 5, return.grid=FALSE)
#' head(test)
#'
#' stratifiedNorm(dat, strata, counts = 5, full.data=TRUE)
#'
stratifiedNorm <- function(dat, strata, observations=0, return.grid=FALSE, full.data=FALSE, full.data.id = "sampled"){
if (class(dat) != "data.frame")
stop("Data input is not a dataframe.")
if (!all(strata %in% names(dat)))
stop("Please verify your 'strata' arguments exist in dataframe.")
if (class(observations) != "numeric")
stop("Please ensure the observations vector is numeric.")
nstrata <- length(strata)
svars <- dat[,strata]
factors <- list()
dat <- cbind(InTeRnAlIdVaRs=as.numeric(rownames(dat)), dat)
for (i in 1:nstrata){
if (class(svars[,i]) != "factor"){
stop("Strata variable is not a factor.")
}
factors[[i]] <- levels(svars[,i])
}
names(factors) <- strata
remove <- expand.grid(factors)
if(length(observations) == 1)
observations <- rep(observations, nrow(remove))
if(length(observations) < nrow(remove))
stop("Observations vector does not have enough elements.")
if(length(observations) > nrow(remove))
stop("Observations vector has too many elements.")
if(return.grid) return(cbind(remove, Observations = observations))
ret <- lapply(1:nrow(remove), function(ind, dat, remove, count){
pick <- matrix(FALSE, nrow(dat), ncol(remove))
names <- colnames(remove)
for(i in 1L:ncol(pick))
pick[,i] <- dat[,names[i]] == remove[ind,names[i]]
pick <- rowSums(pick) == ncol(remove)
tmpsvars <- dat[pick,]
if(nrow(tmpsvars) > count[ind]){
return(tmpsvars[sample(1:nrow(tmpsvars), count[ind]), ])
}
if(nrow(tmpsvars) == count[ind]){
message(sprintf('Combination for (%s) is equal to count. Returning all observations.',
paste0(as.matrix(remove[ind,]), collapse = '|')))
return(tmpsvars)
} else {
message(sprintf('Combination for (%s) has LESS than count. Returning all observations.',
paste0(as.matrix(remove[ind,]), collapse = '|')))
return(tmpsvars)
}
}, dat=dat, remove=remove, count=observations)
newdat <- do.call(rbind, ret)
IDINT <- dat$InTeRnAlIdVaRs %in% newdat$InTeRnAlIdVaRs
dat$InTeRnAlIdVaRs <- newdat$InTeRnAlIdVaRs <- NULL
if(full.data == FALSE){
return(newdat)
} else {
if(full.data.id %in% colnames(dat)){
warning('full.data.id name exists in original data. Placing id name in first column with
a unique name')
dat <- cbind(IDINT, dat)
} else {
dat[[full.data.id]] <- IDINT
}
return(dat)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment