Implementation of the variable selection part of the HD-PS algorithm in R
hdps_screen <- function(outcome, treatment, covars, keep=0, indexes=TRUE) {
  #outcome and treatment are binary vectors of length n
  #covars is a matrix of binary variables of size n x p,
  # where p is the number of covariates in a particular data dimension
  #keep: number of covariates to keep
  #indexes: return indexes or a subset of covars
  #
  #returns indexes of kept covariates or a matrix of kept covars
  #indexes are sorted by abs(log(multiplicative bias))
  #if keep is 0 or greater than the number of covars for which multiplicative bias
  # could be calculated, all indexes are returned (sorted)
  #the matrix of kept covars is in the original column order
  #p_c1 - prevalence of each covar among the exposed
  #p_c0 - '' among the unexposed
  #rr_cd - "independent association between the [covar] and study outcome"
  if (!all(outcome %in% c(0, 1)))
    stop("outcome should be binary")
  if (!all(treatment %in% c(0, 1)))
    stop("treatment should be binary")
  if (!all(covars %in% c(0, 1)))
    stop("covars should be binary")
  if (keep == 0 && indexes == FALSE)
    stop("You want to keep all columns (keep=0) but want the covariates back, not the ordered indexes (indexes=FALSE). That doesn't make sense.")
  if (ncol(covars) < keep) {
    if (indexes == FALSE) {
      return(covars)
    } else {
      return(1:ncol(covars))
    }
  }
  treatment <- factor(treatment)
  covar_prev <- by(covars, treatment, colMeans)
  #for 0/1 treatment the second level ("1") is the exposed group
  p_c1 <- covar_prev[[levels(treatment)[2]]]
  p_c0 <- covar_prev[[levels(treatment)[1]]]
  calc_rr_cd <- function(outcome, covar) {
    #returns rr_cd if rr_cd > 1, else returns 1/rr_cd
    #might return Inf if there are 0 outcomes at a particular level of the covar
    prevs <- by(outcome, covar, mean)
    rr_cd <- prevs[1] / prevs[2]
    max(rr_cd, 1/rr_cd)
  }
  #a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise
  rr_cds <- apply(covars, 2, calc_rr_cd, outcome = outcome)
  #Infs in rr_cds will result in NaN for some covariates
  #multiplicative confounding bias (Bross-type formula used to rank covariates in hd-PS)
  bias_mult <- (p_c1 * (rr_cds - 1) + 1) /
               (p_c0 * (rr_cds - 1) + 1)
  abs_log_bias_mult <- abs(log(bias_mult))
  #throw out NaNs/NAs
  ordered_idxs <- order(abs_log_bias_mult, decreasing = TRUE, na.last = NA)
  ordered_idxs <- if (keep == 0) {
    ordered_idxs
  } else {
    #head() avoids trailing NAs when keep exceeds the number of covars
    # for which the bias could be calculated
    head(ordered_idxs, keep)
  }
  if (indexes) {
    ordered_idxs
  } else {
    covars[, sort(ordered_idxs), drop = FALSE]
  }
}
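
#--------------------------------------------------------------------------
#Usage sketch (not part of the original gist): simulated binary data, just to
# show the calling convention; sizes and coefficients below are made up.
set.seed(1)
n <- 500; p <- 20
covars    <- matrix(rbinom(n * p, 1, 0.3), n, p)
treatment <- rbinom(n, 1, plogis(-1 + covars %*% rep(0.2, p)))
outcome   <- rbinom(n, 1, plogis(-1 + 0.5 * treatment + covars %*% rep(0.1, p)))
top10_idx <- hdps_screen(outcome, treatment, covars, keep = 10)                  #indexes, sorted by bias
top10_cov <- hdps_screen(outcome, treatment, covars, keep = 10, indexes = FALSE) #columns, original order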
library(Rcpp)
#column variances via Welford's one-pass algorithm (numerically stable)
cppFunction('NumericVector colVars(NumericMatrix x) {
  int nrow = x.nrow(), ncol = x.ncol();
  NumericVector out(ncol);
  for (int j = 0; j < ncol; j++) {
    // running mean and sum of squared deviations for column j
    double mean = 0;
    double M2 = 0;
    int n = 0;
    double delta, xx;
    for (int i = 0; i < nrow; i++) {
      n = i + 1;
      xx = x(i, j);
      delta = xx - mean;
      mean += delta / n;
      M2 += delta * (xx - mean);
    }
    // sample variance (n - 1 denominator)
    out(j) = M2 / (n - 1);
  }
  return out;
}')
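
#Quick sanity check (illustrative, not from the original gist): colVars() should
# agree with R's column-wise var() up to numerical tolerance.
m <- matrix(rnorm(2000), 200, 10)
stopifnot(isTRUE(all.equal(colVars(m), apply(m, 2, var))))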
assess_recurrance <- function(x, top_k_var = 100) {
  #expands a matrix by replacing its columns with as.numeric(x > 0),
  # as.numeric(x >= median(x)), as.numeric(x >= quantile(x, prob = 0.75))
  #only unique columns (per original column) are kept
  #top_k_var: before expanding, keep at most this many columns with the highest variance
  assess_one_column <- function(x) {
    quants <- quantile(x, probs = c(0.5, 0.75), names = FALSE)
    quants <- unique(quants)
    quants <- Filter(function(q) !(q %in% c(0, 1)), quants)
    mat <- matrix(0, length(x), length(quants) + 1)
    mat[, 1] <- x > 0
    if (length(quants) > 0) {
      for (i in 1:length(quants)) {
        mat[, i + 1] <- x >= quants[i]
      }
    }
    mat
  }
  x <- as.matrix(x)
  if (ncol(x) > top_k_var) {
    #vars <- apply(x, 2, var)
    vars <- colVars(x)
    var_ords <- order(vars, decreasing = TRUE)
    x <- x[, sort(var_ords[1:top_k_var]), drop = FALSE]
  }
  mats <- lapply(1:ncol(x), function(i) assess_one_column(x[, i]))
  cnams <- colnames(x)
  if (!is.null(cnams)) {
    for (i in seq_along(mats)) {
      num <- ncol(mats[[i]])
      colnames(mats[[i]]) <- paste(cnams[i], c(1, 50, 75)[1:num], sep = "_")
    }
  }
  do.call(cbind, mats)
}
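
#--------------------------------------------------------------------------
#Pipeline sketch (illustrative, not from the original gist): expand simulated
# code counts with assess_recurrance(), then screen the resulting binary
# indicators with hdps_screen(). Data and names below are made up.
set.seed(2)
n <- 500
counts <- matrix(as.numeric(rpois(n * 30, 0.7)), n, 30,
                 dimnames = list(NULL, paste0("code", 1:30)))
treatment <- rbinom(n, 1, 0.4)
outcome   <- rbinom(n, 1, 0.2)
expanded  <- assess_recurrance(counts, top_k_var = 20)
screened  <- hdps_screen(outcome, treatment, expanded, keep = 10, indexes = FALSE)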