Last active
August 29, 2015 14:01
-
-
Save lendle/735d80084b498489f894 to your computer and use it in GitHub Desktop.
Implementation of the variable selection part of the HD-PS algorithm in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hdps_screen <- function(outcome, treatment, covars, keep=0, indexes=TRUE) { | |
#outcome and treatment are binary vectors of length n | |
#covars is a matrix of binary variabesl of size n x p, | |
# where p is the number of covariates for in a paritular data dimension | |
#keep: number of covariates to keep | |
# indexes: return indexes or subest of covars | |
# | |
#returns indexes of kept covariates or matrix of kept covars | |
#indexes are sorted by abs(log(multiplicitive bias)) | |
#if keep is 0 or greatner than the number of covars for which multiplicitive bias | |
# could be calculated, all indexes are returned(sorted) | |
# matrix of kept covars are in the original column order | |
#p_c1 - prev of each covar among exposed | |
#p_c0 - '' among unexposed | |
#rr_cd - "independent association between the [covar] and study outcome" | |
if (!all(outcome %in% c(0,1))) | |
stop("outcome should be binary") | |
if (!all(treatment %in% c(0,1))) | |
stop("treatment should be binary") | |
if (!all(covars %in% c(0,1))) | |
stop("covars should be binary") | |
if (keep==0 && indexes==FALSE) | |
stop("You want to keep all columns (keep=0) but want the covariates back, not the ordered indexes (indexes=FALSE). That doesn't make sense.") | |
if (ncol(covars) < keep) { | |
if (indexes==FALSE) { | |
return(covars) | |
} else { | |
return(1:nrow(covars)) | |
} | |
} | |
treatment <- factor(treatment) | |
covar_prev <- by(covars, treatment, colMeans) | |
p_c1 <- covar_prev[[levels(treatment)[1]]] | |
p_c0 <- covar_prev[[levels(treatment)[2]]] | |
calc_rr_cd <- function(outcome, covar) { | |
#returns rr_cd if rr_cd > 1, else returns 1/rr_cd | |
#might return Inf if there are 0 outcomes at a particular level of the covar | |
prevs <- by(outcome, covar, mean) | |
rr_cd <- (prevs[1])/(prevs[2]) | |
max(rr_cd, 1/rr_cd) | |
} | |
#a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise | |
rr_cds <- apply(covars, 2, calc_rr_cd, outcome=outcome) | |
#Infs in rr_cds will result in NaN for some covariates | |
bias_mult <- (p_c1 * (rr_cds - 1) + 1) / | |
(p_c0 * (rr_cds - 1) + 1) | |
abs_log_bias_mult <- abs(log(bias_mult)) | |
#throw out NaNs/NAs | |
ordered_idxs <- order(abs_log_bias_mult, decreasing=TRUE, na.last = NA) | |
ordered_idxs <- if (keep == 0) { | |
ordered_idxs | |
} else { | |
ordered_idxs[1:keep] | |
} | |
if (indexes) { | |
ordered_idxs | |
} else { | |
covars[, sort(ordered_idxs)] | |
} | |
} | |
library(Rcpp) | |
cppFunction('NumericVector colVars(NumericMatrix x) { | |
int nrow = x.nrow(), ncol = x.ncol(); | |
NumericVector out(ncol); | |
for (int j = 0; j < ncol; j++) { | |
double mean = 0; | |
double M2 = 0; | |
int n; | |
double delta, xx; | |
for (int i = 0; i < nrow; i++) { | |
n = i+1; | |
xx = x(i,j); | |
delta = xx - mean; | |
mean += delta/n; | |
M2 = M2 + delta*(xx-mean); | |
} | |
out(j) = M2/(n-1); | |
} | |
return out; | |
}') | |
assess_recurrance <- function(x, top_k_var = 100) { | |
#expands a matrix by replacing it's columns with as.numeric(x > 0), | |
# as.numeric(x > median(x)), as.numeric(x > quantile(x, prob=0.75)) | |
#only unique columns (per original column) are kept | |
#top_k_var: before expanding, keep at most this many columns with the highest variance | |
assess_one_column <- function(x) { | |
quants <- quantile(x, probs=c(0.5, 0.75), names=FALSE) | |
quants <- unique(quants) | |
quants <- Filter(function(q) !(q %in% c(0, 1)), quants) | |
mat <- matrix(0, length(x), length(quants) + 1) | |
mat[, 1] <- x > 0 | |
if (length(quants) > 0) { | |
for (i in 1:length(quants)) { | |
mat[, i+1] <- x >= quants[i] | |
} | |
} | |
mat | |
} | |
x <- as.matrix(x) | |
if (ncol(x) > top_k_var) { | |
#vars <- apply(x, 2, var) | |
vars <- colVars(x) | |
var_ords <- order(vars, decreasing=TRUE) | |
x <- x[, sort(var_ords[1:top_k_var])] | |
} | |
mats <- lapply(1:ncol(x), function(i) assess_one_column(x[,i])) | |
cnams <- colnames(x) | |
if (!is.null(cnams)) { | |
for (i in seq_along(mats)) { | |
num <- ncol(mats[[i]]) | |
colnames(mats[[i]]) <- paste(cnams[i], c(1, 50, 75)[1:num], sep="_") | |
} | |
} | |
do.call(cbind, mats) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment