lendle/hdps.R

## hdps.R
hdps_screen <- function(outcome, treatment, covars, keep=0, indexes=TRUE) {
  #outcome and treatment are binary vectors of length n
  #covars is a matrix of binary variabesl of size n x p,
  # where p is the number of covariates for in a paritular data dimension
  #keep: number of covariates to keep
  # indexes: return indexes or subest of covars
  #
  #returns indexes of kept covariates or matrix of kept covars
  #indexes are sorted by abs(log(multiplicitive bias))
  #if keep is 0 or greatner than the number of covars for which multiplicitive bias
  # could be calculated, all indexes are returned(sorted)
  # matrix of kept covars are in the original column order

  #p_c1 - prev of each covar among exposed
  #p_c0 - '' among unexposed
  #rr_cd - "independent association between the [covar] and study outcome"

  if (!all(outcome %in% c(0,1)))
    stop("outcome should be binary")
  if (!all(treatment %in% c(0,1)))
    stop("treatment should be binary")
  if (!all(covars %in% c(0,1)))
    stop("covars should be binary")

  if (keep==0 && indexes==FALSE)
    stop("You want to keep all columns (keep=0) but want the covariates back, not the ordered indexes (indexes=FALSE). That doesn't make sense.")

  if (ncol(covars) < keep) {
    if (indexes==FALSE) {
      return(covars)
    } else {
     return(1:nrow(covars))
    }
  }

  treatment <- factor(treatment)

  covar_prev <- by(covars, treatment, colMeans)
  p_c1 <- covar_prev[[levels(treatment)[1]]]
  p_c0 <- covar_prev[[levels(treatment)[2]]]

  calc_rr_cd <- function(outcome, covar) {
    #returns rr_cd if rr_cd > 1, else returns 1/rr_cd
    #might return Inf if there are 0 outcomes at a particular level of the covar
    prevs <- by(outcome, covar, mean)
    rr_cd <- (prevs[1])/(prevs[2])
    max(rr_cd, 1/rr_cd)
  }

  #a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise
  rr_cds <- apply(covars, 2, calc_rr_cd, outcome=outcome)

  #Infs in rr_cds will result in NaN for some covariates
  bias_mult <- (p_c1 * (rr_cds - 1) + 1) /
               (p_c0 * (rr_cds - 1) + 1)

  abs_log_bias_mult <- abs(log(bias_mult))


  #throw out NaNs/NAs
  ordered_idxs <- order(abs_log_bias_mult, decreasing=TRUE, na.last = NA)

  ordered_idxs <- if (keep == 0) {
    ordered_idxs
  } else {
    ordered_idxs[1:keep]
  }


  if (indexes) {
    ordered_idxs
  } else {
    covars[, sort(ordered_idxs)]
  }


}

library(Rcpp)
cppFunction('NumericVector colVars(NumericMatrix x) {
      int nrow = x.nrow(), ncol = x.ncol();
      NumericVector out(ncol);

      for (int j = 0; j < ncol; j++) {
        double mean = 0;
        double M2 = 0;
        int n;
        double delta, xx;

        for (int i = 0; i < nrow; i++) {
          n = i+1;
          xx = x(i,j);
          delta = xx - mean;
          mean += delta/n;
          M2 = M2 + delta*(xx-mean);
        }

        out(j) = M2/(n-1);
      }

      return out;
}')

assess_recurrance <- function(x, top_k_var = 100) {
  #expands a matrix by replacing it's columns with as.numeric(x > 0),
  # as.numeric(x > median(x)), as.numeric(x > quantile(x, prob=0.75))
  #only unique columns (per original column) are kept
  #top_k_var: before expanding, keep at most this many columns with the highest variance

  assess_one_column <- function(x) {
    quants <- quantile(x, probs=c(0.5, 0.75), names=FALSE)
    quants <- unique(quants)
    quants <- Filter(function(q) !(q %in% c(0, 1)), quants)

    mat <- matrix(0, length(x), length(quants) + 1)

    mat[, 1] <- x > 0
    if (length(quants) > 0) {
      for (i in 1:length(quants)) {
        mat[, i+1] <- x >= quants[i]
      }
    }
    mat
  }

  x <- as.matrix(x)

  if (ncol(x) > top_k_var) {
    #vars <- apply(x, 2, var)
    vars <- colVars(x)
    var_ords <- order(vars, decreasing=TRUE)
    x <- x[, sort(var_ords[1:top_k_var])]
  }

  mats <- lapply(1:ncol(x), function(i) assess_one_column(x[,i]))

  cnams <- colnames(x)
  if (!is.null(cnams)) {
    for (i in seq_along(mats)) {
      num <- ncol(mats[[i]])
      colnames(mats[[i]]) <- paste(cnams[i], c(1, 50, 75)[1:num], sep="_")

    }
  }
  do.call(cbind, mats)
}
	hdps_screen <- function(outcome, treatment, covars, keep=0, indexes=TRUE) {
	#outcome and treatment are binary vectors of length n
	#covars is a matrix of binary variabesl of size n x p,
	# where p is the number of covariates for in a paritular data dimension
	#keep: number of covariates to keep
	# indexes: return indexes or subest of covars
	#
	#returns indexes of kept covariates or matrix of kept covars
	#indexes are sorted by abs(log(multiplicitive bias))
	#if keep is 0 or greatner than the number of covars for which multiplicitive bias
	# could be calculated, all indexes are returned(sorted)
	# matrix of kept covars are in the original column order

	#p_c1 - prev of each covar among exposed
	#p_c0 - '' among unexposed
	#rr_cd - "independent association between the [covar] and study outcome"

	if (!all(outcome %in% c(0,1)))
	stop("outcome should be binary")
	if (!all(treatment %in% c(0,1)))
	stop("treatment should be binary")
	if (!all(covars %in% c(0,1)))
	stop("covars should be binary")

	if (keep==0 && indexes==FALSE)
	stop("You want to keep all columns (keep=0) but want the covariates back, not the ordered indexes (indexes=FALSE). That doesn't make sense.")

	if (ncol(covars) < keep) {
	if (indexes==FALSE) {
	return(covars)
	} else {
	return(1:nrow(covars))
	}
	}

	treatment <- factor(treatment)

	covar_prev <- by(covars, treatment, colMeans)
	p_c1 <- covar_prev[[levels(treatment)[1]]]
	p_c0 <- covar_prev[[levels(treatment)[2]]]

	calc_rr_cd <- function(outcome, covar) {
	#returns rr_cd if rr_cd > 1, else returns 1/rr_cd
	#might return Inf if there are 0 outcomes at a particular level of the covar
	prevs <- by(outcome, covar, mean)
	rr_cd <- (prevs[1])/(prevs[2])
	max(rr_cd, 1/rr_cd)
	}

	#a vector of rr_cd if rr_cd > 1, 1/rr_cd otherwise
	rr_cds <- apply(covars, 2, calc_rr_cd, outcome=outcome)

	#Infs in rr_cds will result in NaN for some covariates
	bias_mult <- (p_c1 * (rr_cds - 1) + 1) /
	(p_c0 * (rr_cds - 1) + 1)

	abs_log_bias_mult <- abs(log(bias_mult))


	#throw out NaNs/NAs
	ordered_idxs <- order(abs_log_bias_mult, decreasing=TRUE, na.last = NA)

	ordered_idxs <- if (keep == 0) {
	ordered_idxs
	} else {
	ordered_idxs[1:keep]
	}


	if (indexes) {
	ordered_idxs
	} else {
	covars[, sort(ordered_idxs)]
	}


	}

	library(Rcpp)
	cppFunction('NumericVector colVars(NumericMatrix x) {
	int nrow = x.nrow(), ncol = x.ncol();
	NumericVector out(ncol);

	for (int j = 0; j < ncol; j++) {
	double mean = 0;
	double M2 = 0;
	int n;
	double delta, xx;

	for (int i = 0; i < nrow; i++) {
	n = i+1;
	xx = x(i,j);
	delta = xx - mean;
	mean += delta/n;
	M2 = M2 + delta*(xx-mean);
	}

	out(j) = M2/(n-1);
	}

	return out;
	}')

	assess_recurrance <- function(x, top_k_var = 100) {
	#expands a matrix by replacing it's columns with as.numeric(x > 0),
	# as.numeric(x > median(x)), as.numeric(x > quantile(x, prob=0.75))
	#only unique columns (per original column) are kept
	#top_k_var: before expanding, keep at most this many columns with the highest variance

	assess_one_column <- function(x) {
	quants <- quantile(x, probs=c(0.5, 0.75), names=FALSE)
	quants <- unique(quants)
	quants <- Filter(function(q) !(q %in% c(0, 1)), quants)

	mat <- matrix(0, length(x), length(quants) + 1)

	mat[, 1] <- x > 0
	if (length(quants) > 0) {
	for (i in 1:length(quants)) {
	mat[, i+1] <- x >= quants[i]
	}
	}
	mat
	}

	x <- as.matrix(x)

	if (ncol(x) > top_k_var) {
	#vars <- apply(x, 2, var)
	vars <- colVars(x)
	var_ords <- order(vars, decreasing=TRUE)
	x <- x[, sort(var_ords[1:top_k_var])]
	}

	mats <- lapply(1:ncol(x), function(i) assess_one_column(x[,i]))

	cnams <- colnames(x)
	if (!is.null(cnams)) {
	for (i in seq_along(mats)) {
	num <- ncol(mats[[i]])
	colnames(mats[[i]]) <- paste(cnams[i], c(1, 50, 75)[1:num], sep="_")

	}
	}
	do.call(cbind, mats)
	}