public
Last active

Categorical data to indicator matrix to log odds ratios

  • Download Gist
Binarize_categories.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
# Starting with categorical data, ending with a table of log odds ratios
 
# Package setup ----
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
toInstall <- c("plyr", "reshape2")
if (doInstall) {
  # Install only the packages that cannot be loaded yet, so re-running the
  # script does not download and reinstall everything each time.
  isMissing <- !vapply(
    toInstall, requireNamespace, logical(1),
    quietly = TRUE
  )
  if (any(isMissing)) {
    install.packages(
      toInstall[isMissing],
      repos = "https://cloud.r-project.org"
    )
  }
}
# Attach each package by name; library() stops with an error if one is
# unavailable. invisible() hides the list that lapply() would otherwise print.
invisible(lapply(toInstall, library, character.only = TRUE))
 
# Canonical example of categorical data ----
# melt() flattens the HairEyeColor contingency table into one row per cell:
# one label column per table dimension plus the cell count in `value`.
HEC <- melt(HairEyeColor)
# Repeat every cell's row `value` times so that each observation gets its own
# row, then drop the count column. seq_len() (rather than 1:nrow) stays
# correct even for a zero-row table, and the count column is addressed by
# name instead of by position.
HEC <- HEC[
  rep(seq_len(nrow(HEC)), times = HEC[["value"]]),
  names(HEC) != "value"
]
colnames(HEC) <- c("Hair", "Eye", "Gender")
head(HEC) # This df has a row for each observation
 
# Convert data frame of factors to matrix of indicator variables ----
# contrasts(x, contrasts = FALSE) yields a full identity coding, i.e. one 0/1
# column per factor level with no reference level dropped.
# lapply() is used instead of sapply() because contrasts.arg must be a named
# list: sapply() would simplify the per-column matrices into a single matrix
# whenever all factors happen to have the same number of levels.
indicatorMatrix <- model.matrix(
  ~ .,
  data = HEC,
  contrasts.arg = lapply(HEC, contrasts, contrasts = FALSE)
)[, -1] # Drop the intercept column
# (from http://stackoverflow.com/a/4569239/479554)
head(indicatorMatrix)
 
# Build a table of log odds ratios between every pair of categories ----
# Each matrix below is a category-by-category table of observation counts.
# crossprod(A, B) computes t(A) %*% B; `!indicatorMatrix` flags the
# observations that do NOT belong to a category.
TT <- crossprod(indicatorMatrix)                    # In both categories
TF <- crossprod(indicatorMatrix, !indicatorMatrix)  # In the row one only
FT <- crossprod(!indicatorMatrix, indicatorMatrix)  # In the column one only
FF <- crossprod(!indicatorMatrix)                   # In neither category

# Ratio of the odds TT:TF to the odds FT:FF, then its natural logarithm.
oddsRatios <- (TT / TF) / (FT / FF)
logOddsRatios <- log(oddsRatios)

# List each pair of categories once, sorted by log odds ratio.
# upper.tri() marks the cells above the diagonal; flattened column by column
# it lines up with the rows that melt() produces for the same matrix.
arrange(
  melt(logOddsRatios)[as.vector(upper.tri(logOddsRatios)), ],
  value
)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.