Created
February 10, 2022 16:09
-
-
Save baogorek/9cf62f1700737246aebb0e3ed35a095a to your computer and use it in GitHub Desktop.
Showing deviation coding for R's lm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(tidyr) | |
sim_df <- bind_rows( | |
tibble(group='A', y=rnorm(100, mean=33)), | |
tibble(group='B', y=rnorm(10000, mean=30)), | |
tibble(group='C', y=rnorm(100, mean=33.1)) | |
) | |
sim_df$group <- as.character(sim_df | |
# deviation, or sum coding, compares each cell mean to the *unweighted* mean of the cell means | |
lm_deviation <- lm(y ~ group, data = sim_df, contrasts = list(group = "contr.sum")) | |
# prediction, A and C will be significant, because B dominates the mean. B will not be | |
summary(lm_deviation) | |
# Intercept is estimating (33 + 30 + 33.1) / 3 = 32.03 | |
# Group 1, or "A" is that mean above plus 1 | |
# Group 2, or "B" is that mean above minus 2 | |
# What about "C"? I'd have to work it out from the Intercept and Group 1 and Group 2 numbers? | |
# That's kind of frustrating | |
c_contrast <- c(0, -1, -1) | |
(c_est <- c_contrast %*% coef(lm_deviation)) | |
(c_se <- sqrt(c_contrast %*% vcov(lm_deviation) %*% c_contrast)) | |
sim_df2 <- sim_df | |
sim_df2$group <- factor(sim_df2$group, levels=c("C", "B", "A")) | |
# C: est and se | |
0.89862 0.07382 | |
lm_deviation2 <- lm(y ~ group, data = sim_df2, contrasts = list(group = "contr.sum")) | |
summary(lm_deviation2) | |
# So those three contrasts wouldn't be linearly independent, but you can still make | |
# decisions based on each group. I hate how I lose the labels. | |
my_lm <- lm(y ~ group, data = sim_df) | |
# predition, B will be sigficantly different than A, C will not | |
summary(my_lm) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment