baogorek/regression-coding.R

## regression-coding.R
library(dplyr)
library(tidyr)


sim_df <- bind_rows(
    tibble(group='A', y=rnorm(100, mean=33)),
    tibble(group='B', y=rnorm(10000, mean=30)),
    tibble(group='C', y=rnorm(100, mean=33.1))
)

sim_df$group <- as.character(sim_df

# deviation, or sum coding, compares each cell mean to the *unweighted* mean of the cell means

lm_deviation <- lm(y ~ group, data = sim_df, contrasts = list(group = "contr.sum"))
# prediction, A and C will be significant, because B dominates the mean. B will not be
summary(lm_deviation)
# Intercept is estimating (33 + 30 + 33.1) / 3 = 32.03
# Group 1, or "A" is that mean above plus 1
# Group 2, or "B" is that mean above minus 2
# What about "C"? I'd have to work it out from the Intercept and Group 1 and Group 2 numbers?
# That's kind of frustrating

c_contrast <- c(0, -1, -1)

(c_est <- c_contrast %*% coef(lm_deviation))
(c_se <- sqrt(c_contrast %*% vcov(lm_deviation) %*% c_contrast))

sim_df2 <- sim_df
sim_df2$group <- factor(sim_df2$group, levels=c("C", "B", "A"))

# C: est and se
0.89862    0.07382

lm_deviation2 <- lm(y ~ group, data = sim_df2, contrasts = list(group = "contr.sum"))
summary(lm_deviation2)

# So those three contrasts wouldn't be linearly independent, but you can still make
# decisions based on each group. I hate how I lose the labels.

my_lm <- lm(y ~ group, data = sim_df)
# predition, B will be sigficantly different than A, C will not
summary(my_lm)
	library(dplyr)
	library(tidyr)


	sim_df <- bind_rows(
	tibble(group='A', y=rnorm(100, mean=33)),
	tibble(group='B', y=rnorm(10000, mean=30)),
	tibble(group='C', y=rnorm(100, mean=33.1))
	)

	sim_df$group <- as.character(sim_df

	# deviation, or sum coding, compares each cell mean to the unweighted mean of the cell means

	lm_deviation <- lm(y ~ group, data = sim_df, contrasts = list(group = "contr.sum"))
	# prediction, A and C will be significant, because B dominates the mean. B will not be
	summary(lm_deviation)
	# Intercept is estimating (33 + 30 + 33.1) / 3 = 32.03
	# Group 1, or "A" is that mean above plus 1
	# Group 2, or "B" is that mean above minus 2
	# What about "C"? I'd have to work it out from the Intercept and Group 1 and Group 2 numbers?
	# That's kind of frustrating

	c_contrast <- c(0, -1, -1)

	(c_est <- c_contrast %*% coef(lm_deviation))
	(c_se <- sqrt(c_contrast %% vcov(lm_deviation) %% c_contrast))

	sim_df2 <- sim_df
	sim_df2$group <- factor(sim_df2$group, levels=c("C", "B", "A"))

	# C: est and se
	0.89862 0.07382

	lm_deviation2 <- lm(y ~ group, data = sim_df2, contrasts = list(group = "contr.sum"))
	summary(lm_deviation2)

	# So those three contrasts wouldn't be linearly independent, but you can still make
	# decisions based on each group. I hate how I lose the labels.

	my_lm <- lm(y ~ group, data = sim_df)
	# predition, B will be sigficantly different than A, C will not
	summary(my_lm)