# mmparker/reg_group_and_group_var.r

## reg_group_and_group_var.r
# Simulate 100 purchase indicators (0/1) for Channel A, drawn with
# success probability 0.1, then attach the observed purchase rate
# (purchases / observations) to every row
dat.a <- data.frame(
  purchase = rbinom(100, size = 1, prob = 0.1),
  channel = "a"
)
dat.a[["rate"]] <- with(dat.a, sum(purchase) / length(purchase))

# Same again for Channel B, with success probability 0.2
dat.b <- data.frame(
  purchase = rbinom(100, size = 1, prob = 0.2),
  channel = "b"
)
dat.b[["rate"]] <- with(dat.b, sum(purchase) / length(purchase))

# Stack the two channels into a single dataset
dat <- rbind(dat.a, dat.b)

# Fit purchase on channel alone, on rate alone, and on both together.
# No family is passed, so glm() falls back to its default family.
mchannel <- glm(formula = purchase ~ channel, data = dat)
mrate <- glm(formula = purchase ~ rate, data = dat)
mboth <- glm(formula = purchase ~ channel + rate, data = dat)

# Predictions from the three models, side by side. They all agree,
# but R may warn that "prediction from a rank-deficient fit may be misleading"
data.frame(
  mchannel = predict(mchannel, newdata = dat),
  mrate = predict(mrate, newdata = dat),
  mboth = predict(mboth, newdata = dat)
)

# That warning arises because rate is constant within each channel:
# once channel is in the model, rate carries zero new information,
# so R does not estimate a coefficient for it
summary(mboth)
	# Simulate 100 purchase indicators (0/1) for Channel A, drawn with
	# success probability 0.1, then attach the observed purchase rate
	# (purchases / observations) to every row.
	# New random draws are taken here, so dat.a is (re)assigned from scratch.
	dat.a <- data.frame(
	  purchase = rbinom(100, size = 1, prob = 0.1),
	  channel = "a"
	)
	dat.a[["rate"]] <- with(dat.a, sum(purchase) / length(purchase))

	# Same again for Channel B, with success probability 0.2
	dat.b <- data.frame(
	  purchase = rbinom(100, size = 1, prob = 0.2),
	  channel = "b"
	)
	dat.b[["rate"]] <- with(dat.b, sum(purchase) / length(purchase))

	# Stack the two channels into a single dataset
	dat <- rbind(dat.a, dat.b)

	# Fit purchase on channel alone, on rate alone, and on both together.
	# No family is passed, so glm() falls back to its default family.
	mchannel <- glm(formula = purchase ~ channel, data = dat)
	mrate <- glm(formula = purchase ~ rate, data = dat)
	mboth <- glm(formula = purchase ~ channel + rate, data = dat)

	# Predictions from the three models, side by side. They all agree,
	# but R may warn that "prediction from a rank-deficient fit may be misleading"
	data.frame(
	  mchannel = predict(mchannel, newdata = dat),
	  mrate = predict(mrate, newdata = dat),
	  mboth = predict(mboth, newdata = dat)
	)

	# That warning arises because rate is constant within each channel:
	# once channel is in the model, rate carries zero new information,
	# so R does not estimate a coefficient for it
	summary(mboth)