clayford/simulate_proportional_odds_regression.R

## simulate_proportional_odds_regression.R
# Clay Ford
# 2020-06-22

# Simulate data from a proportional odds model with proportional odds assumption
# satisfied.

# 300 observations and a grouping variable (example: democratic/republican)
n <- 300
set.seed(1)
grp <- sample(0:1, size = n, replace = TRUE)

# Generate boundary probabilities of professed ideology.
ideology <- c("Very Liberal", "Slightly Liberal", "Moderate", "Slightly Conservative", "Very Conservative")

# Notice the slope is always -1.
# That is, the proportional odds assumption is satisfied.
# plogis() is the inverse logit function
r1 <- plogis(q = -2.5 - -1*grp)
r2 <- plogis(q = -1.5 - -1*grp)
r3 <- plogis(q = 0.2 - -1*grp)
r4 <- plogis(q = 1.1 - -1*grp)

grp[1:2]
r1[1:2]

# Probability of "Very Liberal" if Republican
head(r1[grp==0],1)
# Probability of "Very Liberal" if Democrat
head(r1[grp==1],1)

# Probability of "Very Liberal" or "Slightly Liberal" if Republican
head(r2[grp==0],1)
# Probability of "Very Liberal" or "Slightly Liberal" if Democrat
head(r2[grp==1],1)

# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Republican
head(r3[grp==0],1)
# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Democrat
head(r3[grp==1],1)

# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Republican
head(r4[grp==0],1)
# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Democrat
head(r4[grp==1],1)

# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" or "Slightly
# Conservative" if Republican
head(r4[grp==0],1)
# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" or "Slightly
# Conservative" if Democrat
head(r4[grp==1],1)


# Place probabilities into a matrix
probs <- matrix(c(r1, r2, r3, r4), ncol = 4)

# Generate probabilities for each ideology:
# Very Liberal = x[1]
# Slightly Liberal = x[2] - x[1]
# Moderate = x[3] - x[2]
# Slightly Conservative = x[4] - x[3]
# Very Conservative = 1 - x[4]
p <- apply(probs, 1, function(x) c(x[1], diff(x), 1 - x[4]))

# Now generate ideologies per the probabilities
y <- apply(p, 2, function(x)sample(ideology, size = 1, replace = TRUE, prob = x))
head(y)
# convert y to factor
y <- factor(y, levels = ideology)
# convert grp to factor
party <- factor(grp, labels = c("Rep", "Dem"))

# cross tabulation of party and ideology
table(party, y)

# Fit proportional odds model
library(MASS)
m <- polr(y ~ grp)
summary(m)

# We come pretty close to recovering "true" values used to simulate data

# Compare to multinomial logit model to asses proportional odds assumption
library(nnet)
mlm <- multinom(y ~ grp)

M1 <- logLik(m)
M2 <- logLik(mlm)
(G <- -2*(M1[1] - M2[1]))
pchisq(G,3,lower.tail = FALSE)

# p > 0.05 provides some evidence that the proportional odds assumption is
# satisfied.


##################################


# Now generate data where proportional odds assumption is NOT satisfied.
# The difference is in the slopes below. Notice they all differ.

set.seed(2)
# grp <- sample(0:1, size = n, replace = TRUE)
r1 <- plogis(q = -2.5 - -1.6*grp)
r2 <- plogis(q = -1.5 - -1.1*grp)
r3 <- plogis(q = 0.2 - -0.6*grp)
r4 <- plogis(q = 1.1 - -0.2*grp)

# generate ideology labels as before and fit models
probs <- matrix(c(r1, r2, r3, r4), ncol = 4)
p <- apply(probs, 1, function(x) c(x[1], diff(x), 1 - x[4]))
y <- apply(p, 2, function(x)sample(ideology, size = 1, replace = TRUE, prob = x))
y <- factor(y, levels = ideology)
party <- factor(grp, labels = c("Rep", "Dem"))


# proportional odds model
m <- polr(y ~ grp)
# multinomial logit model
mlm <- multinom(y ~ grp)

# compare models
M1 <- logLik(m)
M2 <- logLik(mlm)
(G <- -2*(M1[1] - M2[1]))
pchisq(G,3,lower.tail = FALSE)

# p < 0.5 provides some evidence that the proportional odds assumption is
# suspect, though this will not always be the case.
	# Clay Ford
	# 2020-06-22

	# Simulate data from a proportional odds model with proportional odds assumption
	# satisfied.

	# 300 observations and a grouping variable (example: democratic/republican)
	n <- 300
	set.seed(1)
	grp <- sample(0:1, size = n, replace = TRUE)

	# Generate boundary probabilities of professed ideology.
	ideology <- c("Very Liberal", "Slightly Liberal", "Moderate", "Slightly Conservative", "Very Conservative")

	# Notice the slope is always -1.
	# That is, the proportional odds assumption is satisfied.
	# plogis() is the inverse logit function
	r1 <- plogis(q = -2.5 - -1*grp)
	r2 <- plogis(q = -1.5 - -1*grp)
	r3 <- plogis(q = 0.2 - -1*grp)
	r4 <- plogis(q = 1.1 - -1*grp)

	grp[1:2]
	r1[1:2]

	# Probability of "Very Liberal" if Republican
	head(r1[grp==0],1)
	# Probability of "Very Liberal" if Democrat
	head(r1[grp==1],1)

	# Probability of "Very Liberal" or "Slightly Liberal" if Republican
	head(r2[grp==0],1)
	# Probability of "Very Liberal" or "Slightly Liberal" if Democrat
	head(r2[grp==1],1)

	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Republican
	head(r3[grp==0],1)
	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Democrat
	head(r3[grp==1],1)

	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Republican
	head(r4[grp==0],1)
	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" if Democrat
	head(r4[grp==1],1)

	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" or "Slightly
	# Conservative" if Republican
	head(r4[grp==0],1)
	# Probability of "Very Liberal" or "Slightly Liberal" or "Moderate" or "Slightly
	# Conservative" if Democrat
	head(r4[grp==1],1)


	# Place probabilities into a matrix
	probs <- matrix(c(r1, r2, r3, r4), ncol = 4)

	# Generate probabilities for each ideology:
	# Very Liberal = x[1]
	# Slightly Liberal = x[2] - x[1]
	# Moderate = x[3] - x[2]
	# Slightly Conservative = x[4] - x[3]
	# Very Conservative = 1 - x[4]
	p <- apply(probs, 1, function(x) c(x[1], diff(x), 1 - x[4]))

	# Now generate ideologies per the probabilities
	y <- apply(p, 2, function(x)sample(ideology, size = 1, replace = TRUE, prob = x))
	head(y)
	# convert y to factor
	y <- factor(y, levels = ideology)
	# convert grp to factor
	party <- factor(grp, labels = c("Rep", "Dem"))

	# cross tabulation of party and ideology
	table(party, y)

	# Fit proportional odds model
	library(MASS)
	m <- polr(y ~ grp)
	summary(m)

	# We come pretty close to recovering "true" values used to simulate data

	# Compare to multinomial logit model to asses proportional odds assumption
	library(nnet)
	mlm <- multinom(y ~ grp)

	M1 <- logLik(m)
	M2 <- logLik(mlm)
	(G <- -2*(M1[1] - M2[1]))
	pchisq(G,3,lower.tail = FALSE)

	# p > 0.05 provides some evidence that the proportional odds assumption is
	# satisfied.


	##################################


	# Now generate data where proportional odds assumption is NOT satisfied.
	# The difference is in the slopes below. Notice they all differ.

	set.seed(2)
	# grp <- sample(0:1, size = n, replace = TRUE)
	r1 <- plogis(q = -2.5 - -1.6*grp)
	r2 <- plogis(q = -1.5 - -1.1*grp)
	r3 <- plogis(q = 0.2 - -0.6*grp)
	r4 <- plogis(q = 1.1 - -0.2*grp)

	# generate ideology labels as before and fit models
	probs <- matrix(c(r1, r2, r3, r4), ncol = 4)
	p <- apply(probs, 1, function(x) c(x[1], diff(x), 1 - x[4]))
	y <- apply(p, 2, function(x)sample(ideology, size = 1, replace = TRUE, prob = x))
	y <- factor(y, levels = ideology)
	party <- factor(grp, labels = c("Rep", "Dem"))


	# proportional odds model
	m <- polr(y ~ grp)
	# multinomial logit model
	mlm <- multinom(y ~ grp)

	# compare models
	M1 <- logLik(m)
	M2 <- logLik(mlm)
	(G <- -2*(M1[1] - M2[1]))
	pchisq(G,3,lower.tail = FALSE)

	# p < 0.5 provides some evidence that the proportional odds assumption is
	# suspect, though this will not always be the case.