JonasMoss/SSC_posting_and_politics_beta.R

## SSC_posting_and_politics_beta.R
## Checks for associations between politics and posting at SSC, using Beta-Bayes!
## Data from: http://slatestarcodex.com/Stuff/ssc2018public.xlsx

library("tidyverse")
SSC = read_excel("ssc2018public.xlsx") # Downloaded from SSC.

Politics = as.factor(SSC$PoliticalAffiliation)
Comments = as.factor(SSC$Comment)

## Not everyone will agree with my classification.
## Check out the content of lev if you're worried.
lev = levels(Politics)
levels(Politics) = list("Conservative"      = c(lev[1],lev[4],lev[15],lev[17],lev[18],lev[22]),
                        "Libertarian"       = c(lev[2],lev[3],lev[5],lev[6],lev[7],lev[10],lev[12],lev[13],lev[20],lev[21]),
                        "Social democratic" = c(lev[19]),
                        "Liberal"           = c(lev[8],lev[9],lev[16]),
                        "Socialist"         = c(lev[11],lev[14]))

## I change the comments classification from categorical to numerical.

levy = levels(Comments)
levels(Comments) = list("0" = levy[4],
                        "1" = levy[3],
                        "2" = levy[1],
                        "3" = levy[2],
                        "4" = levy[5])
Comments = as.numeric(Comments)
levy = c(1:5, NA)

## Now I do the final manipulations before plotting.

tib = tibble(Comments = Comments, Politics = Politics)
count(tib, Politics, Comments) -> counts

## ============================================================================
## Conditional probabilities: What about the conditional probability of being
## libertarian or conservative given a posting rate between 3 and 5?
##
## The answer is that the conditional probability is 0.54, compared to an
## unconditional probability of 0.33. This isn't bad! An increase form 1/3 to
## 1/2!
## ============================================================================

counts %>%
  filter(Comments %in% c(3,4,5)) %$%
  n %>%
  sum ->
  denominator

counts %>%
  filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(3,4,5)) %$%
  n %>%
  sum ->
  numerator

numerator/denominator # 0.54

counts %>%
  filter(Politics %in% c("Conservative", "Libertarian")) %$%
  n %>%
  sum ->
  conslib

conslib/sum(counts$n) # 0.35

## ============================================================================
## Next natural problem: What is the risk that A is libertarian/conservative
## given that he posts a lot (3 - 5)?
## ============================================================================

counts %>%
  filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(1,2)) %$%
  n %>%
  sum ->
  numerator2


counts %>%
  filter(Comments %in% c(1,2)) %$%
  n %>%
  sum ->
  denominator2

p1 = numerator/denominator  # 0.54
p2 = numerator2/denominator2 # 0.34
risk_ratio = p1/p2  # 1.57


## ============================================================================
## Beta analysis
## ============================================================================

## This is the data. m0, m1 are the number of trials, y0, y1 are the number of
## successes (or deaths).
y1 = numerator
y0 = numerator2
m1 = denominator
m0 = denominator2

## The prior parameter for the betas. Equal parameters are neutral.
alpha0 = 1/2
beta0  = 1/2
alpha1 = 1/2
beta1  = 1/2

## Posterior parameters.
alpha0_post = alpha0 + y0
beta0_post = beta0 + m0 - y0
alpha1_post = alpha1 + y1
beta1_post = beta1 + m1 - y1

## Plot posteriors and priors for both ps.
x = seq(0, 1, by = 0.0001)
plot(x, dbeta(x, alpha0_post, beta0_post), lty = 1, col = "blue", type = "l", ylab = "Density",
     bty = "l", main = "Priors and posteriors.")
lines(x, dbeta(x, alpha0, beta0), lty = 2, col = "blue")
lines(x, dbeta(x, alpha1_post, beta1_post), lty = 1, col = "red")
lines(x, dbeta(x, alpha1, beta1), lty = 2, col = "red")
legend(c("topright"), c("Infrequent posters", "Frequent posters"),
       col = c("red", "blue"), bty = "n", lty = c(1,1))

## Density function for the risk ratio.

dens = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
  integrate(function(p0) {
    p0*dbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0)
  }, lower = 0, upper = min(1, 1/z))$value
})

## Distribution function for the risk ratio.

dist = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
  integrate(function(p0) {
    pbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0)
  }, lower = 0, upper = 1)$value
})

## Find the modes.

## Plotting.
## Posterior in black, prior in purple.

x = seq(1, 2.1, by = 0.001)
plot(x, sapply(x, dens, alpha1 = alpha1_post,
                        beta1  = beta1_post,
                        alpha0 = alpha0_post,
                        beta0  = beta0_post), type = "l",
     main = "Risk ratio posterior density", bty = "l", ylab = "Density", xlab = "Risk ratio",
     sub = bquote(alpha[0] ~ "=" ~ .(alpha0) ~ ";" ~ beta[0] ~ "=" ~ .(beta0) ~
                    alpha[1] ~ "=" ~ .(alpha1) ~ ";" ~ beta[1] ~ "=" ~ .(beta1)))

mode0 = nlm(function(x) -dens(x, alpha1 = alpha1_post,
                                 beta1  = beta1_post,
                                 alpha0 = alpha0_post,
                                 beta0  = beta0_post), p = 1.5)$estimate

abline(v = mode0, lty = 2)
legend(c("topright"), legend = c("Density", paste0("Mode = ",(signif(mode0, 3)))), lty = c(1,2), bty = "n")

## ============================================================================
## Using the distribution function to find 95% credibility intervals.
## ============================================================================


lower = optimize(function(x) (dist(x, alpha1 = alpha1_post,
                              beta1  = beta1_post,
                              alpha0 = alpha0_post,
                              beta0  = beta0_post) - 0.025)^2, lower = -3, upper = 2)$minimum


upper = optimize(function(x) (dist(x, alpha1 = alpha1_post,
                                   beta1  = beta1_post,
                                   alpha0 = alpha0_post,
                                   beta0  = beta0_post) - 0.975)^2, lower = -3, upper = 2)$minimum

signif(c(lower, upper), 3)
	## Checks for associations between politics and posting at SSC, using Beta-Bayes!
	## Data from: http://slatestarcodex.com/Stuff/ssc2018public.xlsx

	library("tidyverse")
	SSC = read_excel("ssc2018public.xlsx") # Downloaded from SSC.

	Politics = as.factor(SSC$PoliticalAffiliation)
	Comments = as.factor(SSC$Comment)

	## Not everyone will agree with my classification.
	## Check out the content of lev if you're worried.
	lev = levels(Politics)
	levels(Politics) = list("Conservative" = c(lev[1],lev[4],lev[15],lev[17],lev[18],lev[22]),
	"Libertarian" = c(lev[2],lev[3],lev[5],lev[6],lev[7],lev[10],lev[12],lev[13],lev[20],lev[21]),
	"Social democratic" = c(lev[19]),
	"Liberal" = c(lev[8],lev[9],lev[16]),
	"Socialist" = c(lev[11],lev[14]))

	## I change the comments classification from categorical to numerical.

	levy = levels(Comments)
	levels(Comments) = list("0" = levy[4],
	"1" = levy[3],
	"2" = levy[1],
	"3" = levy[2],
	"4" = levy[5])
	Comments = as.numeric(Comments)
	levy = c(1:5, NA)

	## Now I do the final manipulations before plotting.

	tib = tibble(Comments = Comments, Politics = Politics)
	count(tib, Politics, Comments) -> counts

	## ============================================================================
	## Conditional probabilities: What about the conditional probability of being
	## libertarian or conservative given a posting rate between 3 and 5?
	##
	## The answer is that the conditional probability is 0.54, compared to an
	## unconditional probability of 0.33. This isn't bad! An increase form 1/3 to
	## 1/2!
	## ============================================================================

	counts %>%
	filter(Comments %in% c(3,4,5)) %$%
	n %>%
	sum ->
	denominator

	counts %>%
	filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(3,4,5)) %$%
	n %>%
	sum ->
	numerator

	numerator/denominator # 0.54

	counts %>%
	filter(Politics %in% c("Conservative", "Libertarian")) %$%
	n %>%
	sum ->
	conslib

	conslib/sum(counts$n) # 0.35

	## ============================================================================
	## Next natural problem: What is the risk that A is libertarian/conservative
	## given that he posts a lot (3 - 5)?
	## ============================================================================

	counts %>%
	filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(1,2)) %$%
	n %>%
	sum ->
	numerator2


	counts %>%
	filter(Comments %in% c(1,2)) %$%
	n %>%
	sum ->
	denominator2

	p1 = numerator/denominator # 0.54
	p2 = numerator2/denominator2 # 0.34
	risk_ratio = p1/p2 # 1.57


	## ============================================================================
	## Beta analysis
	## ============================================================================

	## This is the data. m0, m1 are the number of trials, y0, y1 are the number of
	## successes (or deaths).
	y1 = numerator
	y0 = numerator2
	m1 = denominator
	m0 = denominator2

	## The prior parameter for the betas. Equal parameters are neutral.
	alpha0 = 1/2
	beta0 = 1/2
	alpha1 = 1/2
	beta1 = 1/2

	## Posterior parameters.
	alpha0_post = alpha0 + y0
	beta0_post = beta0 + m0 - y0
	alpha1_post = alpha1 + y1
	beta1_post = beta1 + m1 - y1

	## Plot posteriors and priors for both ps.
	x = seq(0, 1, by = 0.0001)
	plot(x, dbeta(x, alpha0_post, beta0_post), lty = 1, col = "blue", type = "l", ylab = "Density",
	bty = "l", main = "Priors and posteriors.")
	lines(x, dbeta(x, alpha0, beta0), lty = 2, col = "blue")
	lines(x, dbeta(x, alpha1_post, beta1_post), lty = 1, col = "red")
	lines(x, dbeta(x, alpha1, beta1), lty = 2, col = "red")
	legend(c("topright"), c("Infrequent posters", "Frequent posters"),
	col = c("red", "blue"), bty = "n", lty = c(1,1))

	## Density function for the risk ratio.

	dens = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
	integrate(function(p0) {
	p0dbeta(zp0, alpha1, beta1)*dbeta(p0, alpha0, beta0)
	}, lower = 0, upper = min(1, 1/z))$value
	})

	## Distribution function for the risk ratio.

	dist = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
	integrate(function(p0) {
	pbeta(zp0, alpha1, beta1)dbeta(p0, alpha0, beta0)
	}, lower = 0, upper = 1)$value
	})

	## Find the modes.

	## Plotting.
	## Posterior in black, prior in purple.

	x = seq(1, 2.1, by = 0.001)
	plot(x, sapply(x, dens, alpha1 = alpha1_post,
	beta1 = beta1_post,
	alpha0 = alpha0_post,
	beta0 = beta0_post), type = "l",
	main = "Risk ratio posterior density", bty = "l", ylab = "Density", xlab = "Risk ratio",
	sub = bquote(alpha[0] ~ "=" ~ .(alpha0) ~ ";" ~ beta[0] ~ "=" ~ .(beta0) ~
	alpha[1] ~ "=" ~ .(alpha1) ~ ";" ~ beta[1] ~ "=" ~ .(beta1)))

	mode0 = nlm(function(x) -dens(x, alpha1 = alpha1_post,
	beta1 = beta1_post,
	alpha0 = alpha0_post,
	beta0 = beta0_post), p = 1.5)$estimate

	abline(v = mode0, lty = 2)
	legend(c("topright"), legend = c("Density", paste0("Mode = ",(signif(mode0, 3)))), lty = c(1,2), bty = "n")

	## ============================================================================
	## Using the distribution function to find 95% credibility intervals.
	## ============================================================================


	lower = optimize(function(x) (dist(x, alpha1 = alpha1_post,
	beta1 = beta1_post,
	alpha0 = alpha0_post,
	beta0 = beta0_post) - 0.025)^2, lower = -3, upper = 2)$minimum



	upper = optimize(function(x) (dist(x, alpha1 = alpha1_post,
	beta1 = beta1_post,
	alpha0 = alpha0_post,
	beta0 = beta0_post) - 0.975)^2, lower = -3, upper = 2)$minimum

	signif(c(lower, upper), 3)