Skip to content

Instantly share code, notes, and snippets.

@JonasMoss
Created January 4, 2018 16:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JonasMoss/1926e7db8e0c47c53914ca7874144b66 to your computer and use it in GitHub Desktop.
Save JonasMoss/1926e7db8e0c47c53914ca7874144b66 to your computer and use it in GitHub Desktop.
## Checks for associations between politics and posting at SSC, using Beta-Bayes!
## Data from: http://slatestarcodex.com/Stuff/ssc2018public.xlsx
library("tidyverse")
SSC = read_excel("ssc2018public.xlsx") # Downloaded from SSC.
Politics = as.factor(SSC$PoliticalAffiliation)
Comments = as.factor(SSC$Comment)
## Not everyone will agree with my classification.
## Check out the content of lev if you're worried.
lev = levels(Politics)
levels(Politics) = list("Conservative" = c(lev[1],lev[4],lev[15],lev[17],lev[18],lev[22]),
"Libertarian" = c(lev[2],lev[3],lev[5],lev[6],lev[7],lev[10],lev[12],lev[13],lev[20],lev[21]),
"Social democratic" = c(lev[19]),
"Liberal" = c(lev[8],lev[9],lev[16]),
"Socialist" = c(lev[11],lev[14]))
## I change the comments classification from categorical to numerical.
levy = levels(Comments)
levels(Comments) = list("0" = levy[4],
"1" = levy[3],
"2" = levy[1],
"3" = levy[2],
"4" = levy[5])
Comments = as.numeric(Comments)
levy = c(1:5, NA)
## Now I do the final manipulations before plotting.
tib = tibble(Comments = Comments, Politics = Politics)
count(tib, Politics, Comments) -> counts
## ============================================================================
## Conditional probabilities: What about the conditional probability of being
## libertarian or conservative given a posting rate between 3 and 5?
##
## The answer is that the conditional probability is 0.54, compared to an
## unconditional probability of 0.33. This isn't bad! An increase form 1/3 to
## 1/2!
## ============================================================================
counts %>%
filter(Comments %in% c(3,4,5)) %$%
n %>%
sum ->
denominator
counts %>%
filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(3,4,5)) %$%
n %>%
sum ->
numerator
numerator/denominator # 0.54
counts %>%
filter(Politics %in% c("Conservative", "Libertarian")) %$%
n %>%
sum ->
conslib
conslib/sum(counts$n) # 0.35
## ============================================================================
## Next natural problem: What is the risk that A is libertarian/conservative
## given that he posts a lot (3 - 5)?
## ============================================================================
counts %>%
filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(1,2)) %$%
n %>%
sum ->
numerator2
counts %>%
filter(Comments %in% c(1,2)) %$%
n %>%
sum ->
denominator2
p1 = numerator/denominator # 0.54
p2 = numerator2/denominator2 # 0.34
risk_ratio = p1/p2 # 1.57
## ============================================================================
## Beta analysis
## ============================================================================
## This is the data. m0, m1 are the number of trials, y0, y1 are the number of
## successes (or deaths).
y1 = numerator
y0 = numerator2
m1 = denominator
m0 = denominator2
## The prior parameter for the betas. Equal parameters are neutral.
alpha0 = 1/2
beta0 = 1/2
alpha1 = 1/2
beta1 = 1/2
## Posterior parameters.
alpha0_post = alpha0 + y0
beta0_post = beta0 + m0 - y0
alpha1_post = alpha1 + y1
beta1_post = beta1 + m1 - y1
## Plot posteriors and priors for both ps.
x = seq(0, 1, by = 0.0001)
plot(x, dbeta(x, alpha0_post, beta0_post), lty = 1, col = "blue", type = "l", ylab = "Density",
bty = "l", main = "Priors and posteriors.")
lines(x, dbeta(x, alpha0, beta0), lty = 2, col = "blue")
lines(x, dbeta(x, alpha1_post, beta1_post), lty = 1, col = "red")
lines(x, dbeta(x, alpha1, beta1), lty = 2, col = "red")
legend(c("topright"), c("Infrequent posters", "Frequent posters"),
col = c("red", "blue"), bty = "n", lty = c(1,1))
## Density function for the risk ratio.
dens = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
integrate(function(p0) {
p0*dbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0)
}, lower = 0, upper = min(1, 1/z))$value
})
## Distribution function for the risk ratio.
dist = Vectorize(function(z, alpha1, beta1, alpha0, beta0) {
integrate(function(p0) {
pbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0)
}, lower = 0, upper = 1)$value
})
## Find the modes.
## Plotting.
## Posterior in black, prior in purple.
x = seq(1, 2.1, by = 0.001)
plot(x, sapply(x, dens, alpha1 = alpha1_post,
beta1 = beta1_post,
alpha0 = alpha0_post,
beta0 = beta0_post), type = "l",
main = "Risk ratio posterior density", bty = "l", ylab = "Density", xlab = "Risk ratio",
sub = bquote(alpha[0] ~ "=" ~ .(alpha0) ~ ";" ~ beta[0] ~ "=" ~ .(beta0) ~
alpha[1] ~ "=" ~ .(alpha1) ~ ";" ~ beta[1] ~ "=" ~ .(beta1)))
mode0 = nlm(function(x) -dens(x, alpha1 = alpha1_post,
beta1 = beta1_post,
alpha0 = alpha0_post,
beta0 = beta0_post), p = 1.5)$estimate
abline(v = mode0, lty = 2)
legend(c("topright"), legend = c("Density", paste0("Mode = ",(signif(mode0, 3)))), lty = c(1,2), bty = "n")
## ============================================================================
## Using the distribution function to find 95% credibility intervals.
## ============================================================================
lower = optimize(function(x) (dist(x, alpha1 = alpha1_post,
beta1 = beta1_post,
alpha0 = alpha0_post,
beta0 = beta0_post) - 0.025)^2, lower = -3, upper = 2)$minimum
upper = optimize(function(x) (dist(x, alpha1 = alpha1_post,
beta1 = beta1_post,
alpha0 = alpha0_post,
beta0 = beta0_post) - 0.975)^2, lower = -3, upper = 2)$minimum
signif(c(lower, upper), 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment