Created
January 4, 2018 16:02
-
-
Save JonasMoss/1926e7db8e0c47c53914ca7874144b66 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Checks for associations between politics and posting at SSC, using Beta-Bayes! | |
## Data from: http://slatestarcodex.com/Stuff/ssc2018public.xlsx | |
library("tidyverse") | |
SSC = read_excel("ssc2018public.xlsx") # Downloaded from SSC. | |
Politics = as.factor(SSC$PoliticalAffiliation) | |
Comments = as.factor(SSC$Comment) | |
## Not everyone will agree with my classification. | |
## Check out the content of lev if you're worried. | |
lev = levels(Politics) | |
levels(Politics) = list("Conservative" = c(lev[1],lev[4],lev[15],lev[17],lev[18],lev[22]), | |
"Libertarian" = c(lev[2],lev[3],lev[5],lev[6],lev[7],lev[10],lev[12],lev[13],lev[20],lev[21]), | |
"Social democratic" = c(lev[19]), | |
"Liberal" = c(lev[8],lev[9],lev[16]), | |
"Socialist" = c(lev[11],lev[14])) | |
## I change the comments classification from categorical to numerical. | |
levy = levels(Comments) | |
levels(Comments) = list("0" = levy[4], | |
"1" = levy[3], | |
"2" = levy[1], | |
"3" = levy[2], | |
"4" = levy[5]) | |
Comments = as.numeric(Comments) | |
levy = c(1:5, NA) | |
## Now I do the final manipulations before plotting. | |
tib = tibble(Comments = Comments, Politics = Politics) | |
count(tib, Politics, Comments) -> counts | |
## ============================================================================ | |
## Conditional probabilities: What about the conditional probability of being | |
## libertarian or conservative given a posting rate between 3 and 5? | |
## | |
## The answer is that the conditional probability is 0.54, compared to an | |
## unconditional probability of 0.33. This isn't bad! An increase form 1/3 to | |
## 1/2! | |
## ============================================================================ | |
counts %>% | |
filter(Comments %in% c(3,4,5)) %$% | |
n %>% | |
sum -> | |
denominator | |
counts %>% | |
filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(3,4,5)) %$% | |
n %>% | |
sum -> | |
numerator | |
numerator/denominator # 0.54 | |
counts %>% | |
filter(Politics %in% c("Conservative", "Libertarian")) %$% | |
n %>% | |
sum -> | |
conslib | |
conslib/sum(counts$n) # 0.35 | |
## ============================================================================ | |
## Next natural problem: What is the risk that A is libertarian/conservative | |
## given that he posts a lot (3 - 5)? | |
## ============================================================================ | |
counts %>% | |
filter(Politics %in% c("Conservative", "Libertarian"), Comments %in% c(1,2)) %$% | |
n %>% | |
sum -> | |
numerator2 | |
counts %>% | |
filter(Comments %in% c(1,2)) %$% | |
n %>% | |
sum -> | |
denominator2 | |
p1 = numerator/denominator # 0.54 | |
p2 = numerator2/denominator2 # 0.34 | |
risk_ratio = p1/p2 # 1.57 | |
## ============================================================================ | |
## Beta analysis | |
## ============================================================================ | |
## This is the data. m0, m1 are the number of trials, y0, y1 are the number of | |
## successes (or deaths). | |
y1 = numerator | |
y0 = numerator2 | |
m1 = denominator | |
m0 = denominator2 | |
## The prior parameter for the betas. Equal parameters are neutral. | |
alpha0 = 1/2 | |
beta0 = 1/2 | |
alpha1 = 1/2 | |
beta1 = 1/2 | |
## Posterior parameters. | |
alpha0_post = alpha0 + y0 | |
beta0_post = beta0 + m0 - y0 | |
alpha1_post = alpha1 + y1 | |
beta1_post = beta1 + m1 - y1 | |
## Plot posteriors and priors for both ps. | |
x = seq(0, 1, by = 0.0001) | |
plot(x, dbeta(x, alpha0_post, beta0_post), lty = 1, col = "blue", type = "l", ylab = "Density", | |
bty = "l", main = "Priors and posteriors.") | |
lines(x, dbeta(x, alpha0, beta0), lty = 2, col = "blue") | |
lines(x, dbeta(x, alpha1_post, beta1_post), lty = 1, col = "red") | |
lines(x, dbeta(x, alpha1, beta1), lty = 2, col = "red") | |
legend(c("topright"), c("Infrequent posters", "Frequent posters"), | |
col = c("red", "blue"), bty = "n", lty = c(1,1)) | |
## Density function for the risk ratio. | |
dens = Vectorize(function(z, alpha1, beta1, alpha0, beta0) { | |
integrate(function(p0) { | |
p0*dbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0) | |
}, lower = 0, upper = min(1, 1/z))$value | |
}) | |
## Distribution function for the risk ratio. | |
dist = Vectorize(function(z, alpha1, beta1, alpha0, beta0) { | |
integrate(function(p0) { | |
pbeta(z*p0, alpha1, beta1)*dbeta(p0, alpha0, beta0) | |
}, lower = 0, upper = 1)$value | |
}) | |
## Find the modes. | |
## Plotting. | |
## Posterior in black, prior in purple. | |
x = seq(1, 2.1, by = 0.001) | |
plot(x, sapply(x, dens, alpha1 = alpha1_post, | |
beta1 = beta1_post, | |
alpha0 = alpha0_post, | |
beta0 = beta0_post), type = "l", | |
main = "Risk ratio posterior density", bty = "l", ylab = "Density", xlab = "Risk ratio", | |
sub = bquote(alpha[0] ~ "=" ~ .(alpha0) ~ ";" ~ beta[0] ~ "=" ~ .(beta0) ~ | |
alpha[1] ~ "=" ~ .(alpha1) ~ ";" ~ beta[1] ~ "=" ~ .(beta1))) | |
mode0 = nlm(function(x) -dens(x, alpha1 = alpha1_post, | |
beta1 = beta1_post, | |
alpha0 = alpha0_post, | |
beta0 = beta0_post), p = 1.5)$estimate | |
abline(v = mode0, lty = 2) | |
legend(c("topright"), legend = c("Density", paste0("Mode = ",(signif(mode0, 3)))), lty = c(1,2), bty = "n") | |
## ============================================================================ | |
## Using the distribution function to find 95% credibility intervals. | |
## ============================================================================ | |
lower = optimize(function(x) (dist(x, alpha1 = alpha1_post, | |
beta1 = beta1_post, | |
alpha0 = alpha0_post, | |
beta0 = beta0_post) - 0.025)^2, lower = -3, upper = 2)$minimum | |
upper = optimize(function(x) (dist(x, alpha1 = alpha1_post, | |
beta1 = beta1_post, | |
alpha0 = alpha0_post, | |
beta0 = beta0_post) - 0.975)^2, lower = -3, upper = 2)$minimum | |
signif(c(lower, upper), 3) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment