Skip to content

Instantly share code, notes, and snippets.

@acbass49
Created March 14, 2025 00:51
Show Gist options
  • Select an option

  • Save acbass49/ee534d080de6cf36dbb3b69039be4c21 to your computer and use it in GitHub Desktop.

Select an option

Save acbass49/ee534d080de6cf36dbb3b69039be4c21 to your computer and use it in GitHub Desktop.
Predicting Converts
# Title: Predicting Converts
# Description: Predicting converts in the Mormon church
# Date: 2025-03-13
# Author: Alex Bass
# sourcing previous function
get_predictions <- function(model, new_data, names, model_name){
tmp <- predict(model, newdata = new_data,type = 'response', se.fit = TRUE)
tmp |>
dplyr::as_tibble() |>
dplyr::mutate(
lower = fit - 1.96 * `se.fit`,
upper = fit + 1.96 * `se.fit`,
names = names,
model = model_name
) |>
dplyr::select(model, names, fit, lower, upper)
}
# loading packages
library(tidyverse)
library(haven)
# load data
data <- read_sav('./data/rls/rls2024.sav')
# Number of converts in the sample
disaffiliated_24 <- data |>
filter(FRMREL != 20000 & RELTRAD == 20000) |>
nrow()
# there is 151 converts
total_24 <- data |>
filter(RELTRAD == 20000) |>
nrow()
# there is 565 total Mormons in the sample
151/565
prop_24 <- disaffiliated_24 / total_24
ci_24 <- prop.test(disaffiliated_24, total_24)$conf.int
cat("2024: ", prop_24, " (95% CI: ", ci_24[1], "-", ci_24[2], ")\n")
# meaning that 27% of Mormons are converts
# How has this changed over time?
data_07 <- read_sav('./data/rls/rls2007.sav')
data_14 <- read_sav('./data/rls/rls2014.sav')
disaffiliated_07 <- (data_07 |>
filter(q50 != 3 & reltrad == 20000) |>
nrow())
total_07 <- (data_07 |>
filter(reltrad == 20000) |>
nrow()
)
prop_07 <- disaffiliated_07 / total_07
ci_07 <- prop.test(disaffiliated_07, total_07)$conf.int
cat("2007: ", prop_07, " (95% CI: ", ci_07[1], "-", ci_07[2], ")\n")
# In 2007, the percent of converts was 28%
disaffiliated_14 <- (data_14 |>
filter(qj1 != 3 & RELTRAD == 20000) |>
nrow())
total_14 <- (data_14 |>
filter(RELTRAD == 20000) |>
nrow()
)
prop_14 <- disaffiliated_14 / total_14
ci_14 <- prop.test(disaffiliated_14, total_14)$conf.int
cat("2014: ", prop_14, " (95% CI: ", ci_14[1], "-", ci_14[2], ")\n")
# In 2014, the percent of converts was 30%
# So it has stayed pretty much the same in the last 20 years.
# Let's look at activity rate of converts vs. other types of Mormons
data$convert <- dplyr::case_when(
data$FRMREL != 20000 & data$RELTRAD == 20000 ~ "convert",
data$FRMREL == 20000 & data$RELTRAD == 20000 ~ "born_in_church",
TRUE ~ NA
)
data$attend <- ifelse(data$ATTNDPERRLS %in% c(1,2),1,ifelse(data$ATTNDPERRLS %in% c(9),NA,0))
data |>
group_by(convert) |>
count(attend) |>
drop_na() |>
mutate(prop = n/sum(n))
# converts are less likely to attend church weekly 77% vs. 67%
# Now let's estimate a simple model for converts
# the dv will be super imbalanced, so will need to randomly select non-convert respondents
# may need to join multiple datasets for more power
data$marital_rc <- as.numeric(car::recode(data$MARITAL, "4:5=2;6=4;99=NA"))
data$age_rc <- factor(car::recode(
as.numeric(data$BIRTHDECADE),
"6:7='18-35';5='36-45';3:4='46-65';1:2='66+';99=NA", as.numeric = FALSE),
levels = c(
"18-35", "36-45", "46-65", "66+")
)
data$educ_rc <- as.numeric(car::recode(data$EDUCREC, "99=NA"))
data$non_white <- as.numeric(car::recode(data$RACECMB, "1=0;2:5=1;9=NA"))
data$female <- as.numeric(car::recode(data$GENDER, "3=NA;99=NA"))
data$inc_rc <- as.numeric(car::recode(data$INC_SDT1, "1:3=1;4:5=2;6:8=3;99=NA"))
data$fertrec_rc <- as.numeric(car::recode(data$FERTREC, "99=NA"))
data$party_rc <- dplyr::case_when(
data$PARTY %in% c(1) | (data$PARTY %in% c(3) & data$PARTYLN %in% c(1)) ~ 1,
data$PARTY %in% c(2) | (data$PARTY %in% c(3) & data$PARTYLN %in% c(2)) ~ 2,
data$PARTY %in% c(4) ~ 3,
TRUE ~ NA
)
data$race_rc <- dplyr::case_when(
data$RACECMB %in% c(1) & data$HISP %in% c(2) ~ 1,
data$RACECMB %in% c(2) & data$HISP %in% c(2) ~ 2,
data$HISP %in% c(1) ~ 3,
data$RACECMB %in% c(3,4,5) & data$HISP %in% c(2) ~ 4,
TRUE ~ NA
)
data <- data |>
mutate(former_relig = factor(car::recode(as.numeric(FRMREL), "1000='Protestant';10000='Catholic';100000='Unaffiliated';else='Other'", as.factor = TRUE),levels = c("Other", 'Protestant', 'Catholic', 'Unaffiliated')))
reg_data <- data |>
filter(FRMREL != 20000) |>
mutate(converted = ifelse(RELTRAD == 20000,1,0))
to_join <- reg_data |>
filter(RELTRAD != 20000)
reg_data |>
filter(converted == 0) |>
nrow()
#sampling from non-converts to balance the dataset
to_join <- to_join[sample(1:36027, 151),]
reg_data <- reg_data |>
filter(RELTRAD == 20000) |>
bind_rows(to_join)
prop.table(table(reg_data$converted))
# looks good
reg_data$age_rc <- as.numeric(reg_data$age_rc)
model_24 <- glm(
"converted ~ factor(marital_rc) + factor(former_relig) + age_rc + educ_rc + female + non_white + inc_rc + fertrec_rc + factor(party_rc)",
data = reg_data,
family = binomial()
)
summary(model_24)
# I should combine all three datasets for the model
# for more power of course
# significant vars are "divorced", num_kids, GOP
# Not sure how helpful these models because
# I think converts will just conform to the
# church. It would be much more helpful to
# see survey data of before a person joins.
# Let's look at some descriptive stats
# with 2014 data joined in
data_14$educ_rc <- as.numeric(car::recode(data_14$educ, "1:3=1;4:5=2;6=3;7:8=4;9=NA"))
data_14$female <- data_14$SEX
data_14$inc_rc <- as.numeric(car::recode(data_14$income, "1:5=1;6:7=2;8:9=3;99=NA"))
data_14$marital_rc <- as.numeric(car::recode(data_14$marital, "3=2;6=3;4:5=4;2=4;9=NA"))
data_14$age_rc <- factor(car::recode(as.numeric(data_14$agerec),
"1:3='18-35';4:5='36-45';6:9='46-65';10:15='66+';99=NA", as.numeric = FALSE),
levels = c("18-35", "36-45", "46-65", "66+"))
data_14$non_white <- as.numeric(car::recode(data_14$racethn, "1=0;2:4=1;9=NA"))
data_14$race_rc <- as.numeric(car::recode(data_14$racethn, "9=NA"))
data_14$party_rc <- dplyr::case_when(
data_14$party %in% c(1) | (data_14$party %in% c(3) & data_14$partyln %in% c(1)) ~ 1,
data_14$party %in% c(2) | (data_14$party %in% c(3) & data_14$partyln %in% c(2)) ~ 2,
data_14$party %in% c(4,5) ~ 3,
TRUE ~ NA
)
data_14$convert <- dplyr::case_when(
data_14$qj1 != 3 & data_14$RELTRAD == 20000 ~ "convert",
data_14$qj1 == 3 & data_14$RELTRAD == 20000 ~ "born_in_church",
TRUE ~ NA
)
data_14$fertrec_rc <- as.numeric(car::recode(data_14$fertrec, "4:5=4;99=NA"))
data_14$former_relig <- factor(car::recode(as.numeric(data_14$qj1), "1='Protestant';2='Catholic';9:12='Unaffiliated';else='Other'", as.factor = TRUE),levels = c("Other", 'Protestant', 'Catholic', 'Unaffiliated'))
get_tabs <- function(data, demo){
data |>
group_by(convert) |>
count(!!as.symbol(demo)) |>
drop_na() |>
mutate(prop = n/sum(n)) |>
select(-n) |>
pivot_wider(names_from = convert, values_from = prop)
}
names_to_keep <- c(
'marital_rc',
'female',
'age_rc',
'educ_rc',
'non_white',
'inc_rc',
'fertrec_rc',
'party_rc',
'convert',
'former_relig',
'race_rc',
'WEIGHT'
)
data_combined <- data |>
select(all_of(names_to_keep)) |>
bind_rows(data_14) |>
select(all_of(names_to_keep))
#Running another model with the additional data
reg_data_2 <- data_combined |>
filter(convert != "born_in_church" | is.na(convert)) |>
mutate(converted = ifelse(convert == "convert",1,ifelse(is.na(convert),0,0))) |>
mutate(converted = replace_na(converted, 0))
to_join <- reg_data_2 |>
filter(converted == 0)
top_of_range <- reg_data_2 |>
filter(converted == 0) |>
nrow()
num_samples <- sum(data_combined$convert == "convert", na.rm = TRUE)
#sampling from non-converts to balance the dataset
to_join <- to_join[sample(1:top_of_range, num_samples),]
reg_data_2 <- reg_data_2 |>
filter(converted == 1) |>
bind_rows(to_join)
prop.table(table(reg_data_2$converted))
reg_data_2$age_rc <- as.numeric(reg_data_2$age_rc)
model_24_14 <- glm(
"converted ~ factor(marital_rc) + factor(former_relig) + age_rc + educ_rc + female + non_white + fertrec_rc + factor(party_rc) + inc_rc",
data = reg_data_2,
family = binomial()
)
summary(model_24_14)
# Tidy the model output and exponentiate the coefficients
tidy_model <- broom::tidy(model_24_14, conf.int = TRUE)
tidy_model <- tidy_model %>%
mutate(
odds_ratio = exp(estimate),
conf.low = exp(conf.low),
conf.high = exp(conf.high))
# Create the coefficient plot
ggplot(tidy_model, aes(x = odds_ratio, y = reorder(term, odds_ratio))) +
geom_point() +
geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
labs(
title = "Coefficient Plot for Logistic Regression Model (Odds Ratios)",
x = "Odds Ratio",
y = "Predictor") +
theme_minimal() +
geom_vline(xintercept = 1, linetype = "dashed", color = "red")
data_combined |>
filter(convert == "convert") |>
nrow()
# total of 350 converts - not bad
demos <- c(
'marital_rc',
'age_rc',
'educ_rc',
'race_rc',
'female',
'inc_rc',
'fertrec_rc',
'party_rc'
)
dfs <- c()
for (demo in demos){
dfs <- c(dfs, list(get_tabs(data |> filter(!is.na(convert)), demo) |>
mutate(var = demo) |>
relocate(var)))
}
(tmp <- data.table::rbindlist(dfs, use.names = FALSE))
# the converts...
# - skews older than Mormons!
# - skews less educated and lower income
# - skews less white
# - skews less Republican, but still very Republican
# - fewer kids than Mormons, but still lots of kids
get_tabs_total_pop <- function(data, demo){
data |>
count(!!as.symbol(demo), wt=WEIGHT) |>
drop_na() |>
mutate(prop = n/sum(n)) |>
select(-n)
}
demos <- c(
'marital_rc',
'age_rc',
'educ_rc',
'race_rc',
'female',
'inc_rc',
'fertrec_rc',
'party_rc'
)
dfs <- c()
for (demo in demos){
dfs <- c(dfs, list(get_tabs_total_pop(data, demo) |>
mutate(var = demo) |>
relocate(var)))
}
tmp2 <- tmp |>
left_join(data.table::rbindlist(dfs, use.names = FALSE) |>
dplyr::rename(overall_pop = prop),
by = c("var", "marital_rc")
) |>
select(var, marital_rc, overall_pop, born_in_church, convert) |>
mutate(across(3:5, \(x) paste0(round(x, 2)*100, "%")))
tmp2 |>
write.csv("~/Desktop/converts.csv", row.names = FALSE)
# predicted probabilities
len_yrs <- 1
predict_data_2024_14 <- data.frame(
age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs),
educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs),
non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs),
female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs),
marital_rc = rep(1,len_yrs), #modal value
inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs),
fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs),
party_rc = rep(1,len_yrs),
former_relig = rep("Other",len_yrs)
)
# Baseline Rate
lbls <- c("baseline")
get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data")
# by num children
fertrec_rc <- seq(0,4,1)
len_yrs <- length(fertrec_rc)
predict_data_2024_14 <- data.frame(
age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs),
educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs),
non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs),
female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs),
marital_rc = rep(1,len_yrs), #modal value
inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs),
fertrec_rc = fertrec_rc,
party_rc = rep(3,len_yrs),
former_relig = rep("Other",len_yrs)
)
lbls <- c("child:0","child:1","child:2","child:3","child:4+")
get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |>
mutate(baseline_ratio = fit/min(fit))
# by income
inc_rc <- seq(1,3,1)
len_yrs <- length(inc_rc)
predict_data_2024_14 <- data.frame(
age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs),
educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs),
non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs),
female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs),
marital_rc = rep(1,len_yrs), #modal value
inc_rc = inc_rc,
fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs),
party_rc = rep(3,len_yrs),
former_relig = rep("Other",len_yrs)
)
lbls <- c("income:below50","income:50-100","income:100+")
get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |>
mutate(baseline_ratio = fit/min(fit))
# by party
party_rc <- seq(1,3,1)
len_yrs <- length(party_rc)
predict_data_2024_14 <- data.frame(
age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs),
educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs),
non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs),
female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs),
marital_rc = rep(1,len_yrs), #modal value
inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs),
fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs),
party_rc = party_rc,
former_relig = rep("Other",len_yrs)
)
lbls <- c("party:GOP","party:DEM", "party:Other")
get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |>
mutate(baseline_ratio = fit/min(fit))
# by former religion
former_relig <- c("Protestant","Catholic","Unaffiliated","Other")
len_yrs <- length(former_relig)
predict_data_2024_14 <- data.frame(
age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs),
educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs),
non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs),
female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs),
marital_rc = rep(1,len_yrs), #modal value
inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs),
fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs),
party_rc = rep(3,len_yrs),
former_relig = former_relig
)
lbls <- c("Protestant","Catholic","Unaffiliated","Other")
get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |>
mutate(baseline_ratio = fit/min(fit))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment