Created
March 14, 2025 00:51
-
-
Save acbass49/ee534d080de6cf36dbb3b69039be4c21 to your computer and use it in GitHub Desktop.
Predicting Converts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Title: Predicting Converts | |
| # Description: Predicting converts in the Mormon church | |
| # Date: 2025-03-13 | |
| # Author: Alex Bass | |
| # sourcing previous function | |
| get_predictions <- function(model, new_data, names, model_name){ | |
| tmp <- predict(model, newdata = new_data,type = 'response', se.fit = TRUE) | |
| tmp |> | |
| dplyr::as_tibble() |> | |
| dplyr::mutate( | |
| lower = fit - 1.96 * `se.fit`, | |
| upper = fit + 1.96 * `se.fit`, | |
| names = names, | |
| model = model_name | |
| ) |> | |
| dplyr::select(model, names, fit, lower, upper) | |
| } | |
| # loading packages | |
| library(tidyverse) | |
| library(haven) | |
| # load data | |
| data <- read_sav('./data/rls/rls2024.sav') | |
| # Number of converts in the sample | |
| disaffiliated_24 <- data |> | |
| filter(FRMREL != 20000 & RELTRAD == 20000) |> | |
| nrow() | |
| # there is 151 converts | |
| total_24 <- data |> | |
| filter(RELTRAD == 20000) |> | |
| nrow() | |
| # there is 565 total Mormons in the sample | |
| 151/565 | |
| prop_24 <- disaffiliated_24 / total_24 | |
| ci_24 <- prop.test(disaffiliated_24, total_24)$conf.int | |
| cat("2024: ", prop_24, " (95% CI: ", ci_24[1], "-", ci_24[2], ")\n") | |
| # meaning that 27% of Mormons are converts | |
| # How has this changed over time? | |
| data_07 <- read_sav('./data/rls/rls2007.sav') | |
| data_14 <- read_sav('./data/rls/rls2014.sav') | |
| disaffiliated_07 <- (data_07 |> | |
| filter(q50 != 3 & reltrad == 20000) |> | |
| nrow()) | |
| total_07 <- (data_07 |> | |
| filter(reltrad == 20000) |> | |
| nrow() | |
| ) | |
| prop_07 <- disaffiliated_07 / total_07 | |
| ci_07 <- prop.test(disaffiliated_07, total_07)$conf.int | |
| cat("2007: ", prop_07, " (95% CI: ", ci_07[1], "-", ci_07[2], ")\n") | |
| # In 2007, the percent of converts was 28% | |
| disaffiliated_14 <- (data_14 |> | |
| filter(qj1 != 3 & RELTRAD == 20000) |> | |
| nrow()) | |
| total_14 <- (data_14 |> | |
| filter(RELTRAD == 20000) |> | |
| nrow() | |
| ) | |
| prop_14 <- disaffiliated_14 / total_14 | |
| ci_14 <- prop.test(disaffiliated_14, total_14)$conf.int | |
| cat("2014: ", prop_14, " (95% CI: ", ci_14[1], "-", ci_14[2], ")\n") | |
| # In 2014, the percent of converts was 30% | |
| # So it has stayed pretty much the same in the last 20 years. | |
| # Let's look at activity rate of converts vs. other types of Mormons | |
| data$convert <- dplyr::case_when( | |
| data$FRMREL != 20000 & data$RELTRAD == 20000 ~ "convert", | |
| data$FRMREL == 20000 & data$RELTRAD == 20000 ~ "born_in_church", | |
| TRUE ~ NA | |
| ) | |
| data$attend <- ifelse(data$ATTNDPERRLS %in% c(1,2),1,ifelse(data$ATTNDPERRLS %in% c(9),NA,0)) | |
| data |> | |
| group_by(convert) |> | |
| count(attend) |> | |
| drop_na() |> | |
| mutate(prop = n/sum(n)) | |
| # converts are less likely to attend church weekly 77% vs. 67% | |
| # Now let's estimate a simple model for converts | |
| # the dv will be super imbalanced, so will need to randomly select non-convert respondents | |
| # may need to join multiple datasets for more power | |
| data$marital_rc <- as.numeric(car::recode(data$MARITAL, "4:5=2;6=4;99=NA")) | |
| data$age_rc <- factor(car::recode( | |
| as.numeric(data$BIRTHDECADE), | |
| "6:7='18-35';5='36-45';3:4='46-65';1:2='66+';99=NA", as.numeric = FALSE), | |
| levels = c( | |
| "18-35", "36-45", "46-65", "66+") | |
| ) | |
| data$educ_rc <- as.numeric(car::recode(data$EDUCREC, "99=NA")) | |
| data$non_white <- as.numeric(car::recode(data$RACECMB, "1=0;2:5=1;9=NA")) | |
| data$female <- as.numeric(car::recode(data$GENDER, "3=NA;99=NA")) | |
| data$inc_rc <- as.numeric(car::recode(data$INC_SDT1, "1:3=1;4:5=2;6:8=3;99=NA")) | |
| data$fertrec_rc <- as.numeric(car::recode(data$FERTREC, "99=NA")) | |
| data$party_rc <- dplyr::case_when( | |
| data$PARTY %in% c(1) | (data$PARTY %in% c(3) & data$PARTYLN %in% c(1)) ~ 1, | |
| data$PARTY %in% c(2) | (data$PARTY %in% c(3) & data$PARTYLN %in% c(2)) ~ 2, | |
| data$PARTY %in% c(4) ~ 3, | |
| TRUE ~ NA | |
| ) | |
| data$race_rc <- dplyr::case_when( | |
| data$RACECMB %in% c(1) & data$HISP %in% c(2) ~ 1, | |
| data$RACECMB %in% c(2) & data$HISP %in% c(2) ~ 2, | |
| data$HISP %in% c(1) ~ 3, | |
| data$RACECMB %in% c(3,4,5) & data$HISP %in% c(2) ~ 4, | |
| TRUE ~ NA | |
| ) | |
| data <- data |> | |
| mutate(former_relig = factor(car::recode(as.numeric(FRMREL), "1000='Protestant';10000='Catholic';100000='Unaffiliated';else='Other'", as.factor = TRUE),levels = c("Other", 'Protestant', 'Catholic', 'Unaffiliated'))) | |
| reg_data <- data |> | |
| filter(FRMREL != 20000) |> | |
| mutate(converted = ifelse(RELTRAD == 20000,1,0)) | |
| to_join <- reg_data |> | |
| filter(RELTRAD != 20000) | |
| reg_data |> | |
| filter(converted == 0) |> | |
| nrow() | |
| #sampling from non-converts to balance the dataset | |
| to_join <- to_join[sample(1:36027, 151),] | |
| reg_data <- reg_data |> | |
| filter(RELTRAD == 20000) |> | |
| bind_rows(to_join) | |
| prop.table(table(reg_data$converted)) | |
| # looks good | |
| reg_data$age_rc <- as.numeric(reg_data$age_rc) | |
| model_24 <- glm( | |
| "converted ~ factor(marital_rc) + factor(former_relig) + age_rc + educ_rc + female + non_white + inc_rc + fertrec_rc + factor(party_rc)", | |
| data = reg_data, | |
| family = binomial() | |
| ) | |
| summary(model_24) | |
| # I should combine all three datasets for the model | |
| # for more power of course | |
| # significant vars are "divorced", num_kids, GOP | |
| # Not sure how helpful these models because | |
| # I think converts will just conform to the | |
| # church. It would be much more helpful to | |
| # see survey data of before a person joins. | |
| # Let's look at some descriptive stats | |
| # with 2014 data joined in | |
| data_14$educ_rc <- as.numeric(car::recode(data_14$educ, "1:3=1;4:5=2;6=3;7:8=4;9=NA")) | |
| data_14$female <- data_14$SEX | |
| data_14$inc_rc <- as.numeric(car::recode(data_14$income, "1:5=1;6:7=2;8:9=3;99=NA")) | |
| data_14$marital_rc <- as.numeric(car::recode(data_14$marital, "3=2;6=3;4:5=4;2=4;9=NA")) | |
| data_14$age_rc <- factor(car::recode(as.numeric(data_14$agerec), | |
| "1:3='18-35';4:5='36-45';6:9='46-65';10:15='66+';99=NA", as.numeric = FALSE), | |
| levels = c("18-35", "36-45", "46-65", "66+")) | |
| data_14$non_white <- as.numeric(car::recode(data_14$racethn, "1=0;2:4=1;9=NA")) | |
| data_14$race_rc <- as.numeric(car::recode(data_14$racethn, "9=NA")) | |
| data_14$party_rc <- dplyr::case_when( | |
| data_14$party %in% c(1) | (data_14$party %in% c(3) & data_14$partyln %in% c(1)) ~ 1, | |
| data_14$party %in% c(2) | (data_14$party %in% c(3) & data_14$partyln %in% c(2)) ~ 2, | |
| data_14$party %in% c(4,5) ~ 3, | |
| TRUE ~ NA | |
| ) | |
| data_14$convert <- dplyr::case_when( | |
| data_14$qj1 != 3 & data_14$RELTRAD == 20000 ~ "convert", | |
| data_14$qj1 == 3 & data_14$RELTRAD == 20000 ~ "born_in_church", | |
| TRUE ~ NA | |
| ) | |
| data_14$fertrec_rc <- as.numeric(car::recode(data_14$fertrec, "4:5=4;99=NA")) | |
| data_14$former_relig <- factor(car::recode(as.numeric(data_14$qj1), "1='Protestant';2='Catholic';9:12='Unaffiliated';else='Other'", as.factor = TRUE),levels = c("Other", 'Protestant', 'Catholic', 'Unaffiliated')) | |
| get_tabs <- function(data, demo){ | |
| data |> | |
| group_by(convert) |> | |
| count(!!as.symbol(demo)) |> | |
| drop_na() |> | |
| mutate(prop = n/sum(n)) |> | |
| select(-n) |> | |
| pivot_wider(names_from = convert, values_from = prop) | |
| } | |
| names_to_keep <- c( | |
| 'marital_rc', | |
| 'female', | |
| 'age_rc', | |
| 'educ_rc', | |
| 'non_white', | |
| 'inc_rc', | |
| 'fertrec_rc', | |
| 'party_rc', | |
| 'convert', | |
| 'former_relig', | |
| 'race_rc', | |
| 'WEIGHT' | |
| ) | |
| data_combined <- data |> | |
| select(all_of(names_to_keep)) |> | |
| bind_rows(data_14) |> | |
| select(all_of(names_to_keep)) | |
| #Running another model with the additional data | |
| reg_data_2 <- data_combined |> | |
| filter(convert != "born_in_church" | is.na(convert)) |> | |
| mutate(converted = ifelse(convert == "convert",1,ifelse(is.na(convert),0,0))) |> | |
| mutate(converted = replace_na(converted, 0)) | |
| to_join <- reg_data_2 |> | |
| filter(converted == 0) | |
| top_of_range <- reg_data_2 |> | |
| filter(converted == 0) |> | |
| nrow() | |
| num_samples <- sum(data_combined$convert == "convert", na.rm = TRUE) | |
| #sampling from non-converts to balance the dataset | |
| to_join <- to_join[sample(1:top_of_range, num_samples),] | |
| reg_data_2 <- reg_data_2 |> | |
| filter(converted == 1) |> | |
| bind_rows(to_join) | |
| prop.table(table(reg_data_2$converted)) | |
| reg_data_2$age_rc <- as.numeric(reg_data_2$age_rc) | |
| model_24_14 <- glm( | |
| "converted ~ factor(marital_rc) + factor(former_relig) + age_rc + educ_rc + female + non_white + fertrec_rc + factor(party_rc) + inc_rc", | |
| data = reg_data_2, | |
| family = binomial() | |
| ) | |
| summary(model_24_14) | |
| # Tidy the model output and exponentiate the coefficients | |
| tidy_model <- broom::tidy(model_24_14, conf.int = TRUE) | |
| tidy_model <- tidy_model %>% | |
| mutate( | |
| odds_ratio = exp(estimate), | |
| conf.low = exp(conf.low), | |
| conf.high = exp(conf.high)) | |
| # Create the coefficient plot | |
| ggplot(tidy_model, aes(x = odds_ratio, y = reorder(term, odds_ratio))) + | |
| geom_point() + | |
| geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) + | |
| labs( | |
| title = "Coefficient Plot for Logistic Regression Model (Odds Ratios)", | |
| x = "Odds Ratio", | |
| y = "Predictor") + | |
| theme_minimal() + | |
| geom_vline(xintercept = 1, linetype = "dashed", color = "red") | |
| data_combined |> | |
| filter(convert == "convert") |> | |
| nrow() | |
| # total of 350 converts - not bad | |
| demos <- c( | |
| 'marital_rc', | |
| 'age_rc', | |
| 'educ_rc', | |
| 'race_rc', | |
| 'female', | |
| 'inc_rc', | |
| 'fertrec_rc', | |
| 'party_rc' | |
| ) | |
| dfs <- c() | |
| for (demo in demos){ | |
| dfs <- c(dfs, list(get_tabs(data |> filter(!is.na(convert)), demo) |> | |
| mutate(var = demo) |> | |
| relocate(var))) | |
| } | |
| (tmp <- data.table::rbindlist(dfs, use.names = FALSE)) | |
| # the converts... | |
| # - skews older than Mormons! | |
| # - skews less educated and lower income | |
| # - skews less white | |
| # - skews less Republican, but still very Republican | |
| # - fewer kids than Mormons, but still lots of kids | |
| get_tabs_total_pop <- function(data, demo){ | |
| data |> | |
| count(!!as.symbol(demo), wt=WEIGHT) |> | |
| drop_na() |> | |
| mutate(prop = n/sum(n)) |> | |
| select(-n) | |
| } | |
| demos <- c( | |
| 'marital_rc', | |
| 'age_rc', | |
| 'educ_rc', | |
| 'race_rc', | |
| 'female', | |
| 'inc_rc', | |
| 'fertrec_rc', | |
| 'party_rc' | |
| ) | |
| dfs <- c() | |
| for (demo in demos){ | |
| dfs <- c(dfs, list(get_tabs_total_pop(data, demo) |> | |
| mutate(var = demo) |> | |
| relocate(var))) | |
| } | |
| tmp2 <- tmp |> | |
| left_join(data.table::rbindlist(dfs, use.names = FALSE) |> | |
| dplyr::rename(overall_pop = prop), | |
| by = c("var", "marital_rc") | |
| ) |> | |
| select(var, marital_rc, overall_pop, born_in_church, convert) |> | |
| mutate(across(3:5, \(x) paste0(round(x, 2)*100, "%"))) | |
| tmp2 |> | |
| write.csv("~/Desktop/converts.csv", row.names = FALSE) | |
| # predicted probabilities | |
| len_yrs <- 1 | |
| predict_data_2024_14 <- data.frame( | |
| age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs), | |
| educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs), | |
| non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs), | |
| female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs), | |
| marital_rc = rep(1,len_yrs), #modal value | |
| inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs), | |
| fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs), | |
| party_rc = rep(1,len_yrs), | |
| former_relig = rep("Other",len_yrs) | |
| ) | |
| # Baseline Rate | |
| lbls <- c("baseline") | |
| get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") | |
| # by num children | |
| fertrec_rc <- seq(0,4,1) | |
| len_yrs <- length(fertrec_rc) | |
| predict_data_2024_14 <- data.frame( | |
| age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs), | |
| educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs), | |
| non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs), | |
| female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs), | |
| marital_rc = rep(1,len_yrs), #modal value | |
| inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs), | |
| fertrec_rc = fertrec_rc, | |
| party_rc = rep(3,len_yrs), | |
| former_relig = rep("Other",len_yrs) | |
| ) | |
| lbls <- c("child:0","child:1","child:2","child:3","child:4+") | |
| get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |> | |
| mutate(baseline_ratio = fit/min(fit)) | |
| # by income | |
| inc_rc <- seq(1,3,1) | |
| len_yrs <- length(inc_rc) | |
| predict_data_2024_14 <- data.frame( | |
| age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs), | |
| educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs), | |
| non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs), | |
| female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs), | |
| marital_rc = rep(1,len_yrs), #modal value | |
| inc_rc = inc_rc, | |
| fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs), | |
| party_rc = rep(3,len_yrs), | |
| former_relig = rep("Other",len_yrs) | |
| ) | |
| lbls <- c("income:below50","income:50-100","income:100+") | |
| get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |> | |
| mutate(baseline_ratio = fit/min(fit)) | |
| # by party | |
| party_rc <- seq(1,3,1) | |
| len_yrs <- length(party_rc) | |
| predict_data_2024_14 <- data.frame( | |
| age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs), | |
| educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs), | |
| non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs), | |
| female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs), | |
| marital_rc = rep(1,len_yrs), #modal value | |
| inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs), | |
| fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs), | |
| party_rc = party_rc, | |
| former_relig = rep("Other",len_yrs) | |
| ) | |
| lbls <- c("party:GOP","party:DEM", "party:Other") | |
| get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |> | |
| mutate(baseline_ratio = fit/min(fit)) | |
| # by former religion | |
| former_relig <- c("Protestant","Catholic","Unaffiliated","Other") | |
| len_yrs <- length(former_relig) | |
| predict_data_2024_14 <- data.frame( | |
| age_rc = rep(mean(reg_data_2$age_rc, na.rm = T),len_yrs), | |
| educ_rc = rep(mean(reg_data_2$educ_rc, na.rm = T),len_yrs), | |
| non_white = rep(mean(reg_data_2$non_white, na.rm = T),len_yrs), | |
| female = rep(mean(reg_data_2$female, na.rm=TRUE),len_yrs), | |
| marital_rc = rep(1,len_yrs), #modal value | |
| inc_rc = rep(mean(reg_data_2$inc_rc, na.rm = T),len_yrs), | |
| fertrec_rc = rep(mean(reg_data_2$fertrec_rc, na.rm = T),len_yrs), | |
| party_rc = rep(3,len_yrs), | |
| former_relig = former_relig | |
| ) | |
| lbls <- c("Protestant","Catholic","Unaffiliated","Other") | |
| get_predictions(model_24_14, predict_data_2024_14, lbls, "2014+2024 data") |> | |
| mutate(baseline_ratio = fit/min(fit)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment