Skip to content

Instantly share code, notes, and snippets.

@hughjonesd
Created April 21, 2026 07:03
Show Gist options
  • Select an option

  • Save hughjonesd/e8d23b6f4a8f5095b554a226e6c99edb to your computer and use it in GitHub Desktop.

Select an option

Save hughjonesd/e8d23b6f4a8f5095b554a226e6c99edb to your computer and use it in GitHub Desktop.
Code for Substack ISSP note
#### Setup ####
library(broom)
library(countrycode)
library(glue)
library(haven)
library(patchwork)
library(rnaturalearth)
library(tidyverse)
library(santoku) # out of order so as to preserve chop() from tidyr
issp <- haven::read_dta("data/issp-family-2022/ZA10000_v2-0-0.dta")
labels <- map(issp, \(x) attr(x, "label"))
labels <- tibble(var = names(labels), label = as.character(labels))
# most but not all variables with a _ are country-specific
ctry_specific <- str_detect(labels$label, "Country specific")
labels <- labels |> filter(! ctry_specific)
issp <- issp[, ! ctry_specific]
issp <- issp |>
mutate(
# this is a first guess!
across(everything(), ~ if_else(.x < 0, NA, .x)),
v19 = na_if(v19, 95), # 95 = "as many as possible"
v26 = na_if(v26, 960),
Sex = if_else(SEX == 1, "Male", "Female"),
Degree = if_else(EDULEVEL >= 6, "Degree", "No degree"),
Urban = if_else(URBRURAL <= 2, "City/suburbs", "Other"),
Status = chop(TOPBOT, c(Low = 0, Middle = 5, High = 7)),
Married = if_else(MARITAL %in% 1:2, "Married", "Other"),
`Marital Status` = case_match(MARITAL,
1:2 ~ "Married",
3:5 ~ "Previously married",
6 ~ "Never married"
),
Age = chop(as.numeric(AGE), seq(20, 65, 5), labels = lbl_discrete("-")),
countryname = countrycode::countrycode(country, "iso3n", "country.name.en"),
developing = countryname %in% c("India", "Philippines", "Russia",
"Thailand", "South Africa"),
# NB: Russia isn't a European country, for reasons we need not describe
europe = countryname %in% c("Austria", "Bulgaria", "Croatia", "Czechia",
"Denmark", "Finland", "France", "Germany",
"Greece", "Hungary", "Iceland", "Italy",
"Lithuania", "Netherlands", "Norway",
"Poland", "Slovakia", "Slovenia", "Spain",
"Sweden", "Switzerland"),
Continent = case_when(
europe ~ "Europe",
countryname == "United States" ~ "US",
.default = "Other"
)
) |>
mutate(.by = countryname,
# all countries get equal weight
# x 1000 to avoid small weights
eq_weight = WEIGHT_COM/sum(WEIGHT_COM) * 1000
) |>
# Partial out country/age-group effects
mutate(.by = c(countryname, Age),
kids_p = v63 - weighted.mean(v63, eq_weight, na.rm = TRUE)
) |>
mutate(
kids_p = kids_p + weighted.mean(v63, eq_weight, na.rm = TRUE)
)
## Weight survey countries
pops <- tidyr::world_bank_pop |>
filter(indicator == "SP.POP.TOTL") |>
select(country, `2017`) |>
rename(pop = `2017`) |>
mutate(
country = countrycode::countrycode(country, "wb", "iso3n")
)
issp <- issp |>
left_join(pops, by = "country") |>
mutate(.by = country,
# population weights. We don't use these now. See "Weighting" below
# for why:
pop_weight = WEIGHT_COM/sum(WEIGHT_COM) * pop,
)
# Some population weights are very extreme:
quantile(issp$pop_weight, 0:20/20, na.rm=T) |> round(3)
# here's one strategy:
max_wt <- quantile(issp$pop_weight, 0.9, na.rm = TRUE)
issp$pop_weight <- pmin(issp$pop_weight, max_wt)
# ordered_value: is a "value", and non-NA answers have ordinal measurement
labels$ordered_value = if_else(str_detect(labels$var, "^v\\d"), TRUE, FALSE)
labels$ordered_value[labels$var %in% c("v30", "v31", "v32")] <- FALSE
# 4 means "women should decide"
labels$ordered_value[labels$var %in% c("v9", "v10")] <- FALSE
# v54-55 are ambiguous, let's keep them (relatives vs friends)
labels$ordered_value[labels$var %in% c("v34", "v35", "v36", "v37",
"v38", "v39", "v40",
"v40", "v41", "v42", "v43",
"v44", "v45", "v46", "v47",
"v48", "v49", "v50",
"v56", "v60", "v61", "v62",
"v63", "v64", "v65", "v66",
"v67", "v68", "v69", "v70",
"v71", "v72")] <- FALSE
values <- labels$var[labels$ordered_value]
#### Principal components ####
## Edit values for missingness
MAX_MISSING <- 0.05
ctry_missingness <- issp |>
# we don't worry about missingness in developing countries
filter(! developing) |>
filter(Continent == "Europe") |>
select(country, all_of(values)) |>
summarize(.by = country,
across(all_of(values), ~ mean(is.na(.x)))
) |>
pivot_longer(cols = -country, names_to = "var", values_to = "prop_missing") |>
summarize(.by = var,
max_missing = max(prop_missing)
) |>
arrange(desc(max_missing))
labels <- labels |> left_join(ctry_missingness, by = "var")
# lots of missingness and collectively adds up to a lot of
# missingness of the pc1...
values <- labels$var[labels$ordered_value & labels$max_missing < MAX_MISSING]
## Run principal components
# do higher values of a variable mean "more liberal" (1) or "less liberal" (-1)
# some questions about father's role are ambiguous but in fact mostly vary
# between "mother" and "both" so liberalism is the "bothwards" direction.
# v25 ("role model") is truly ambiguous in my view
liberalism_direction <- c(
v1 = -1,
v2 = 1,
v3 = 1,
v6 = 1,
v7 = 1,
v8 = 1,
v11 = -1,
v13 = 1,
v14 = -1,
v15 = -1,
v16 = -1,
v21 = 1,
v23 = 1,
v24 = 1,
v25 = 0
)
fml <- reformulate(values)
pc <- princomp(fml, data = issp, na.action = na.exclude, cor = TRUE)
plot(pc)
value_loadings <- loadings(pc)[, 1:2] |>
as_tibble() |>
rename(
pc1 = "Comp.1",
pc2 = "Comp.2"
) |>
mutate(.before = 1,
var = values,
liberalism = liberalism_direction[var]
) |>
left_join(labels, by = "var")
# should be negative on all or almost all values; pc1 = conservative
value_loadings$liberalism * value_loadings$pc1
value_loadings |> arrange(desc(abs(pc1)))
issp[, c("pc1", "pc2")] <- predict(pc, issp)[, 1:2]
issp |>
filter(!developing) |>
filter(Continent == "Europe") |>
summarize(.by = country,
pc1_missing = mean(is.na(pc1))
) |>
arrange(desc(pc1_missing))
# We calculate European quantiles over the whole age range.
pc1_ecdf_europe <- ecdf(issp$pc1[issp$Continent == "Europe"])
issp$pc1_quantile <- pc1_ecdf_europe(issp$pc1)
# Note that some (just 3) people from continent Other will
# have 0 quantile because they are less than the European minimum
# v1: working mum can establish as warm a relationship as trad mum
# higher values = disagree, so = traditional attitudes
cor(issp$pc1, issp$v1, use="pair")
issp$liberal <- - issp$pc1
issp$liberal_quantile <- 1 - issp$pc1_quantile
summary(lm(v63 ~ pc1 + pc2, data = issp, subset = AGE >= 45,
weights = eq_weight))
summary(lm(v63 ~ pc1 + pc2 + countryname, data = issp,
subset = AGE >= 45, weights = eq_weight))
#### Table of values ####
labels |>
filter(var %in% values) |>
mutate(label = str_remove(label, "^Q.. ")) |>
select(label)
#### Country map ####
all_ctries <- ne_countries(returnclass = "sf", continent = "Europe")
all_ctries <- all_ctries[all_ctries$name != "Russia", ]
countries <- unique(issp$countryname)
all_ctries$included <- all_ctries$name %in% countries
all_ctries$excluded <- all_ctries$name %in% c("India", "Philippines", "Russia",
"Thailand", "South Africa", "Israel")
all_ctries$included[
all_ctries$name %in% c("Czechia", "United States of America")] <- TRUE
ggplot(all_ctries) +
geom_sf(aes(fill = included & ! excluded), linewidth = 0.2, colour = "white") +
coord_sf(
xlim = c(-25, 45),
ylim = c(34, 72),
expand = FALSE
) +
scale_fill_manual(values = c("TRUE" = "orange", "FALSE" = "grey")) +
theme_void() +
theme(legend.position = "none")
#### Plot preparation ####
min_age <- 45
max_age <- 59
weight_type <- "eq_weight"
weight_type <- "pop_weight"
issp_focus <- issp |>
filter(
between(AGE, min_age, max_age),
Continent == "Europe",
! is.na(liberal_quantile)
)
issp_focus$weight <- issp_focus[[weight_type]]
theme_set(theme_minimal() + theme(legend.position = "none"))
legend_theme <- theme(legend.position = "inside",
legend.position.inside = c(.8, .8),
legend.title = element_blank(),
legend.background = element_rect("white", linewidth = 0))
weight_cap <- if (weight_type == "eq_weight") {
"All countries have equal total weight."
} else if (weight_type == "pop_weight") {
"Respondents weighted by country population."
} else {
stop(weight_type, " unrecognized.")
}
caption <- paste("ISSP, European countries only.", weight_cap)
base_plot <- issp_focus |>
ggplot(aes(x = liberal_quantile, y = v63, weight = weight)) +
geom_hline(yintercept = 2.1, linetype = "dotted") +
geom_smooth(colour = "black", method = "loess", span = 1) +
coord_cartesian() +
labs(
title = "Number of children by quantile of liberal family values",
subtitle = glue("ISSP respondents {min_age}-{max_age}"),
y = "Children",
x = "",
caption = caption
)
#### The plots ####
base_plot
base_plot +
geom_smooth(aes(weight = eq_weight, linetype = "Equal weights"),
colour = "black",
method = "loess", span = 1) +
scale_linetype_manual(name = NULL,
values = c("Equal weights" = "22"), na.value = "solid") +
guides(linetype = "legend") +
theme(legend.position = "bottom")
country_coefs <- issp_focus |>
mutate(
liberal_std = c(scale(liberal))
) |>
split(issp_focus$countryname) |>
purrr::map(\(x) lm(v63 ~ liberal_std, weights = eq_weight, data = x)) |>
purrr::map(\(x) {
res <- broom::tidy(x, conf.int = TRUE)
res[res$term == "liberal_std", c("estimate", "conf.low", "conf.high")]
}) |>
purrr::list_rbind(names_to = "countryname") |>
mutate(
positive = estimate > 0
) |>
left_join(issp_focus[c("countryname", "pop")], by = "countryname")
country_coefs |>
mutate(
countryname = fct_rev(countryname)
) |>
ggplot(aes(y = countryname)) +
geom_vline(xintercept = 0, colour = "grey50", linetype = "dashed") +
geom_pointrange(aes(x = estimate, xmin = conf.low, xmax = conf.high,
colour = positive, size = pop)) +
scale_colour_manual(values = c("TRUE" = "black", "FALSE" = "red")) +
scale_size_area(name = "Population", max_size = 1.8,
labels = scales::label_number(scale_cut = scales::cut_short_scale())) +
guides(colour = "none") +
labs(
title = "Per-country effects of liberal family values on fertility",
subtitle = "Effects of a 1 standard deviation change. Lines show 95% confidence intervals",
y = "",
x = "Change in fertility",
caption = "ISSP, European countries only."
) +
theme(legend.position = "right")
status_plot <- base_plot +
geom_smooth(aes(colour = Status, group = Status), method = "loess",
span = 1, se = FALSE) +
scale_colour_manual(values = c("Low" = "indianred2", "Middle" = "orange3",
"High" = "slateblue4"),
na.translate = FALSE) +
labs(
subtitle = glue("ISSP respondents {min_age}-{max_age}. Density plot below."),
caption = ""
) +
theme(legend.title = element_text(), legend.position = "right")
status_density <- issp_focus |>
filter(! is.na(Status)) |>
ggplot(aes(liberal_quantile, weight = weight)) +
geom_density(aes(y = after_stat(wdensity), colour = Status, fill = Status),
bounds = c(0, 1), alpha = 0.4) +
scale_colour_manual(values = c("Low" = "indianred2", "Middle" = "orange3",
"High" = "slateblue4"),
na.translate = FALSE, aesthetics = c("colour", "fill")) +
labs(caption = caption) +
theme_void() +
theme(legend.position = "none")
status_plot / status_density + plot_layout(heights = c(3, 1))
educ_plot <- base_plot +
geom_smooth(aes(colour = Degree, group = Degree), method = "loess", span = 1,
se = FALSE) +
scale_colour_manual(values = c("Degree" = "indianred", "No degree" = "seagreen"),
na.translate = FALSE) +
coord_cartesian(ylim = c(1.4, 2.2)) +
labs(
subtitle = glue("ISSP respondents {min_age}-{max_age}. Density plot below."),
caption = ""
) +
theme(plot.caption = element_blank(), legend.position = "right")
educ_density <- issp_focus |>
ggplot(aes(x = liberal_quantile, weight = weight)) +
geom_density(aes(fill = Degree, y = after_stat(wdensity), colour = Degree),
bounds = c(0,1), alpha = 0.3) +
scale_colour_manual(values = c("Degree" = "indianred", "No degree" = "seagreen"),
na.translate = FALSE, aesthetics = c("colour", "fill")) +
labs(caption = caption) +
theme_void() +
theme(legend.position = "none")
educ_plot / educ_density + plot_layout(heights = c(3, 1))
issp_focus |>
mutate(
Married = as.numeric(MARITAL %in% 1:2)
) |>
ggplot(aes(x = liberal_quantile, y = Married, weight = weight)) +
geom_smooth(aes(linetype = "All respondents"), colour = "mediumslateblue") +
geom_smooth(data = \(x) filter(x, PARTLIV == 1),
aes(linetype = "Respondents living together"), colour = "violetred") +
scale_linetype_manual(name = "",
values = c("Respondents living together" = "22",
"All respondents" = "solid")) +
scale_y_continuous(labels = scales::label_percent()) +
labs(
title = "Proportion married by quantile of liberal family values",
subtitle = glue("ISSP respondents {min_age}-{max_age}"),
x = "",
y = "Prop. married",
caption = caption
) +
theme(legend.position = "bottom")
marstat_plot <- base_plot +
geom_smooth(
data = \(x) mutate(x,
`Marital Status` = fct_relevel(`Marital Status`, "Married",
"Previously married", "Never married")),
aes(colour = `Marital Status`, group = `Marital Status`), se = FALSE,
method = "loess", span = 1) +
scale_colour_manual(values = c("Married" = "orange",
"Previously married" = "steelblue2",
"Never married" = "seagreen3"),
na.translate = FALSE, aesthetics = c("colour", "fill")) +
labs(caption = "") +
coord_cartesian() +
theme(legend.position = "right", plot.caption = element_blank())
marstat_density <- issp_focus |>
ggplot(aes(x = liberal_quantile, weight = weight)) +
geom_density(aes(y = after_stat(wdensity)/1000, fill = `Marital Status`,
colour = `Marital Status`), bounds = c(0, 1), alpha = 0.5) +
scale_colour_manual(values = c("Married" = "orange",
"Previously married" = "steelblue2",
"Never married" = "seagreen3"),
na.translate = FALSE,
aesthetics = c("colour", "fill")) +
labs(caption = caption) +
theme_void() +
theme(legend.position = "none")
marstat_plot / marstat_density + plot_layout(heights = c(3, 1))
# Parity
issp_focus |>
mutate(
Children = chop(as.numeric(v63), 0:5, labels = lbl_discrete(last = "5+")),
Children = fct_rev(Children)
) |>
filter(! is.na(Children)) |>
ggplot(aes(x = liberal_quantile, weight = weight)) +
geom_density(aes(y = after_stat(wdensity), fill = Children), position = "fill",
colour = NA, alpha = 0.7) +
scale_fill_viridis_d() +
scale_y_continuous(labels = scales::label_percent()) +
labs(
title = "Exact number of children by quantile of liberal family values",
subtitle = glue("ISSP respondents {min_age}-{max_age}"),
x = "",
y = "",
caption = caption
) +
theme(legend.position = "right")
base_plot +
# global!
issp |>
filter(between(AGE, 45, 49), ! is.na(pop_weight)) |>
mutate(
Continent = if_else(countryname == "India", "India", Continent),
Continent = fct_relevel(Continent, "Europe", "US", "India", "Other")
) +
geom_density(aes(y = after_stat(wdensity)/5e7),
alpha = 0.6, fill = "wheat", colour = NA) +
aes(weight = pop_weight) +
facet_grid(cols = vars(Continent)) +
theme(legend.position = "right") +
coord_cartesian() +
labs(
subtitle = "ISSP respondents 45-59. Population density at bottom.",
caption = paste("ISSP, all countries.", weight_cap)
)
#### More plots, not used in article ####
age_plot <- base_plot +
geom_smooth(aes(colour = Age, fill = Age, group = Age),
alpha = 0.15, method = "loess", span = 1, se = TRUE) +
scale_colour_viridis_d(begin = 0.2, end = 0.8, option = "B",
aesthetics = c("colour", "fill")) +
legend_theme + theme(plot.caption = element_blank())
age_density <- issp_focus |>
ggplot(aes(x = liberal_quantile, weight = eq_weight)) +
geom_density(aes(fill = Age, y = after_stat(wdensity), colour = Age),
bounds = c(0,1), alpha = 0.15) +
scale_colour_viridis_d(begin = 0.2, end = 0.8, option = "B",
aesthetics = c("colour", "fill")) +
labs(caption = caption) +
theme_void() +
theme(legend.position = "none")
age_plot / age_density + plot_layout(heights = c(3, 1))
base_plot +
geom_smooth(aes(colour = Sex, group = Sex), method = "loess", span = 1,
se = FALSE) +
scale_colour_manual(values = c("Male" = "blue4", "Female" = "purple"),
na.translate = FALSE) +
legend_theme
base_plot +
geom_smooth(aes(colour = Urban, group = Urban),
method = "loess", span = 1, se = FALSE) +
scale_colour_manual(values = c("City/suburbs" = "steelblue", "Other" = "green4"),
na.translate = FALSE) +
coord_cartesian(ylim = c(1.5, 3)) +
legend_theme
# Younger ages
issp |>
filter(Continent == "Europe", ! is.na(liberal_quantile)) |>
mutate(
lq5 = chop_evenly(liberal_quantile, 5, labels = lbl_glue("{as.numeric(l)*100}-{as.numeric(r)*100}%")),
Age = chop(as.numeric(AGE), c(25, 30, 35, 40, 45), labels = lbl_discrete("-"))
) |>
summarize(.by = c(Age, lq5),
v63 = weighted.mean(v63, pop_weight, na.rm = TRUE)
) |>
ggplot(aes(lq5, v63, colour = Age, group = Age)) +
geom_line() +
geom_point() +
geom_text(
# aes(label = if_else(lq5 == "0-20%", Age, "")),
aes(label = if_else((as.numeric(lq5) %% 5 + as.numeric(Age) %% 2) == 1, Age, "")),
vjust = -0.5, hjust = 0.5
) +
scale_colour_viridis_d(end = 0.75) +
theme(legend.position = "none") +
labs(
title = "Number of children by age and liberal family values",
subtitle = "ISSP respondents, all ages. Five quantile groups.",
y = "Children",
x = "",
caption = caption
)
#### Weighting: the issues ####
effective_n <- function (w) {w <- na.omit(w); sum(w)^2/sum(w^2)}
effective_n(issp_focus$pop_weight)
effective_n(issp_focus$eq_weight)
effective_n(issp_focus$WEIGHT_COM)
# Basically, if you weight by country population, small countries
# count for little; and the US will dominate. But the US has a different
# fertility rate, plus is more centrist, so now you are just capturing
# that:
issp |>
filter(between(AGE, 45, 49)) |>
filter(! developing) |>
drop_na(pop) |>
ggplot(aes(x = liberal_quantile, y = after_stat(wdensity),
weight = pop_weight, group = countryname)) +
geom_density(aes(weight = 1, linetype = "none", colour = "none")) +
geom_density(aes(weight = pop_weight/1e5, linetype = "population",
colour = "population"), linewidth = 1.1) +
geom_density(aes(weight = WEIGHT_COM, linetype = "within-country",
colour = "within-country"), linewidth = 1.1) +
facet_wrap(vars(countryname)) +
scale_colour_discrete(name = "Weighting") +
scale_linetype_manual(name = "Weighting",
values = c("none" = "solid", "within-country" = "dotted", "population" = "dashed")) +
theme(legend.position = "bottom") +
labs(
title = "Sample densities of liberalism by country, different weightings",
subtitle = "Unweighted/weighted within-country/weighted by country population",
x = "",
y = "Density"
)
# In Europe, weights of different countries vary radically at
# different quantiles
weight_plot_2 <- issp |>
filter(between(AGE, 45, 49), Continent == "Europe") |>
ggplot(aes(x = liberal_quantile, weight = pop_weight)) +
geom_vline(xintercept = c(0.2, 0.35, 0.5), linetype = 2) +
geom_density(aes(fill = countryname, y = after_stat(wdensity)),
position = "fill", alpha = 0.6) +
labs(
title = "Density of liberalism quantiles by country, using population weights",
subtitle = "European countries",
y = "Weighted density",
x = ""
)
weight_plot_2
# not so bad if we use country-equal weights
weight_plot_2 + aes(weight = eq_weight) +
labs(title = "Density of liberalism quantiles by country, using country-equal weights")
# across ages, does it look better? Not really.
weight_plot_2 + filter(issp, between(AGE, 45, 60), Continent == "Europe")
# I think the answer is, we just don't use population weights. If we do
# we are effectively just looking at between-country differences.
# globally it's still pretty bad
weight_plot_2 + issp |>
filter(between(AGE, 45, 49), ! developing, ! is.na(pop_weight))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment