-
-
Save ryanburge/37272aa04b2dd1f4b00a689a611f98b3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load necessary library | |
library(stringr) | |
# Directory that holds the Stata (.dta) files this script works on.
path <- "E:/mtf/new/all"

# Full paths of every entry in `path` whose name ends in ".dta".
files <- list.files(path = path, full.names = TRUE, pattern = "\\.dta$")
# Rename one file so that the first two-digit run in its name becomes a
# four-digit year (e.g. "mtf95.dta" -> "mtf1995.dta").
#
# Two-digit values of 22 or less are read as 20xx, anything larger as 19xx.
# The renamed file stays in the same directory as the original.
#
# Args:
#   file_path: Path to a single file.
#
# Returns:
#   The new path when the rename succeeded; otherwise `file_path` unchanged
#   (a warning is raised in that case).
#
# NOTE(review): this is not idempotent. A name that already holds a
# four-digit year is expanded again ("mtf1995.dta" -> "mtf201995.dta"),
# because only the first two digits are inspected. The rest of this script
# looks datasets up as paste0("mtf20", year), so it appears to depend on that
# outcome -- confirm before changing the matching rule.
rename_files <- function(file_path) {
  base_name <- basename(file_path)

  # Locate the first two-digit run; -1 means the name contains none.
  year_match <- regexpr("[0-9]{2}", base_name)
  if (year_match == -1L) {
    # Without this guard the new name would be NA and the file would be
    # renamed to a file literally called "NA".
    warning(
      "No two-digit year found in '", base_name, "'; file left unchanged.",
      call. = FALSE
    )
    return(file_path)
  }

  short_year <- regmatches(base_name, year_match)
  century <- if (as.numeric(short_year) <= 22) "20" else "19"
  full_year <- paste0(century, short_year)

  # Replace only the first occurrence, i.e. the run that was just matched.
  new_base_name <- sub(short_year, full_year, base_name, fixed = TRUE)

  # Build the target next to the source instead of relying on a global `path`.
  new_file_path <- file.path(dirname(file_path), new_base_name)

  if (!file.rename(file_path, new_file_path)) {
    warning(
      "Could not rename '", file_path, "' to '", new_file_path, "'.",
      call. = FALSE
    )
    return(file_path)
  }

  new_file_path
}
# Rename every listed file. vapply() is used instead of sapply() so the
# result is always a character vector (named by the original paths), even
# when `files` is empty -- sapply() would return an empty list in that case.
new_files <- vapply(files, rename_files, character(1))

# Print the new file names
print(new_files)

# Set the path to the directory containing the .dta files (same directory
# as before; repeated so this part of the script can be run on its own).
path <- "E:/mtf/new/all"

# List all .dta files again so `files` reflects the names after renaming.
files <- list.files(path, pattern = "\\.dta$", full.names = TRUE)
# Derive an object name from a file path: the file's base name with its
# extension removed. No further sanitising is applied, so the result is only
# a syntactically valid R name when the file name already is one.
make_var_name <- function(filepath) {
  tools::file_path_sans_ext(basename(filepath))
}
# Import each file and bind it in the global environment under the name
# derived from its file name. A for loop is used because this runs purely
# for its side effect: a top-level lapply() would also auto-print the list
# of every imported dataset to the console.
# NOTE(review): import() is neither defined nor attached in this file --
# presumably rio::import(); confirm it is loaded before this point.
for (dta_file in files) {
  assign(make_var_name(dta_file), import(dta_file), envir = .GlobalEnv)
}

# Drop the object named `mtf` if the import created one; guarded so that a
# directory without such a file does not raise an "object not found" warning.
if (exists("mtf", envir = .GlobalEnv, inherits = FALSE)) {
  rm(mtf)
}

# Define the years you want to process
years <- 1995:2011
# Summarise the dating item (V195) for one survey year.
#
# Looks up the dataset named paste0("mtf20", year), keeps rows with V195 > 0,
# codes V195 values 1 and 2 as 1 and everything else as 0, and passes that
# indicator to mean_ci() weighted by V5 with ci = .84.
#
# Args:
#   year: Survey year used to build the dataset name.
#
# Returns:
#   The mean_ci() result with a character `year` column added, or NULL when
#   no object with the expected name exists (the same skip rule the other
#   process_* functions in this script use, so map_dfr() simply omits it).
process_mtf <- function(year) {
  dataset_name <- paste0("mtf20", year) # Construct dataset name
  if (!exists(dataset_name)) {
    return(NULL) # Skip if dataset doesn't exist
  }
  dataset <- get(dataset_name) # Retrieve dataset

  dataset %>%
    filter(V195 > 0, V195 != -9) %>% # Remove missing responses
    mutate(V195_recode = ifelse(V195 %in% c(1, 2), 1, 0)) %>%
    mean_ci(V195_recode, wt = V5, ci = .84) %>%
    mutate(year = as.character(year)) # Add year column
}
# Run process_mtf() over the first period and stack the per-year rows.
results1 <- years %>% map_dfr(process_mtf)

# Years covered by the second period.
years <- 2012:2021
# Summarise the dating item (V2195) for one survey year of the second period.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# the mean_ci() summary (weighted by ARCHIVE_WT, ci = .84) of an indicator
# that is 1 when V2195 is 1 or 2 and 0 otherwise, with a character `year`
# column added.
process_mtf <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  # Keep positive codes only (negative values mark missing responses).
  answered <- filter(get(object_name), V2195 > 0, V2195 != -9)
  recoded <- mutate(answered, V2195_recode = ifelse(V2195 %in% c(1, 2), 1, 0))
  estimate <- mean_ci(recoded, V2195_recode, wt = ARCHIVE_WT, ci = .84)

  mutate(estimate, year = year_label)
}
# Second period, then both periods stacked with `year` turned numeric so it
# can sit on a continuous x axis.
results2 <- years %>% map_dfr(process_mtf)
gg <- mutate(bind_rows(results1, results2), year = as.numeric(year))

# Line chart of the yearly share. theme_rb(), add_text() and save() are not
# defined in this file; save() is called with only a file name, so it is
# presumably a project helper masking base::save() -- confirm.
ggplot(gg, aes(x = year, y = mean, group = 1)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white", color = "firebrick") +
  theme_rb() +
  scale_x_continuous(breaks = seq(1995, 2021, by = 5)) + # Every 5 years
  scale_y_continuous(labels = percent, limits = c(0, .75)) +
  add_text(x = 1995, y = .305, word = "34%", sz = 7) +
  add_text(x = 2021, y = .74, word = "71%", sz = 7) +
  labs(
    x = "Year",
    y = "",
    title = "The Share of High School Seniors Who Go On a Date Once a Month or Less",
    caption = "@ryanburge\nData: Monitoring the Future, 1995-2021"
  )
save("mtf_no_dates.png")

# Year ranges for the two variable-naming schemes.
years_1 <- 1995:2011 # Uses V169 for attendance
years_2 <- 2012:2021 # Uses V2169 for attendance
# Dating item (V195) by religious attendance (V169) for one 1995-2011 year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# one mean_ci() row per attendance group (weighted by V5, ci = .84) with a
# character `year` column added.
process_mtf_1 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  get(object_name) %>%
    filter(V195 > 0, V195 != -9) %>% # Drop missing responses
    mutate(
      V195_recode = ifelse(V195 %in% c(1, 2), 1, 0), # 1 for codes 1 and 2
      att = V169
    ) %>%
    # Attendance codes 1-4 become an ordered set of labels.
    mutate(att = frcode(att == 1 ~ "Never",
                        att == 2 ~ "Rarely",
                        att == 3 ~ "Monthly",
                        att == 4 ~ "Weekly")) %>%
    group_by(att) %>%
    mean_ci(V195_recode, wt = V5, ci = .84) %>%
    mutate(year = year_label)
}
# Dating item (V2195) by religious attendance (V2169) for one 2012-2021 year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# one mean_ci() row per attendance group (weighted by ARCHIVE_WT, ci = .84)
# with a character `year` column added.
process_mtf_2 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  get(object_name) %>%
    filter(V2195 > 0, V2195 != -9) %>% # Drop missing responses
    mutate(
      V2195_recode = ifelse(V2195 %in% c(1, 2), 1, 0), # 1 for codes 1 and 2
      att = V2169
    ) %>%
    # Attendance codes 1-4 become an ordered set of labels.
    mutate(att = frcode(att == 1 ~ "Never",
                        att == 2 ~ "Rarely",
                        att == 3 ~ "Monthly",
                        att == 4 ~ "Weekly")) %>%
    group_by(att) %>%
    mean_ci(V2195_recode, wt = ARCHIVE_WT, ci = .84) %>%
    mutate(year = year_label)
}
# Apply function to all years and combine results
results_1 <- map_dfr(years_1, process_mtf_1)
results_2 <- map_dfr(years_2, process_mtf_2)

# Combine both periods into a single dataframe. Rows without an attendance
# category are dropped with an explicit is.na() test: the earlier
# `att != "NA"` comparison only removed them because comparing NA yields NA,
# which filter() happens to discard.
final_results <- bind_rows(results_1, results_2) %>%
  filter(!is.na(att)) %>%
  mutate(year = as.numeric(year))

# One line per attendance group. theme_rb() and save() are not defined in
# this file; save() is presumably a project helper masking base::save().
final_results %>%
  ggplot(., aes(x = year, y = mean, group = att, color = att)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white") +
  theme_rb(legend = TRUE) +
  scale_color_gdocs() +
  scale_y_continuous(labels = percent, limits = c(.25, .80)) +
  theme(plot.title = element_text(size = 15)) +
  scale_x_continuous(breaks = seq(1995, 2021, by = 5)) + # Show every 5 years
  labs(x = "Year", y = "", title = "The Share of High School Seniors Who Go On a Date Once a Month or Less by Religious Attendance",
       caption = "@ryanburge\nData: Monitoring the Future, 1995-2021")
save("mtf_no_dates_by_att.png")

# Define the years for each variable structure
years_1 <- 1995:2011 # Uses V194 for going out
years_2 <- 2012:2021 # Uses V2194 for going out
# Going-out item (V194) for one 1995-2011 survey year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# the mean_ci() summary (weighted by V5, ci = .84) of an indicator that is 1
# when V194 is 1 or 2 and 0 otherwise, with a character `year` column added.
process_mtf_1 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  # NOTE(review): this filter excludes 9, while the V2194 and dating filters
  # in this script exclude -9 -- confirm the missing-value code for V194.
  answered <- filter(get(object_name), V194 > 0, V194 != 9)
  recoded <- mutate(answered, V194_recode = ifelse(V194 %in% c(1, 2), 1, 0))
  estimate <- mean_ci(recoded, V194_recode, wt = V5, ci = .84)

  mutate(estimate, year = year_label)
}
# Going-out item (V2194) for one 2012-2021 survey year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# the mean_ci() summary (weighted by ARCHIVE_WT, ci = .84) of an indicator
# that is 1 when V2194 is 1 or 2 and 0 otherwise, with a character `year`
# column added.
process_mtf_2 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  # Keep positive codes only (negative values mark missing responses).
  answered <- filter(get(object_name), V2194 > 0, V2194 != -9)
  recoded <- mutate(answered, V2194_recode = ifelse(V2194 %in% c(1, 2), 1, 0))
  estimate <- mean_ci(recoded, V2194_recode, wt = ARCHIVE_WT, ci = .84)

  mutate(estimate, year = year_label)
}
# Apply function to all years and combine results
results_1 <- map_dfr(years_1, process_mtf_1)
results_2 <- map_dfr(years_2, process_mtf_2)

# Combine both periods into a single dataframe
gg <- bind_rows(results_1, results_2) %>% mutate(year = as.numeric(year))

# Line chart of the yearly share. The title previously read "Go On Out For
# Fun", a leftover from the dating chart's "Go On a Date" wording.
# theme_rb(), add_text() and save() are not defined in this file; save() is
# presumably a project helper masking base::save().
gg %>%
  ggplot(., aes(x = year, y = mean, group = 1)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white", color = "darkgreen") +
  theme_rb() +
  scale_y_continuous(labels = percent, limits = c(0, .50)) +
  add_text(x = 1995, y = .19, word = "22%", sz = 7) +
  add_text(x = 2021, y = .485, word = "46%", sz = 7) +
  scale_x_continuous(breaks = seq(1995, 2021, by = 5)) + # Show every 5 years
  labs(x = "Year", y = "", title = "The Share of High School Seniors Who Go Out For Fun or Recreation Once a Week or Less",
       caption = "@ryanburge\nData: Monitoring the Future, 1995-2021")
save("mtf_no_social.png")

library(dplyr)
library(purrr)

# Define years for each variable structure
years_1 <- 1995:2011 # Uses V194 for going out, V169 for attendance
years_2 <- 2012:2021 # Uses V2194 for going out, V2169 for attendance
# Going-out item (V194) by religious attendance (V169) for one 1995-2011
# year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# one mean_ci() row (weighted by V5, ci = .84) per attendance group, with
# `year` carried along as a character grouping column.
process_mtf_1 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }

  answered <- filter(get(object_name), V194 > 0, V194 != 9)

  recoded <- answered %>%
    mutate(V194_recode = ifelse(V194 %in% c(1, 2), 1, 0), att = V169) %>%
    # Attendance codes 1-4 become an ordered set of labels.
    mutate(att = frcode(att == 1 ~ "Never",
                        att == 2 ~ "Rarely",
                        att == 3 ~ "Monthly",
                        att == 4 ~ "Weekly"))

  # `year` is added as a grouping column so it survives the summary.
  grouped <- group_by(recoded, att, year = as.character(year))
  mean_ci(grouped, V194_recode, wt = V5, ci = .84)
}
# Going-out item (V2194) by religious attendance (V2169) for one 2012-2021
# year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# one mean_ci() row (weighted by ARCHIVE_WT, ci = .84) per attendance group,
# with `year` carried along as a character grouping column.
process_mtf_2 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }

  answered <- filter(get(object_name), V2194 > 0, V2194 != -9)

  recoded <- answered %>%
    mutate(V2194_recode = ifelse(V2194 %in% c(1, 2), 1, 0), att = V2169) %>%
    # Attendance codes 1-4 become an ordered set of labels.
    mutate(att = frcode(att == 1 ~ "Never",
                        att == 2 ~ "Rarely",
                        att == 3 ~ "Monthly",
                        att == 4 ~ "Weekly"))

  # `year` is added as a grouping column so it survives the summary.
  grouped <- group_by(recoded, att, year = as.character(year))
  mean_ci(grouped, V2194_recode, wt = ARCHIVE_WT, ci = .84)
}
# Apply function to all years and combine results
results_1 <- map_dfr(years_1, process_mtf_1)
results_2 <- map_dfr(years_2, process_mtf_2)

# Combine both periods into a single dataframe
final_results <- bind_rows(results_1, results_2) %>%
  filter(!is.na(att)) %>% # Remove NA attendance values
  mutate(year = as.numeric(year))

# View the combined dataset
print(final_results)

# One line per attendance group. The title previously read "Go On Out For
# Fun", a leftover from the dating chart's "Go On a Date" wording.
# theme_rb() and save() are not defined in this file; save() is presumably a
# project helper masking base::save().
final_results %>%
  ggplot(., aes(x = year, y = mean, group = att, color = att)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white") +
  theme_rb(legend = TRUE) +
  scale_color_gdocs() +
  scale_y_continuous(labels = percent, limits = c(0, .52)) +
  theme(plot.title = element_text(size = 15)) +
  scale_x_continuous(breaks = seq(1995, 2021, by = 5)) + # Show every 5 years
  labs(x = "Year", y = "", title = "The Share of High School Seniors Who Go Out For Fun or Recreation Once a Week or Less",
       subtitle = "By Religious Attendance",
       caption = "@ryanburge\nData: Monitoring the Future, 1995-2021")
save("mtf_no_social_by_att.png")

# Define the years for each variable structure
years_1 <- 1995:2011 # Uses V191 for job status
years_2 <- 2012:2022 # Uses V2191 for job status
# Job item (V191) for one 1995-2011 survey year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# the mean_ci() summary (weighted by V5, ci = .84) of an indicator that is 1
# when V191 equals 1 and 0 in every other case, with a character `year`
# column added.
#
# NOTE(review): rows whose V191 is NA or any other non-1 code are scored 0
# and kept, whereas the combined-index functions in this script set such
# values to NA -- confirm which treatment is intended here.
process_mtf_1 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  flagged <- mutate(
    get(object_name),
    job = case_when(V191 == 1 ~ 1, TRUE ~ 0)
  )
  estimate <- mean_ci(flagged, job, wt = V5, ci = .84)

  mutate(estimate, year = year_label)
}
# Job item (V2191) for one 2012-2022 survey year.
#
# Returns NULL when no object named paste0("mtf20", year) exists; otherwise
# the mean_ci() summary (weighted by ARCHIVE_WT, ci = .84) of an indicator
# that is 1 when V2191 equals 1 and 0 in every other case, with a character
# `year` column added.
#
# NOTE(review): rows whose V2191 is NA or any other non-1 code are scored 0
# and kept, whereas the combined-index functions in this script set such
# values to NA -- confirm which treatment is intended here.
process_mtf_2 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  flagged <- mutate(
    get(object_name),
    job = case_when(V2191 == 1 ~ 1, TRUE ~ 0)
  )
  estimate <- mean_ci(flagged, job, wt = ARCHIVE_WT, ci = .84)

  mutate(estimate, year = year_label)
}
# Run both period-specific functions and stack the results, with `year`
# turned numeric for the continuous x axis.
results_1 <- years_1 %>% map_dfr(process_mtf_1)
results_2 <- years_2 %>% map_dfr(process_mtf_2)
gg <- mutate(bind_rows(results_1, results_2), year = as.numeric(year))

# Line chart of the yearly share. theme_rb(), add_text() and save() are not
# defined in this file; save() is presumably a project helper masking
# base::save().
ggplot(gg, aes(x = year, y = mean, group = 1)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white", color = "darkorchid") +
  theme_rb() +
  scale_x_continuous(breaks = seq(1995, 2021, by = 5)) + # Every 5 years
  scale_y_continuous(labels = percent, limits = c(0, .42)) +
  add_text(x = 1995, y = .19, word = "22%", sz = 7) +
  add_text(x = 2022, y = .38, word = "35%", sz = 7) +
  labs(
    x = "Year",
    y = "",
    title = "On the average over the school year, how many hours per week do you work\nin a paid or unpaid job? - Share Saying Zero Hours",
    caption = "@ryanburge\nData: Monitoring the Future, 1995-2022"
  )
save("mtf_no_job.png")

library(dplyr)
library(purrr)

# Year ranges for the two variable-naming schemes.
years_1 <- 1995:2011 # Uses V195, V194, V191
years_2 <- 2012:2022 # Uses V2195, V2194, V2191
# Combined index (dating V195, going out V194, job V191) for one 1995-2011
# survey year.
#
# Each item becomes a 1/0 flag, or NA when it matches neither rule.
# `combined` is 1 when all three flags are 1. Rows where `combined` is NA are
# dropped before mean_ci() (weighted by V5, ci = .84). Returns NULL when no
# object named paste0("mtf20", year) exists.
process_mtf_1 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  flagged <- mutate(
    get(object_name),
    date_less = case_when(V195 %in% c(1, 2) ~ 1, V195 > 2 ~ 0, TRUE ~ NA_real_),
    social_less = case_when(V194 %in% c(1, 2) ~ 1, V194 > 2 ~ 0, TRUE ~ NA_real_),
    no_job = case_when(V191 == 1 ~ 1, V191 > 1 ~ 0, TRUE ~ NA_real_)
  )

  # NA & FALSE is FALSE in R, so a row with one missing flag still scores 0
  # when another flag is 0; it is only NA (and dropped below) otherwise.
  scored <- mutate(
    flagged,
    combined = ifelse(date_less == 1 & social_less == 1 & no_job == 1, 1, 0)
  )

  scored %>%
    filter(!is.na(combined)) %>%
    mean_ci(combined, wt = V5, ci = .84) %>%
    mutate(year = year_label)
}
# Combined index (dating V2195, going out V2194, job V2191) for one
# 2012-2022 survey year.
#
# Each item becomes a 1/0 flag, or NA when it matches neither rule.
# `combined` is 1 when all three flags are 1. Rows where `combined` is NA are
# dropped before mean_ci() (weighted by ARCHIVE_WT, ci = .84). Returns NULL
# when no object named paste0("mtf20", year) exists.
process_mtf_2 <- function(year) {
  object_name <- paste0("mtf20", year)
  if (!exists(object_name)) {
    return(NULL)
  }
  year_label <- as.character(year)

  flagged <- mutate(
    get(object_name),
    date_less = case_when(V2195 %in% c(1, 2) ~ 1, V2195 > 2 ~ 0, TRUE ~ NA_real_),
    social_less = case_when(V2194 %in% c(1, 2) ~ 1, V2194 > 2 ~ 0, TRUE ~ NA_real_),
    no_job = case_when(V2191 == 1 ~ 1, V2191 > 1 ~ 0, TRUE ~ NA_real_)
  )

  # NA & FALSE is FALSE in R, so a row with one missing flag still scores 0
  # when another flag is 0; it is only NA (and dropped below) otherwise.
  scored <- mutate(
    flagged,
    combined = ifelse(date_less == 1 & social_less == 1 & no_job == 1, 1, 0)
  )

  scored %>%
    filter(!is.na(combined)) %>%
    mean_ci(combined, wt = ARCHIVE_WT, ci = .84) %>%
    mutate(year = year_label)
}
# Run both period-specific functions and stack the results, with `year`
# turned numeric for the continuous x axis.
results_1 <- years_1 %>% map_dfr(process_mtf_1)
results_2 <- years_2 %>% map_dfr(process_mtf_2)
final_results <- mutate(bind_rows(results_1, results_2), year = as.numeric(year))

# Show the combined table.
print(final_results)

# Trend line of the combined index. theme_rb(), add_text() and save() are
# not defined in this file; save() is presumably a project helper masking
# base::save().
ggplot(final_results, aes(x = year, y = mean, group = 1)) +
  geom_line() +
  geom_point(stroke = 1, shape = 21, fill = "white", color = "azure4") +
  theme_rb() +
  scale_x_continuous(breaks = seq(1995, 2022, by = 5)) + # Every 5 years
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, .20)) +
  add_text(x = 1995, y = .05, word = "3.5%", sz = 7) +
  add_text(x = 2022, y = .15, word = "15.8%", sz = 7) +
  labs(
    x = "Year",
    y = "",
    title = "The Share of High School Seniors Who Rarely Date, Socialize, or Work",
    subtitle = "Less Than One Date Per Month, Socialize No More than Once a Week, and Have No Job",
    caption = "@ryanburge\nData: Monitoring the Future, 1995-2022"
  )
save("mtf_combined_social_trends.png")

library(dplyr)
# Combined index by religious attendance for the 1995 dataset.
#
# Args:
#   dataset: Data frame with columns V195, V194, V191, V169 and weight V5.
#
# Returns:
#   One mean_ci() row (ci = .84) per attendance group, with `year` set to
#   "1995". Unlike the yearly trend functions in this script, rows with an
#   NA `combined` value are not filtered out first.
process_1995 <- function(dataset) {
  scored <- mutate(
    dataset,
    date_less = case_when(V195 %in% c(1, 2) ~ 1, V195 > 2 ~ 0, TRUE ~ NA_real_),
    social_less = case_when(V194 %in% c(1, 2) ~ 1, V194 > 2 ~ 0, TRUE ~ NA_real_),
    no_job = case_when(V191 == 1 ~ 1, V191 > 1 ~ 0, TRUE ~ NA_real_),
    combined = ifelse(date_less == 1 & social_less == 1 & no_job == 1, 1, 0),
    att = V169,
    # Attendance codes 1-4 become an ordered set of labels.
    att = frcode(att == 1 ~ "Never",
                 att == 2 ~ "Rarely",
                 att == 3 ~ "Monthly",
                 att == 4 ~ "Weekly")
  )

  by_attendance <- group_by(scored, att)
  estimate <- mean_ci(by_attendance, combined, wt = V5, ci = .84)

  mutate(estimate, year = "1995")
}
# Combined index by religious attendance for the 2022 dataset.
#
# Args:
#   dataset: Data frame with columns V2195, V2194, V2191, V2169 and weight
#     ARCHIVE_WT.
#
# Returns:
#   One mean_ci() row (ci = .84) per attendance group, with `year` set to
#   "2022". Unlike the yearly trend functions in this script, rows with an
#   NA `combined` value are not filtered out first.
process_2022 <- function(dataset) {
  scored <- mutate(
    dataset,
    date_less = case_when(V2195 %in% c(1, 2) ~ 1, V2195 > 2 ~ 0, TRUE ~ NA_real_),
    social_less = case_when(V2194 %in% c(1, 2) ~ 1, V2194 > 2 ~ 0, TRUE ~ NA_real_),
    no_job = case_when(V2191 == 1 ~ 1, V2191 > 1 ~ 0, TRUE ~ NA_real_),
    combined = ifelse(date_less == 1 & social_less == 1 & no_job == 1, 1, 0),
    att = V2169,
    # Attendance codes 1-4 become an ordered set of labels.
    att = frcode(att == 1 ~ "Never",
                 att == 2 ~ "Rarely",
                 att == 3 ~ "Monthly",
                 att == 4 ~ "Weekly")
  )

  by_attendance <- group_by(scored, att)
  estimate <- mean_ci(by_attendance, combined, wt = ARCHIVE_WT, ci = .84)

  mutate(estimate, year = "2022")
}
# Fetch the two end-point datasets by name.
dataset_1995 <- get("mtf201995")
dataset_2022 <- get("mtf202022")

# Summarise each year by attendance group.
results_1995 <- process_1995(dataset_1995)
results_2022 <- process_2022(dataset_2022)

# Stack the two years, then drop every row that contains an NA.
final_results <- bind_rows(results_1995, results_2022)
gg <- na.omit(final_results)

# Dodged bars: attendance on the x axis, one bar per year. theme_rb(),
# lab_bar(), y_pct() and save() are not defined in this file; save() is
# presumably a project helper masking base::save().
ggplot(gg, aes(x = att, y = mean, fill = factor(year))) +
  geom_col(color = "black", position = "dodge") +
  theme_rb(legend = TRUE) +
  scale_fill_calc() +
  lab_bar(above = TRUE, pos = .01, sz = 7, type = mean) +
  y_pct() +
  theme(legend.text = element_text(size = 22)) +
  labs(
    x = "Religious Attendance",
    y = "",
    title = "Share of Seniors with Very Little Social Life",
    subtitle = "Broken Down by Religious Attendance",
    caption = "@ryanburge\nData: Monitoring the Future, 1995-2022"
  )
save("mtf_low_engagement_by_attendance_1995_2022.png", wd = 5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment