Skip to content

Instantly share code, notes, and snippets.

@jmclawson
Last active July 21, 2024 14:09
Show Gist options
  • Save jmclawson/ac61b353b69f2a2f278fb1c265412427 to your computer and use it in GitHub Desktop.
Save jmclawson/ac61b353b69f2a2f278fb1c265412427 to your computer and use it in GitHub Desktop.
load_data <- function(){
if (!file.exists("data/scorecard.rds")) {
"https://ed-public-download.app.cloud.gov/downloads/College_Scorecard_Raw_Data_06102024.zip" |>
download_once() |>
utils::unzip(files = c("Most-Recent-Cohorts-Institution.csv", "CollegeScorecardDataDictionary.xlsx"),
exdir = "data")
scorecard <- readr::read_csv("data/Most-Recent-Cohorts-Institution.csv")
dictionary <- "data/CollegeScorecardDataDictionary.xlsx" |>
readxl::read_excel(sheet = "Institution_Data_Dictionary")
saveRDS(scorecard, "data/scorecard.rds")
saveRDS(dictionary, "data/dictionary.rds")
} else {
scorecard <- readRDS("data/scorecard.rds")
dictionary <- readRDS("data/dictionary.rds")
}
scorecard <<- scorecard
dictionary <<- dictionary
}
# from tmtyro
download_once <- function(
url,
filename = NULL,
destdir = "data"
) {
if(is.null(filename)){
the_filename <- url |> stringr::str_extract("[a-z A-Z 0-9 \\- _]+[.]{1,1}+[a-zA-Z]{1,4}$")
} else {
the_filename <- filename
}
if(!dir.exists(destdir)){
dir.create(destdir)
}
filepath <- file.path(destdir, the_filename)
if(!file.exists(filepath)) {
status <- 1
tryCatch(status <- utils::download.file(url, destfile = filepath), error = function(e) return(NULL))
if (status != 0) filepath <- NULL
}
invisible(filepath)
}
prep_columns <- function(
data = scorecard,
category = c(NA, "academics", "admissions",
"aid", "completion", "cost",
"earnings", "repayment", "root",
"school", "student"),
name = TRUE) {
df <- data
category <- match.arg(category)
if (!is.na(category)) {
if (name) {
cols <- dictionary |>
dplyr::filter(`VARIABLE NAME` == "UNITID" |
`VARIABLE NAME` == "INSTNM" |
`VARIABLE NAME` %in% category |
`dev-category` %in% category)
} else {
cols <- dictionary |>
dplyr::filter(`VARIABLE NAME` == "UNITID" |
`VARIABLE NAME` %in% category |
`dev-category` %in% category)
}
cols <- cols |>
dplyr::select(newname = `developer-friendly name`,
oldname = `VARIABLE NAME`) |>
tidyr::drop_na(newname)
} else {
cols <- dictionary |>
dplyr::select(newname = `developer-friendly name`,
oldname = `VARIABLE NAME`) |>
tidyr::drop_na(newname)
}
cols_vec <- c(cols$oldname) |>
setNames(cols$newname)
df |>
dplyr::select(1, dplyr::any_of(cols_vec))
}
recode_labels <- function(data) {
if ("UNITID" %in% colnames(data)){ # for original column names
dict <- dictionary |>
dplyr::rename(
name = `VARIABLE NAME`,
value = VALUE,
label = LABEL) |>
dplyr::select(name, value, label)
} else { # for renamed columns
dict <- dictionary |>
dplyr::rename(
name = `developer-friendly name`,
value = VALUE,
label = LABEL) |>
dplyr::select(name, value, label)
}
df <- data |>
tidyr::pivot_longer(
cols = 2:ncol(data),
values_transform = as.character) |>
dplyr::left_join(
dict |>
tidyr::fill(name) |>
dplyr::mutate(value = as.character(value)),
by = c("name", "value")) |>
dplyr::mutate(keep = dplyr::if_else(is.na(label), value, label))
numeric_cols <- dict |>
tidyr::drop_na(name) |>
dplyr::filter(is.na(label)) |>
dplyr::pull(name) |>
intersect(data |>
dplyr::select(dplyr::where(is.numeric)) |>
colnames())
df |>
dplyr::select(-c(value, label)) |>
tidyr::pivot_wider(values_from = keep) |>
dplyr::mutate(dplyr::across(dplyr::any_of(numeric_cols), as.numeric))
}
@jmclawson
Copy link
Author

Use:

library(dplyr)
load_data()

scorecard |> 
  filter(STABBR == "OH") |> 
  prep_columns(category = "school") |> 
  recode_labels() |> 
  View(title = "t")

scorecard |> 
  filter(RELAFFIL == 65) |> 
  prep_columns(category = "admis") |> 
  recode_labels() |> 
  View(title = "t")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment