Skip to content

Instantly share code, notes, and snippets.

@btskinner
Created July 20, 2018 14:49
Show Gist options
  • Save btskinner/c8654a4369900d7914310fbb77103c14 to your computer and use it in GitHub Desktop.
Save btskinner/c8654a4369900d7914310fbb77103c14 to your computer and use it in GitHub Desktop.
Add variable / value labels to IPEDS data in R
################################################################################
##
## <PROJ> Add variable / value labels to IPEDS data in R
## <FILE> label_ipeds.r
## <AUTH> Benjamin Skinner @btskinner
## <INIT> 12 July 2018
##
################################################################################
## USAGE -----------------------------------------------------------------------
##
## (1) download relevant Stata data and label files from IPEDS (leave zipped)
##
## - Stata data: *_Data_Stata.zip
## - Stata labels: *_Stata.zip
##
## (2) change input/output directories below if desired
##
## (3) run
##
## NB: You can download zipped IPEDS files using < downloadipeds.r > script @
## https://github.com/btskinner/downloadipeds
## -----------------------------------------------------------------------------
## -----------------------------------------------------------------------------
## SET I/O DIRECTORIES (DEFAULT = everything in the current directory)
## -----------------------------------------------------------------------------
## If directory structure like this EXAMPLE:
##
## ./
## |__/r_data
## |
## |__/stata_data
## | |-- ADM2014_Data_Stata.zip
## | |-- ADM2015_Data_Stata.zip
## |
## |__/stata_labels
## | |-- ADM2014_Stata.zip
## | |-- ADM2015_Stata.zip
## |
## |-- label_ipeds.r
##
## Then:
##
## labs_ddir <- file.path('.', 'stata_labels')
## stata_ddir <- file.path('.', 'stata_data')
## r_ddir <- file.path('.', 'r_data')
labs_ddir <- file.path('.') # path to folder w/ zipped label files
stata_ddir <- file.path('.') # path to folder w/ zipped Stata data
r_ddir <- file.path('.') # path to output folder for Rdata files
## -----------------------------------------------------------------------------
## WANT NOISIER OUTPUT? (DEFAULT = FALSE)
## -----------------------------------------------------------------------------
## allow readr::read_csv() messages?
noisy <- FALSE
## -----------------------------------------------------------------------------
## LIBRARIES & FUNCTIONS
## -----------------------------------------------------------------------------
## libraries
libs <- c('tidyverse','labelled')
lapply(libs, require, character.only = TRUE)
read_zip <- function(zipfile, type, noisy) {
## create a name for the dir where we'll unzip
zipdir <- tempfile()
## create the dir using that name
dir.create(zipdir)
## unzip the file into the dir
unzip(zipfile, exdir = zipdir)
## get the files into the dir
files <- list.files(zipdir, recursive = TRUE)
## chose rv file if more than two b/c IPEDS likes revisions
if (length(files) > 1) {
file <- grep('*_rv_*', tolower(files), value = TRUE)
if (length(file) == 0) {
file <- grep('*\\.csv', files, value = TRUE)
}
} else {
file <- files[1]
}
## get the full name of the file
file <- file.path(zipdir, file)
## read the file
if (type == 'csv') {
if (noisy) {
out <- read_csv(file)
} else {
out <- suppressMessages(suppressWarnings(read_csv(file,
progress = FALSE)))
}
} else {
out <- readLines(file, encoding = 'latin1')
}
## remove tmp
unlink(zipdir, recursive = TRUE)
## return
return(out)
}
read_labels <- function(zipfile) {
## read in label file
labs <- read_zip(zipfile, 'do')
## get insheet line and add one to get next line
line_no <- grep('insheet', labs) + 1
## drop header
labs <- labs[line_no:length(labs)]
## drop first asterisk
labs <- gsub('^\\*(.+)$', '\\1', labs)
## return
return(labs)
}
assign_var_labels <- function(df, label_vec) {
## get variable label lines
varlabs <- grep('^label variable', label_vec, value = TRUE)
## if no labels, exit
if (length(varlabs) == 0) { return(df) }
## get variables that have labels
vars <- unlist(lapply(varlabs, function(x) { strsplit(x, ' ')[[1]][[3]] }))
## get the labels belonging to those variables
labs <- gsub('label variable .+"(.+)"', '\\1', varlabs)
## create list
varlabs <- setNames(as.list(labs), vars)
## assign to variables
var_label(df) <- varlabs
## return new data frame
return(df)
}
assign_val_labels <- function(df, label_vec) {
## get value label lines
vallabs <- grep('^label define', label_vec, value = TRUE)
## if no labels, exit
if (length(vallabs) == 0) { return(df) }
## get unique defined labels
labdefs <- unique(gsub('^label define (\\w+).+', '\\1', vallabs))
## get label value lines
vars <- grep('^label values', label_vec, value = TRUE)
## make list of variable plus its value definition
vardef <- setNames(as.list(gsub('^label values (\\w+).+', '\\1', vars)),
gsub('^label values \\w+ (\\w+)\\*?.*', '\\1', vars))
## make unique b/c of some double labels
vardef <- vardef[!duplicated(vardef)]
## loop through each variable
for (i in 1:length(labdefs)) {
## get label
labdef <- labdefs[i]
## skip if missing
if (!is.null(vardef[[labdef]])) {
## subset lines with this definition
pattern <- paste0('\\b', labdef, '\\b')
vallab <- grep(pattern, vallabs, value = TRUE)
## get values
pattern <- paste0('label define ', labdef, ' +(-?\\w+).+')
values <- gsub(pattern, '\\1', vallab)
## convert values to class of variable...hacky fix here
suppressWarnings(class(values) <- class(df[[vardef[[labdef]]]]))
## get labels
pattern <- paste0('label define ', labdef, ' .+"(.+)" ?(, ?add ?)?')
labels <- gsub(pattern, '\\1', vallab)
## make list
labels <- setNames(values, labels)
## label values
df[[vardef[[labdef]]]] <- labelled(df[[vardef[[labdef]]]], labels)
}
}
## return dataframe
return(df)
}
assign_imp_labels <- function(df, label_vec) {
## find line numbers surrounding imputation values
line_no_start <- grep('imputation.*variable(s)?', label_vec) + 1
## if no imputation labels, exit
if (length(line_no_start) == 0) { return(df) }
line_no_stop <- grep('^tab\\b', label_vec)[[1]] - 1
labs <- label_vec[line_no_start:line_no_stop]
## get variables starting with 'x'
vars <- df %>% select(starts_with('x')) %>% names(.)
## make list of each impute value and label
values <- gsub('(\\w\\b).+', '\\1', labs)
labels <- gsub('\\w\\b (.+)', '\\1', labs)
labels <- setNames(values, labels)
## loop through each imputed variable
for (v in vars) {
if (class(df[[v]]) == class(values)) {
df[[v]] <- labelled(df[[v]], labels)
}
}
## return dataframe
return(df)
}
## -----------------------------------------------------------------------------
## RUN BY LOOPING THROUGH FILES
## -----------------------------------------------------------------------------
## get list of zip files
stata_zip <- grep('*_Data_Stata\\.zip', list.files(stata_ddir), value = TRUE)
stata_lab <- grep('_Stata\\.zip', list.files(labs_ddir), value = TRUE)
## if stata_ddir and labs_ddir are the same, subset
if (identical(stata_ddir, labs_ddir)) {
stata_lab <- stata_lab[!(stata_lab %in% stata_zip)]
}
## loop
for (i in 1:length(stata_zip)) {
f <- stata_zip[i]
## message
message(paste0('Working with: ', f))
## get basename
fname <- gsub('(^.+)_Data_Stata.zip', '\\1', f)
## get label file
lab_file <- grep(paste0('^', fname, '_Stata'), stata_lab, value = TRUE)
## skip if missing label file
if (length(lab_file) == 0) {
message(paste0(' NO LABEL FILE FOR: ', fname, ', skipping'))
next
}
## read in data
df <- read_zip(file.path(stata_ddir, f), 'csv', noisy) %>%
rename_all(tolower)
## get labels
labs <- read_labels(file.path(labs_ddir, lab_file))
## assign variable labels
df <- assign_var_labels(df, labs)
## assign value labels
df <- assign_val_labels(df, labs)
## assign imputation labels
df <- assign_imp_labels(df, labs)
## rename data frame to match file name
assign(tolower(fname), df)
## save
save(list = tolower(fname),
file = file.path(r_ddir, paste0(fname, '.Rdata')))
## garbage collect every 10 loops...may help...idk
if (i %% 10 == 0) { gc() }
}
## =============================================================================
## END SCRIPT
################################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment