btskinner/label_ipeds.r

## label_ipeds.r
################################################################################
##
## <PROJ> Add variable / value labels to IPEDS data in R
## <FILE> label_ipeds.r
## <AUTH> Benjamin Skinner @btskinner
## <INIT> 12 July 2018
##
################################################################################

## USAGE -----------------------------------------------------------------------
##
## (1) download relevant Stata data and label files from IPEDS (leave zipped)
##
##     - Stata data:    *_Data_Stata.zip
##     - Stata labels:  *_Stata.zip
##
## (2) change input/output directories below if desired
##
## (3) run
##
## NB: You can download zipped IPEDS files using < downloadipeds.r > script @
##     https://github.com/btskinner/downloadipeds
## -----------------------------------------------------------------------------

## -----------------------------------------------------------------------------
## SET I/O DIRECTORIES (DEFAULT = everything in the current directory)
## -----------------------------------------------------------------------------

##  If directory structure like this EXAMPLE:
##
##  ./
##  |__/r_data
##  |
##  |__/stata_data
##  |    |-- ADM2014_Data_Stata.zip
##  |    |-- ADM2015_Data_Stata.zip
##  |
##  |__/stata_labels
##  |    |-- ADM2014_Stata.zip
##  |    |-- ADM2015_Stata.zip
##  |
##  |-- label_ipeds.r
##
##  Then:
##
##  labs_ddir <- file.path('.', 'stata_labels')
##  stata_ddir <- file.path('.', 'stata_data')
##  r_ddir <- file.path('.', 'r_data')

labs_ddir <- file.path('.')      # path to folder w/ zipped label files
stata_ddir <- file.path('.')     # path to folder w/ zipped Stata data
r_ddir <- file.path('.')         # path to output folder for Rdata files

## -----------------------------------------------------------------------------
## WANT NOISIER OUTPUT? (DEFAULT = FALSE)
## -----------------------------------------------------------------------------

## allow readr::read_csv() messages?
noisy <- FALSE

## -----------------------------------------------------------------------------
## LIBRARIES & FUNCTIONS
## -----------------------------------------------------------------------------

## libraries
libs <- c('tidyverse','labelled')
lapply(libs, require, character.only = TRUE)

read_zip <- function(zipfile, type, noisy) {
    ## create a name for the dir where we'll unzip
    zipdir <- tempfile()
    ## create the dir using that name
    dir.create(zipdir)
    ## unzip the file into the dir
    unzip(zipfile, exdir = zipdir)
    ## get the files into the dir
    files <- list.files(zipdir, recursive = TRUE)
    ## chose rv file if more than two b/c IPEDS likes revisions
    if (length(files) > 1) {
        file <- grep('*_rv_*', tolower(files), value = TRUE)
        if (length(file) == 0) {
            file <- grep('*\\.csv', files, value = TRUE)
        }
    } else {
        file <- files[1]
    }
    ## get the full name of the file
    file <- file.path(zipdir, file)
    ## read the file
    if (type == 'csv') {
        if (noisy) {
            out <- read_csv(file)
        } else {
            out <- suppressMessages(suppressWarnings(read_csv(file,
                                                              progress = FALSE)))
        }
    } else {
        out <- readLines(file, encoding = 'latin1')
    }
    ## remove tmp
    unlink(zipdir, recursive = TRUE)
    ## return
    return(out)
}

read_labels <- function(zipfile) {
    ## read in label file
    labs <- read_zip(zipfile, 'do')
    ## get insheet line and add one to get next line
    line_no <- grep('insheet', labs) + 1
    ## drop header
    labs <- labs[line_no:length(labs)]
    ## drop first asterisk
    labs <- gsub('^\\*(.+)$', '\\1', labs)
    ## return
    return(labs)
}

assign_var_labels <- function(df, label_vec) {
    ## get variable label lines
    varlabs <- grep('^label variable', label_vec, value = TRUE)
    ## if no labels, exit
    if (length(varlabs) == 0) { return(df) }
    ## get variables that have labels
    vars <- unlist(lapply(varlabs, function(x) { strsplit(x, ' ')[[1]][[3]] }))
    ## get the labels belonging to those variables
    labs <- gsub('label variable .+"(.+)"', '\\1', varlabs)
    ## create list
    varlabs <- setNames(as.list(labs), vars)
    ## assign to variables
    var_label(df) <- varlabs
    ## return new data frame
    return(df)
}

assign_val_labels <- function(df, label_vec) {
    ## get value label lines
    vallabs <- grep('^label define', label_vec, value = TRUE)
    ## if no labels, exit
    if (length(vallabs) == 0) { return(df) }
    ## get unique defined labels
    labdefs <- unique(gsub('^label define (\\w+).+', '\\1', vallabs))
    ## get label value lines
    vars <- grep('^label values', label_vec, value = TRUE)
    ## make list of variable plus its value definition
    vardef <- setNames(as.list(gsub('^label values (\\w+).+', '\\1', vars)),
                       gsub('^label values \\w+ (\\w+)\\*?.*', '\\1', vars))
    ## make unique b/c of some double labels
    vardef <- vardef[!duplicated(vardef)]

    ## loop through each variable
    for (i in 1:length(labdefs)) {
        ## get label
        labdef <- labdefs[i]
        ## skip if missing
        if (!is.null(vardef[[labdef]])) {
            ## subset lines with this definition
            pattern <- paste0('\\b', labdef, '\\b')
            vallab <- grep(pattern, vallabs, value = TRUE)
            ## get values
            pattern <- paste0('label define ', labdef, ' +(-?\\w+).+')
            values <- gsub(pattern, '\\1', vallab)
            ## convert values to class of variable...hacky fix here
            suppressWarnings(class(values) <- class(df[[vardef[[labdef]]]]))
            ## get labels
            pattern <- paste0('label define ', labdef, ' .+"(.+)" ?(, ?add ?)?')
            labels <- gsub(pattern, '\\1', vallab)
            ## make list
            labels <- setNames(values, labels)
            ## label values
            df[[vardef[[labdef]]]] <- labelled(df[[vardef[[labdef]]]], labels)
        }
    }
    ## return dataframe
    return(df)
}

assign_imp_labels <- function(df, label_vec) {
    ## find line numbers surrounding imputation values
    line_no_start <- grep('imputation.*variable(s)?', label_vec) + 1
    ## if no imputation labels, exit
    if (length(line_no_start) == 0) { return(df) }
    line_no_stop <- grep('^tab\\b', label_vec)[[1]] - 1
    labs <- label_vec[line_no_start:line_no_stop]
    ## get variables starting with 'x'
    vars <- df %>% select(starts_with('x')) %>% names(.)
    ## make list of each impute value and label
    values <- gsub('(\\w\\b).+', '\\1', labs)
    labels <- gsub('\\w\\b (.+)', '\\1', labs)
    labels <- setNames(values, labels)
    ## loop through each imputed variable
    for (v in vars) {
        if (class(df[[v]]) == class(values)) {
            df[[v]] <- labelled(df[[v]], labels)
        }
    }
    ## return dataframe
    return(df)
}

## -----------------------------------------------------------------------------
## RUN BY LOOPING THROUGH FILES
## -----------------------------------------------------------------------------

## get list of zip files
stata_zip <- grep('*_Data_Stata\\.zip', list.files(stata_ddir), value = TRUE)
stata_lab <- grep('_Stata\\.zip', list.files(labs_ddir), value = TRUE)

## if stata_ddir and labs_ddir are the same, subset
if (identical(stata_ddir, labs_ddir)) {
    stata_lab <- stata_lab[!(stata_lab %in% stata_zip)]
}

## loop
for (i in 1:length(stata_zip)) {
    f <- stata_zip[i]
    ## message
    message(paste0('Working with: ', f))
    ## get basename
    fname <- gsub('(^.+)_Data_Stata.zip', '\\1', f)
    ## get label file
    lab_file <- grep(paste0('^', fname, '_Stata'), stata_lab, value = TRUE)
    ## skip if missing label file
    if (length(lab_file) == 0) {
        message(paste0('  NO LABEL FILE FOR: ', fname, ', skipping'))
        next
    }
    ## read in data
    df <- read_zip(file.path(stata_ddir, f), 'csv', noisy) %>%
        rename_all(tolower)
    ## get labels
    labs <- read_labels(file.path(labs_ddir, lab_file))
    ## assign variable labels
    df <- assign_var_labels(df, labs)
    ## assign value labels
    df <- assign_val_labels(df, labs)
    ## assign imputation labels
    df <- assign_imp_labels(df, labs)
    ## rename data frame to match file name
    assign(tolower(fname), df)
    ## save
    save(list = tolower(fname),
         file = file.path(r_ddir, paste0(fname, '.Rdata')))
    ## garbage collect every 10 loops...may help...idk
    if (i %% 10 == 0) { gc() }
}

## =============================================================================
## END SCRIPT
################################################################################
	################################################################################
	##
	## <PROJ> Add variable / value labels to IPEDS data in R
	## <FILE> label_ipeds.r
	## <AUTH> Benjamin Skinner @btskinner
	## <INIT> 12 July 2018
	##
	################################################################################

	## USAGE -----------------------------------------------------------------------
	##
	## (1) download relevant Stata data and label files from IPEDS (leave zipped)
	##
	## - Stata data: *_Data_Stata.zip
	## - Stata labels: *_Stata.zip
	##
	## (2) change input/output directories below if desired
	##
	## (3) run
	##
	## NB: You can download zipped IPEDS files using < downloadipeds.r > script @
	## https://github.com/btskinner/downloadipeds
	## -----------------------------------------------------------------------------

	## -----------------------------------------------------------------------------
	## SET I/O DIRECTORIES (DEFAULT = everything in the current directory)
	## -----------------------------------------------------------------------------

	## If directory structure like this EXAMPLE:
	##
	## ./
	## \|__/r_data
	## \|
	## \|__/stata_data
	## \| \|-- ADM2014_Data_Stata.zip
	## \| \|-- ADM2015_Data_Stata.zip
	## \|
	## \|__/stata_labels
	## \| \|-- ADM2014_Stata.zip
	## \| \|-- ADM2015_Stata.zip
	## \|
	## \|-- label_ipeds.r
	##
	## Then:
	##
	## labs_ddir <- file.path('.', 'stata_labels')
	## stata_ddir <- file.path('.', 'stata_data')
	## r_ddir <- file.path('.', 'r_data')

	labs_ddir <- file.path('.') # path to folder w/ zipped label files
	stata_ddir <- file.path('.') # path to folder w/ zipped Stata data
	r_ddir <- file.path('.') # path to output folder for Rdata files

	## -----------------------------------------------------------------------------
	## WANT NOISIER OUTPUT? (DEFAULT = FALSE)
	## -----------------------------------------------------------------------------

	## allow readr::read_csv() messages?
	noisy <- FALSE

	## -----------------------------------------------------------------------------
	## LIBRARIES & FUNCTIONS
	## -----------------------------------------------------------------------------

	## libraries
	libs <- c('tidyverse','labelled')
	lapply(libs, require, character.only = TRUE)

	read_zip <- function(zipfile, type, noisy) {
	## create a name for the dir where we'll unzip
	zipdir <- tempfile()
	## create the dir using that name
	dir.create(zipdir)
	## unzip the file into the dir
	unzip(zipfile, exdir = zipdir)
	## get the files into the dir
	files <- list.files(zipdir, recursive = TRUE)
	## chose rv file if more than two b/c IPEDS likes revisions
	if (length(files) > 1) {
	file <- grep('_rv_', tolower(files), value = TRUE)
	if (length(file) == 0) {
	file <- grep('*\\.csv', files, value = TRUE)
	}
	} else {
	file <- files[1]
	}
	## get the full name of the file
	file <- file.path(zipdir, file)
	## read the file
	if (type == 'csv') {
	if (noisy) {
	out <- read_csv(file)
	} else {
	out <- suppressMessages(suppressWarnings(read_csv(file,
	progress = FALSE)))
	}
	} else {
	out <- readLines(file, encoding = 'latin1')
	}
	## remove tmp
	unlink(zipdir, recursive = TRUE)
	## return
	return(out)
	}

	read_labels <- function(zipfile) {
	## read in label file
	labs <- read_zip(zipfile, 'do')
	## get insheet line and add one to get next line
	line_no <- grep('insheet', labs) + 1
	## drop header
	labs <- labs[line_no:length(labs)]
	## drop first asterisk
	labs <- gsub('^\\*(.+)$', '\\1', labs)
	## return
	return(labs)
	}

	assign_var_labels <- function(df, label_vec) {
	## get variable label lines
	varlabs <- grep('^label variable', label_vec, value = TRUE)
	## if no labels, exit
	if (length(varlabs) == 0) { return(df) }
	## get variables that have labels
	vars <- unlist(lapply(varlabs, function(x) { strsplit(x, ' ')[[1]][[3]] }))
	## get the labels belonging to those variables
	labs <- gsub('label variable .+"(.+)"', '\\1', varlabs)
	## create list
	varlabs <- setNames(as.list(labs), vars)
	## assign to variables
	var_label(df) <- varlabs
	## return new data frame
	return(df)
	}

	assign_val_labels <- function(df, label_vec) {
	## get value label lines
	vallabs <- grep('^label define', label_vec, value = TRUE)
	## if no labels, exit
	if (length(vallabs) == 0) { return(df) }
	## get unique defined labels
	labdefs <- unique(gsub('^label define (\\w+).+', '\\1', vallabs))
	## get label value lines
	vars <- grep('^label values', label_vec, value = TRUE)
	## make list of variable plus its value definition
	vardef <- setNames(as.list(gsub('^label values (\\w+).+', '\\1', vars)),
	gsub('^label values \\w+ (\\w+)\\?.', '\\1', vars))
	## make unique b/c of some double labels
	vardef <- vardef[!duplicated(vardef)]

	## loop through each variable
	for (i in 1:length(labdefs)) {
	## get label
	labdef <- labdefs[i]
	## skip if missing
	if (!is.null(vardef[[labdef]])) {
	## subset lines with this definition
	pattern <- paste0('\\b', labdef, '\\b')
	vallab <- grep(pattern, vallabs, value = TRUE)
	## get values
	pattern <- paste0('label define ', labdef, ' +(-?\\w+).+')
	values <- gsub(pattern, '\\1', vallab)
	## convert values to class of variable...hacky fix here
	suppressWarnings(class(values) <- class(df[[vardef[[labdef]]]]))
	## get labels
	pattern <- paste0('label define ', labdef, ' .+"(.+)" ?(, ?add ?)?')
	labels <- gsub(pattern, '\\1', vallab)
	## make list
	labels <- setNames(values, labels)
	## label values
	df[[vardef[[labdef]]]] <- labelled(df[[vardef[[labdef]]]], labels)
	}
	}
	## return dataframe
	return(df)
	}

	assign_imp_labels <- function(df, label_vec) {
	## find line numbers surrounding imputation values
	line_no_start <- grep('imputation.*variable(s)?', label_vec) + 1
	## if no imputation labels, exit
	if (length(line_no_start) == 0) { return(df) }
	line_no_stop <- grep('^tab\\b', label_vec)[[1]] - 1
	labs <- label_vec[line_no_start:line_no_stop]
	## get variables starting with 'x'
	vars <- df %>% select(starts_with('x')) %>% names(.)
	## make list of each impute value and label
	values <- gsub('(\\w\\b).+', '\\1', labs)
	labels <- gsub('\\w\\b (.+)', '\\1', labs)
	labels <- setNames(values, labels)
	## loop through each imputed variable
	for (v in vars) {
	if (class(df[[v]]) == class(values)) {
	df[[v]] <- labelled(df[[v]], labels)
	}
	}
	## return dataframe
	return(df)
	}

	## -----------------------------------------------------------------------------
	## RUN BY LOOPING THROUGH FILES
	## -----------------------------------------------------------------------------

	## get list of zip files
	stata_zip <- grep('*_Data_Stata\\.zip', list.files(stata_ddir), value = TRUE)
	stata_lab <- grep('_Stata\\.zip', list.files(labs_ddir), value = TRUE)

	## if stata_ddir and labs_ddir are the same, subset
	if (identical(stata_ddir, labs_ddir)) {
	stata_lab <- stata_lab[!(stata_lab %in% stata_zip)]
	}

	## loop
	for (i in 1:length(stata_zip)) {
	f <- stata_zip[i]
	## message
	message(paste0('Working with: ', f))
	## get basename
	fname <- gsub('(^.+)_Data_Stata.zip', '\\1', f)
	## get label file
	lab_file <- grep(paste0('^', fname, '_Stata'), stata_lab, value = TRUE)
	## skip if missing label file
	if (length(lab_file) == 0) {
	message(paste0(' NO LABEL FILE FOR: ', fname, ', skipping'))
	next
	}
	## read in data
	df <- read_zip(file.path(stata_ddir, f), 'csv', noisy) %>%
	rename_all(tolower)
	## get labels
	labs <- read_labels(file.path(labs_ddir, lab_file))
	## assign variable labels
	df <- assign_var_labels(df, labs)
	## assign value labels
	df <- assign_val_labels(df, labs)
	## assign imputation labels
	df <- assign_imp_labels(df, labs)
	## rename data frame to match file name
	assign(tolower(fname), df)
	## save
	save(list = tolower(fname),
	file = file.path(r_ddir, paste0(fname, '.Rdata')))
	## garbage collect every 10 loops...may help...idk
	if (i %% 10 == 0) { gc() }
	}

	## =============================================================================
	## END SCRIPT
	################################################################################