# charliejhadley/replace-na-shenanigans.R

## replace-na-shenanigans.R
library(tidyverse)
library(janitor)
library(readxl)


# download.file() does not create missing directories, so make sure the
# destination folder exists before fetching anything into it.
dir.create("data-raw", showWarnings = FALSE, recursive = TRUE)

# Fetch the two enrollment workbooks. mode = "wb" keeps the binary .xlsx
# files intact on Windows.
# NOTE(review): data-raw/oregon-districts.xlsx is read further down but is not
# downloaded here - confirm it is supplied separately.
download.file(url = "https://github.com/rfortherestofus/going-deeper/raw/master/data-raw/enrollment-18-19.xlsx",
              mode = "wb",
              destfile = "data-raw/enrollment-18-19.xlsx")

download.file(url = "https://github.com/rfortherestofus/going-deeper/raw/master/data-raw/enrollment-17-18.xlsx",
              mode = "wb",
              destfile = "data-raw/enrollment-17-18.xlsx")

# Read the two enrollment workbooks. clean_names() converts the column headers
# to snake_case (a header starting with a digit gets an "x" prefix), which is
# the form the rest of this script refers to: `district_id` and the
# "x2018_19_" / "x2017_18_" prefixes. Headers that are already clean should
# pass through unchanged.
enrollment_18_19 <- read_excel(path = "data-raw/enrollment-18-19.xlsx",
                               sheet = "Sheet 1") %>%
  clean_names()

enrollment_17_18 <- read_excel(path = "data-raw/enrollment-17-18.xlsx",
                               sheet = "Sheet 1") %>%
  clean_names()

# District lookup table, later joined on `attending_district_institutional_id`.
oregon_districts <- read_excel(path = "data-raw/oregon-districts.xlsx",
                               sheet = "Sheet1") %>%
  clean_names()


# Exploratory pass over the 2018-19 sheet (result is printed, not stored):
# drop the grade, kindergarten and percent columns, reshape to one row per
# district and category, then turn the "-" placeholder into the string "0".
enrollment_18_19 %>%
  select(
    -contains("grade"),
    -contains("kindergarten"),
    -contains("percent")
  ) %>%
  pivot_longer(
    cols = -district_id,
    names_to = "race_ethnicity",
    values_to = "number_of_students"
  ) %>%
  mutate(
    # "-" -> NA first, then every NA -> "0" (the column stays character here)
    number_of_students = replace_na(na_if(number_of_students, "-"), "0")
  )


#' Tidy one year of district enrollment counts by race/ethnicity
#'
#' Drops every column whose name contains "grade", "kindergarten" or
#' "percent", reshapes the remaining columns (all except `district_id`) to one
#' row per district and category, converts the counts to numbers, relabels the
#' categories, and adds each category's share of the district total.
#'
#' @param raw_data A data frame with a `district_id` column plus one count
#'   column per category. Counts may be numeric or text; the text "-" and
#'   missing values are both treated as 0.
#' @param data_year Value stored in the `year` column of the result
#'   (e.g. "2018-2019").
#' @param race_ethnicity_remove_text Pattern stripped from the count column
#'   names (via `str_remove()`) before they are relabelled, e.g. "x2018_19_".
#' @return A tibble with columns `district_id`, `race_ethnicity`,
#'   `number_of_students`, `pct` and `year`. Category names not listed in the
#'   relabelling step become `NA`.
clean_enrollment_data <- function(raw_data, data_year, race_ethnicity_remove_text) {
  raw_data %>%
    select(-contains("grade")) %>%
    select(-contains("kindergarten")) %>%
    select(-contains("percent")) %>%
    # The count columns can arrive with different types (text when a column
    # contains "-", numeric otherwise). Cast them all to character while
    # pivoting so they can be combined into a single column.
    pivot_longer(cols = -district_id,
                 names_to = "race_ethnicity",
                 values_to = "number_of_students",
                 values_transform = list(number_of_students = as.character)) %>%
    # The column is character from here on, so comparing against "-" and
    # filling with "0" are type-safe; only then convert to numeric.
    mutate(number_of_students = na_if(number_of_students, "-"),
           number_of_students = replace_na(number_of_students, "0"),
           number_of_students = as.numeric(number_of_students)) %>%
    mutate(race_ethnicity = str_remove(race_ethnicity, race_ethnicity_remove_text)) %>%
    mutate(race_ethnicity = case_when(
      race_ethnicity == "american_indian_alaska_native" ~ "American Indian Alaska Native",
      race_ethnicity == "asian" ~ "Asian",
      race_ethnicity == "black_african_american" ~ "Black/African American",
      race_ethnicity == "hispanic_latino" ~ "Hispanic/Latino",
      race_ethnicity == "multiracial" ~ "Multi-Racial",
      race_ethnicity == "native_hawaiian_pacific_islander" ~ "Pacific Islander",
      race_ethnicity == "white" ~ "White"
    )) %>%
    # Share of each category within its district
    group_by(district_id) %>%
    mutate(pct = number_of_students / sum(number_of_students)) %>%
    ungroup() %>%
    mutate(year = data_year)
}

# Run the cleaning function once per school year; the prefix passed as
# race_ethnicity_remove_text is stripped from that year's column names.
enrollment_by_race_ethnicity_18_19 <- clean_enrollment_data(
  raw_data = enrollment_18_19,
  data_year = "2018-2019",
  race_ethnicity_remove_text = "x2018_19_"
)

enrollment_by_race_ethnicity_17_18 <- clean_enrollment_data(
  raw_data = enrollment_17_18,
  data_year = "2017-2018",
  race_ethnicity_remove_text = "x2017_18_"
)

# Stack both years (2017-18 rows first), attach the district lookup columns,
# rename the share column, and move the district identifiers to the front.
enrollment_by_race_ethnicity <- enrollment_by_race_ethnicity_17_18 %>%
  bind_rows(enrollment_by_race_ethnicity_18_19) %>%
  left_join(
    oregon_districts,
    by = c("district_id" = "attending_district_institutional_id")
  ) %>%
  rename(percent_of_total_at_school = pct) %>%
  select(district_id, district, everything())