# Source: GitHub gist TerrenKlein/Race.r (created Nov 30, 2016).
# QSS 30--Final Project
# Race Chart
## Load necessary packages
# NOTE: the original `rm(list = ls())` was removed. It only deletes objects in
# the global environment (attached packages and options are left untouched), so
# it does not give a clean session, and it silently wipes the workspace of
# anyone who source()s this file. Restart R for a truly fresh session instead.
library(dplyr)
library(tidyr)
library(ggplot2)
# Set the working directory. Every input CSV and every output file below is
# addressed by a path relative to it. The location can be overridden through the
# DARTMOUTH_RACE_DATA environment variable; otherwise the original default path
# is used.
data_dir <- Sys.getenv("DARTMOUTH_RACE_DATA")
if (!nzchar(data_dir)) {
  data_dir <- "~/Dropbox/DartmouthRace_Data"
}
if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir, call. = FALSE)
}
setwd(data_dir)
# Column names to pull out of each year's CSV. The raw files use three naming
# schemes; cleandata() chooses between them by year:
#   col  - years before 2010, except 2000
#   col2 - 2010 and later
#   col3 - 2000 only ("UG_" prefix instead of "UGDS_")
col <- c(
  "INSTNM",
  "UGDS_NRA", "UGDS_UNKN",
  "UGDS_WHITENH", "UGDS_BLACKNH",
  "UGDS_API", "UGDS_AIANOLD", "UGDS_HISPOLD"
)
col2 <- c(
  "INSTNM",
  "UGDS_NRA", "UGDS_UNKN",
  "UGDS_WHITE", "UGDS_BLACK",
  "UGDS_NHPI", "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN"
)
col3 <- c(
  "INSTNM",
  "UG_NRA", "UG_UNKN",
  "UG_WHITENH", "UG_BLACKNH",
  "UG_API", "UG_AIANOLD", "UG_HISPOLD"
)
# cleandata() reads one year's CSV ("Merged<arg1>_PP.csv", looked up in the
# working directory) and returns it with a column layout that is uniform across
# years.
#
# Argument:
#   arg1 - the year (a single number); it selects both the file name and the
#          column-naming scheme (col3 for 2000, col for other years before
#          2010, col2 from 2010 on).
#
# Returns a data frame with the columns
#   INSTNM, UGDS_NRA, UGDS_UNKN, UGDS_WHITE, UGDS_BLACK, UGDS_AIAN, UGDS_HISP,
#   UGDS_ASIAN2, year
# where every column except INSTNM is numeric and `year` is set to arg1.
#
# Stops with an error if the file or any expected column is missing.
cleandata <- function(arg1) {
  stopifnot(length(arg1) == 1L, !is.na(arg1))

  filename <- paste0("Merged", arg1, "_PP.csv")
  if (!file.exists(filename)) {
    stop("Input file not found: ", filename, call. = FALSE)
  }
  temp <- read.csv(filename, stringsAsFactors = FALSE)

  # The 2000 data is stored differently from the rest, and the naming changes
  # again from 2010 onwards.
  if (arg1 == 2000) {
    wanted <- col3
  } else if (arg1 < 2010) {
    wanted <- col
  } else {
    wanted <- col2
  }
  absent <- setdiff(wanted, names(temp))
  if (length(absent) > 0) {
    stop(
      filename, " is missing expected column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }
  temp <- temp[, wanted, drop = FALSE]

  # Make every column after INSTNM numeric. Entries that cannot be parsed as
  # numbers become NA (as.numeric() emits a coercion warning for them).
  temp[-1] <- lapply(temp[-1], as.numeric)

  if (arg1 >= 2010) {
    # From 2010 on the Asian category is split in two; add the parts back
    # together so the column matches the earlier years.
    temp$UGDS_ASIAN2 <- temp$UGDS_NHPI + temp$UGDS_ASIAN
  } else {
    # Map the pre-2010 names (both the "UG_" and the "UGDS_" variants) onto
    # the common names. Renaming by name avoids relying on column positions.
    legacy_names <- c(
      UG_NRA = "UGDS_NRA",
      UG_UNKN = "UGDS_UNKN",
      UG_WHITENH = "UGDS_WHITE",
      UG_BLACKNH = "UGDS_BLACK",
      UG_AIANOLD = "UGDS_AIAN",
      UG_HISPOLD = "UGDS_HISP",
      UG_API = "UGDS_ASIAN2",
      UGDS_WHITENH = "UGDS_WHITE",
      UGDS_BLACKNH = "UGDS_BLACK",
      UGDS_AIANOLD = "UGDS_AIAN",
      UGDS_HISPOLD = "UGDS_HISP",
      UGDS_API = "UGDS_ASIAN2"
    )
    hits <- names(temp) %in% names(legacy_names)
    names(temp)[hits] <- unname(legacy_names[names(temp)[hits]])
  }

  # Same column order for every year, so the yearly frames can be stacked.
  final_cols <- c(
    "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
    "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN2"
  )
  temp <- temp[, final_cols, drop = FALSE]

  temp$year <- arg1
  temp
}
# Read and standardise every year, then stack the yearly data frames into one.
# A single rbind() over the list replaces growing `alldata` inside a for loop,
# which re-copied all accumulated rows on each iteration and needed an extra
# read of the 2014 file just to create an empty template.
alldata <- do.call(rbind, lapply(1996:2014, cleandata))
# The 2009 data seems to be disorganized, so it is excluded until that has been
# looked into. (The filter previously compared against 200, which matches no
# year and therefore removed nothing.)
alldata <- alldata[alldata$year != 2009, ]
national <- alldata
## Create three data frames that contain the yearly demographic averages for
## the Ivy League, Dartmouth College, and the national data

# yearly_means() collapses institution-level rows into one row per year.
#   data  - a data frame with an INSTNM column, a `year` column and numeric
#           share columns
#   label - the value stored in the `Type` column of the result
# Rows containing any NA are dropped before averaging. The result has one row
# per year holding the mean of every remaining column, plus `Type`.
# summarise_all(mean) replaces the deprecated summarise_each(funs(mean)).
yearly_means <- function(data, label) {
  data %>%
    na.omit() %>%
    select(-INSTNM) %>%
    group_by(year) %>%
    summarise_all(mean) %>%
    mutate(Type = label)
}

# Institution names are matched exactly, so both spellings that appear in the
# list (upper case and mixed case) are included.
# NOTE(review): the University of Pennsylvania is not in this list, and
# Dartmouth is handled separately below. Confirm the omission of Penn is
# intentional.
ivy <- c(
  "BROWN UNIVERSITY", "COLUMBIA UNIVERSITY IN THE CITY OF NEW YORK",
  "CORNELL UNIVERSITY-ENDOWED COLLEGES", "HARVARD UNIVERSITY",
  "PRINCETON UNIVERSITY", "YALE UNIVERSITY", "Brown University",
  "Columbia University", "Cornell University", "Harvard University",
  "Princeton University", "Yale University"
)
ivyLeague <- yearly_means(filter(alldata, INSTNM %in% ivy), "Ivy League")

dart <- c("DARTMOUTH COLLEGE", "Dartmouth College")
dartmouth <- yearly_means(filter(alldata, INSTNM %in% dart), "Dartmouth")

# There is something up with the national data from 2008, so that year is left
# out of the national averages until it has been checked.
national <- yearly_means(filter(national, year != 2008), "National")
## Stack the three summaries, give the columns readable names, and reshape to
## long format (one row per year / Type / Race)
# (Disabled step kept from the original: df <- df[, -3])
df <- rbind(national, dartmouth, ivyLeague) %>%
  rename(
    International = UGDS_NRA,
    Unknown = UGDS_UNKN,
    Asian = UGDS_ASIAN2,
    White = UGDS_WHITE,
    Black = UGDS_BLACK,
    `American Indian or Alaskan Native` = UGDS_AIAN,
    Hispanic = UGDS_HISP
  ) %>%
  # Columns 2 to 8 are gathered into the Race / Percentage pair.
  gather("Race", "Percentage", 2:8, convert = TRUE)
# Optional fixed facet order (disabled):
# df$Race <- factor(df$Race, levels = c("White", "Unknown", "Black", "Asian", "Hispanic", "International", "American Indian or Alaskan Native"))

# Export for infogram
write.csv(df, "RaceHigherEd.csv")
## Build the faceted line chart (one panel per Race, one line per Type) and
## save it as a PNG

# Tick marks on every second year.
year_breaks <- seq(1996, 2014, by = 2)

# Dark background with light axis text and black grid lines and border.
chart_theme <- theme(
  plot.background = element_rect(fill = 'midnightblue'),
  panel.border = element_rect(fill = NA, colour = "black", size = 4),
  panel.grid.major = element_line(colour = "black", size = .3),
  panel.grid.minor = element_line(colour = "black", size = .3),
  plot.title = element_text(face = "bold", color = "gray90", size = 21),
  axis.title.x = element_text(face = "bold", color = "gray87", size = 18),
  axis.title.y = element_text(face = "bold", color = "gray87", size = 18),
  axis.text = element_text(colour = "gray87", size = 14),
  strip.text = element_text(size = 20),
  legend.title = element_blank(),
  aspect.ratio = 1
)

plot <- ggplot(df, aes(x = year, y = Percentage, color = Type)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  facet_wrap(~Race, ncol = 3, scales = "free_y") +
  scale_x_continuous(breaks = year_breaks) +
  scale_color_manual(values = c("chartreuse4", "coral3", "cornflowerblue")) +
  labs(
    x = "Years",
    y = "Percentage of Student Population",
    title = "Race in Higher Education: Contextualizing Dartmouth's Demographics (1996-2014)"
  ) +
  chart_theme

ggsave("HigherEdRace.png", plot)
# End of gist (GitHub page footer text removed).