Created
November 30, 2016 17:19
-
-
Save TerrenKlein/de4c8c9d1dd49c3664d157d21458cdb9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# QSS 30--Final Project
# Race Chart

## Load necessary packages
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# deletes whatever the person running the script had in their session and
# still does not give a clean R process (loaded packages and options persist).
library(dplyr)
library(tidyr)
library(ggplot2)

# Setting working directory: the yearly CSV inputs are read from here and the
# CSV/PNG outputs are written here, both through relative file names.
# NOTE(review): this path is machine-specific; consider running the script
# from the data directory (or an RStudio project) instead of calling setwd().
setwd("~/Dropbox/DartmouthRace_Data")

# Three different column name vectors, one per naming scheme used in the
# yearly files.
# col: UGDS_* names with the *NH / *OLD / API race categories.
col <- c(
  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITENH", "UGDS_BLACKNH",
  "UGDS_API", "UGDS_AIANOLD", "UGDS_HISPOLD"
)
# col2: UGDS_* names with separate NHPI and ASIAN categories.
col2 <- c(
  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
  "UGDS_NHPI", "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN"
)
# col3: same categories as col, but with the UG_ prefix.
col3 <- c(
  "INSTNM", "UG_NRA", "UG_UNKN", "UG_WHITENH", "UG_BLACKNH",
  "UG_API", "UG_AIANOLD", "UG_HISPOLD"
)
# cleandata reads one year's CSV and returns it with uniform column names.
#
# Reads "Merged<arg1>_PP.csv" from the working directory, keeps the
# institution name plus the race/ethnicity share columns, converts the share
# columns to numeric, and renames/reorders them so every year has the same
# layout.
#
# arg1: the year identifying the file. It is also compared against 2000 and
#       2010 to decide which column-naming scheme the file uses.
#
# Relies on the column-name vectors col, col2 and col3 defined at file level.
#
# Returns a data frame with columns INSTNM, UGDS_NRA, UGDS_UNKN, UGDS_WHITE,
# UGDS_BLACK, UGDS_AIAN, UGDS_HISP, UGDS_ASIAN2 and year (in that order).
cleandata <- function(arg1) {
  filename <- paste0("Merged", arg1, "_PP.csv")
  temp <- read.csv(filename, stringsAsFactors = FALSE)

  # The 2000 data is stored differently than the rest (UG_ prefix); other
  # years before 2010 use col, and 2010 onward uses col2.
  if (arg1 == 2000) {
    temp <- temp[, col3]
  } else if (arg1 < 2010) {
    temp <- temp[, col]
  } else {
    temp <- temp[, col2]
  }

  # Make every column except the institution name (column 1) numeric.
  # Entries that cannot be parsed as numbers become NA (as.numeric warns).
  temp[-1] <- lapply(temp[-1], as.numeric)

  if (arg1 < 2010) {
    # Map both older naming schemes onto the common names. Renaming is done
    # by name, so it does not depend on the position of any column.
    rename_map <- c(
      UG_NRA = "UGDS_NRA",
      UG_UNKN = "UGDS_UNKN",
      UG_WHITENH = "UGDS_WHITE", UGDS_WHITENH = "UGDS_WHITE",
      UG_BLACKNH = "UGDS_BLACK", UGDS_BLACKNH = "UGDS_BLACK",
      UG_AIANOLD = "UGDS_AIAN", UGDS_AIANOLD = "UGDS_AIAN",
      UG_HISPOLD = "UGDS_HISP", UGDS_HISPOLD = "UGDS_HISP",
      UG_API = "UGDS_ASIAN2", UGDS_API = "UGDS_ASIAN2"
    )
    hit <- names(temp) %in% names(rename_map)
    names(temp)[hit] <- unname(rename_map[names(temp)[hit]])
  } else {
    # From 2010 on the files carry separate UGDS_NHPI and UGDS_ASIAN columns;
    # add them together so the result matches the combined column used for
    # the earlier years. The sum is NA if either part is NA.
    temp$UGDS_ASIAN2 <- temp$UGDS_NHPI + temp$UGDS_ASIAN
  }

  # Same columns in the same order for every year (this also drops the
  # separate UGDS_NHPI / UGDS_ASIAN columns).
  temp <- temp[, c(
    "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
    "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN2"
  )]

  # Tag each row with the year it came from.
  temp$year <- arg1
  temp
}
## Read every year from 1996 through 2014 with cleandata() and stack the
## per-year data frames into one data frame. Building the list first and
## binding once avoids re-copying the accumulated data on every iteration.
alldata <- do.call(rbind, lapply(1996:2014, cleandata))

# The 2009 data seems to be disorganized, so it is excluded until that is
# looked into.
# NOTE(review): the comparison here previously read `year != 200`, which
# matches no year and therefore removed nothing; 2009 is taken from the
# accompanying comment. Confirm that dropping 2009 is intended.
alldata <- alldata[alldata$year != 2009, ]

# Starting point for the national averages.
national <- alldata
## Create three new data frames that contain the demographic averages for
## Ivy League, Dartmouth College, and National Data

# yearly_means averages every share column within each year.
#
# data:  a data frame with an INSTNM column, a year column, and numeric
#        columns to average.
# label: the value stored in the Type column of the result.
#
# Rows containing any NA are dropped before averaging. Returns one row per
# year with the column means and a Type column set to `label`.
yearly_means <- function(data, label) {
  out <- data %>%
    na.omit() %>%
    select(-INSTNM) %>%
    group_by(year) %>%
    # summarise_all() replaces the deprecated summarise_each(funs(mean)).
    summarise_all(mean)
  out$Type <- label
  out
}

# Institution names are listed in both upper case and title case because
# both spellings are matched against INSTNM.
# NOTE(review): this list has no entry for the University of Pennsylvania --
# confirm whether that omission is intentional.
ivy <- c(
  "BROWN UNIVERSITY", "COLUMBIA UNIVERSITY IN THE CITY OF NEW YORK",
  "CORNELL UNIVERSITY-ENDOWED COLLEGES", "HARVARD UNIVERSITY",
  "PRINCETON UNIVERSITY", "YALE UNIVERSITY", "Brown University",
  "Columbia University", "Cornell University", "Harvard University",
  "Princeton University", "Yale University"
)
ivyLeague <- yearly_means(filter(alldata, INSTNM %in% ivy), "Ivy League")

dart <- c("DARTMOUTH COLLEGE", "Dartmouth College")
dartmouth <- yearly_means(filter(alldata, INSTNM %in% dart), "Dartmouth")

# There is something up with the national data from 2008, so that year is
# left out of the national averages until it has been checked.
national <- yearly_means(filter(national, year != 2008), "National")
## Stack the national, Dartmouth and Ivy League averages, give the race
## columns readable names, and reshape to long form: one row per
## year / Type / Race with the value in Percentage.
df <- rbind(national, dartmouth, ivyLeague) %>%
  rename(
    International = UGDS_NRA,
    Unknown = UGDS_UNKN,
    Asian = UGDS_ASIAN2,
    White = UGDS_WHITE,
    Black = UGDS_BLACK,
    "American Indian or Alaskan Native" = UGDS_AIAN,
    Hispanic = UGDS_HISP
  ) %>%
  # Columns 2 through 8 are the seven race columns renamed above.
  gather("Race", "Percentage", 2:8, convert = TRUE)

# Disabled alternatives kept for reference:
#df <- df[,-3]
#df$Race <- factor(df$Race, levels=c("White", "Unknown", "Black", "Asian", "Hispanic", "International", "American Indian or Alaskan Native"))

# Export for infogram
write.csv(df, "RaceHigherEd.csv")
## Build the line chart (one panel per race category, one coloured line per
## Type) and save it as a PNG.
plot <- ggplot(df, aes(x = year, y = Percentage, colour = Type)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  # Each panel gets its own y scale.
  facet_wrap(~Race, ncol = 3, scales = "free_y") +
  # Tick every second year from 1996 to 2014.
  scale_x_continuous(breaks = seq(1996, 2014, by = 2)) +
  scale_color_manual(values = c("chartreuse4", "coral3", "cornflowerblue")) +
  labs(
    x = "Years",
    y = "Percentage of Student Population",
    title = "Race in Higher Education: Contextualizing Dartmouth's Demographics (1996-2014)"
  ) +
  theme(
    plot.background = element_rect(fill = 'midnightblue'),
    panel.grid.major = element_line(colour = "black", size = .3),
    panel.grid.minor = element_line(colour = "black", size = .3),
    panel.border = element_rect(fill = NA, colour = "black", size = 4),
    axis.title.x = element_text(face = "bold", color = "gray87", size = 18),
    axis.title.y = element_text(face = "bold", color = "gray87", size = 18),
    plot.title = element_text(face = "bold", color = "gray90", size = 21),
    axis.text = element_text(colour = "gray87", size = 14),
    strip.text = element_text(size = 20),
    legend.title = element_blank(),
    aspect.ratio = 1
  )

ggsave(filename = "HigherEdRace.png", plot = plot)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment