# TerrenKlein/Race.r

## Race.r
# QSS 30--Final Project
# Race Chart

## Load necessary packages
# NOTE: the script no longer calls `rm(list = ls())`. That call only removes
# objects from the global environment (attached packages and options are left
# untouched), so it never gave a truly clean session, and it wiped the
# caller's workspace whenever the script was sourced. Restart R for a clean run.
library(dplyr)
library(tidyr)
library(ggplot2)

# Every file below is read from / written to this directory via relative paths.
# NOTE(review): the path is machine-specific; anyone else running the script
# has to adjust it.
setwd("~/Dropbox/DartmouthRace_Data")

# Raw column names to pull from each yearly file. cleandata() picks one set
# depending on the year:
#   col  - years before 2010, except 2000
#   col2 - 2010 onward (has separate UGDS_NHPI and UGDS_ASIAN columns)
#   col3 - the 2000 file, whose columns use a "UG_" prefix instead of "UGDS_"
col <- c(
  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITENH", "UGDS_BLACKNH",
  "UGDS_API", "UGDS_AIANOLD", "UGDS_HISPOLD"
)
col2 <- c(
  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
  "UGDS_NHPI", "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN"
)
col3 <- c(
  "INSTNM", "UG_NRA", "UG_UNKN", "UG_WHITENH", "UG_BLACKNH",
  "UG_API", "UG_AIANOLD", "UG_HISPOLD"
)

# cleandata() reads one year's "Merged<year>_PP.csv" from the working directory
# and returns it in a uniform layout: INSTNM, UGDS_NRA, UGDS_UNKN, UGDS_WHITE,
# UGDS_BLACK, UGDS_AIAN, UGDS_HISP, UGDS_ASIAN2, followed by a `year` column
# set to `arg1`. It relies on the global column vectors col, col2 and col3.
cleandata <- function(arg1) {
  # Column layout shared by every year (before `year` is appended).
  unified <- c(
    "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
    "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN2"
  )

  yearly <- read.csv(paste0("Merged", arg1, "_PP.csv"), stringsAsFactors = FALSE)

  # Years before 2010 share the old layout; the 2000 file additionally uses
  # its own column names (col3).
  legacy <- arg1 < 2010
  wanted <- if (arg1 == 2000) {
    col3
  } else if (legacy) {
    col
  } else {
    col2
  }
  yearly <- yearly[, wanted]

  # Coerce everything except the institution name to numeric.
  yearly[-1] <- lapply(yearly[-1], as.numeric)

  if (legacy) {
    # Move the Asian/Pacific Islander column (6th) to the end, then apply the
    # unified names.
    yearly <- yearly[c(1, 2, 3, 4, 5, 7, 8, 6)]
    names(yearly) <- unified
  } else {
    # From 2010 on, NHPI and Asian are separate columns; add them together so
    # the category matches the earlier years.
    yearly$UGDS_ASIAN2 <- yearly$UGDS_NHPI + yearly$UGDS_ASIAN
    yearly <- yearly[unified]
  }

  # Tag each row with the year it came from and hand the frame back.
  yearly$year <- arg1
  yearly
}

## Read every year from 1996 to 2014 and stack the results into one data frame.
# Each year is cleaned by cleandata() and the pieces are bound once, rather
# than growing the data frame with rbind() inside a loop (which re-copies the
# accumulated rows on every iteration). No empty template frame is needed, so
# the 2014 file is read only once.
years <- 1996:2014
alldata <- do.call(rbind, lapply(years, cleandata))

# The 2009 file seems to be disorganized, so that year is excluded until it has
# been looked into. (The filter used to compare against 200, which matches no
# year and therefore removed nothing.)
alldata <- alldata[alldata$year != 2009, ]
# Starting point for the national averages computed further down.
national <- alldata


## Create three data frames that contain the yearly demographic averages for
## the Ivy League, Dartmouth College, and all institutions nationally.

# yearly_means() drops rows containing any NA, removes the first column
# (INSTNM), averages every remaining column within each year, and labels the
# result with `label` in a Type column.
# summarise_all(mean) replaces the deprecated summarise_each(funs(mean)).
yearly_means <- function(data, label) {
  out <- na.omit(data)[, -1] %>%
    group_by(year) %>%
    summarise_all(mean)
  out$Type <- label
  out
}

# Both upper-case and title-case spellings are listed (presumably the yearly
# files differ in how institution names are capitalised).
# NOTE(review): the University of Pennsylvania is not in this list (Dartmouth
# is handled separately below) - confirm whether that omission is intended.
ivy <- c("BROWN UNIVERSITY", "COLUMBIA UNIVERSITY IN THE CITY OF NEW YORK", "CORNELL UNIVERSITY-ENDOWED COLLEGES", "HARVARD UNIVERSITY",
         "PRINCETON UNIVERSITY", "YALE UNIVERSITY","Brown University", "Columbia University",
         "Cornell University", "Harvard University", "Princeton University", "Yale University")
ivyLeague <- yearly_means(filter(alldata, INSTNM %in% ivy), "Ivy League")

dart <- c("DARTMOUTH COLLEGE", "Dartmouth College")
dartmouth <- yearly_means(filter(alldata, INSTNM %in% dart), "Dartmouth")

# 2008 is left out of the national series: something is up with that year's
# national data and it has not been checked yet.
national <- yearly_means(national[national$year != 2008, ], "National")

## Stack the three summaries, give the race columns readable names, and reshape
## to long format: one row per year / Type / Race with its value in Percentage.
# (A disabled step, `df <- df[,-3]`, used to sit between the bind and the rename.)
df <- rbind(national, dartmouth, ivyLeague) %>%
  rename(
    International = UGDS_NRA,
    Unknown = UGDS_UNKN,
    White = UGDS_WHITE,
    Black = UGDS_BLACK,
    "American Indian or Alaskan Native" = UGDS_AIAN,
    Hispanic = UGDS_HISP,
    Asian = UGDS_ASIAN2
  ) %>%
  # Columns 2 to 8 are the seven race columns; year and Type stay as identifiers.
  gather("Race", "Percentage", 2:8, convert = TRUE)

# Disabled: explicit ordering of the Race levels.
#df$Race <- factor(df$Race, levels=c("White", "Unknown", "Black", "Asian", "Hispanic", "International", "American Indian or Alaskan Native"))

# Export for infogram
write.csv(df, file = "RaceHigherEd.csv")

## Line chart: one facet per race category, one coloured line per Type across
## the years; the result is saved as a PNG.

# Dark look applied to the whole figure.
chart_theme <- theme(
  plot.background = element_rect(fill = 'midnightblue'),
  panel.grid.major = element_line(colour = "black", size = .3),
  panel.grid.minor = element_line(colour = "black", size = .3),
  panel.border = element_rect(fill = NA, colour = "black", size = 4),
  axis.title.x = element_text(face = "bold", color = "gray87", size = 18),
  axis.title.y = element_text(face = "bold", color = "gray87", size = 18),
  plot.title = element_text(face = "bold", color = "gray90", size = 21),
  axis.text = element_text(colour = "gray87", size = 14),
  strip.text = element_text(size = 20),
  legend.title = element_blank(),
  aspect.ratio = 1
)

plot <- ggplot(data = df, mapping = aes(x = year, y = Percentage, color = Type)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  chart_theme +
  scale_color_manual(values = c("chartreuse4", "coral3", "cornflowerblue")) +
  # Label every second year on the x axis.
  scale_x_continuous(breaks = seq(1996, 2014, by = 2)) +
  labs(
    x = "Years",
    y = "Percentage of Student Population",
    title = "Race in Higher Education: Contextualizing Dartmouth's Demographics (1996-2014)"
  ) +
  # Each race gets its own panel with an independent y scale.
  facet_wrap(~Race, ncol = 3, scales = "free_y")

ggsave("HigherEdRace.png", plot)
	# QSS 30--Final Project
	# Race Chart

	## Load necessary packages
	# NOTE(review): from this point the file repeats the script above
	# (tab-indented). Running it recomputes everything and overwrites the same
	# output files; consider deleting one of the two copies.
	# NOTE: the script no longer calls `rm(list = ls())`. That call only removes
	# objects from the global environment (attached packages and options are left
	# untouched), so it never gave a truly clean session, and it wiped the
	# caller's workspace whenever the script was sourced. Restart R for a clean run.
	library(dplyr)
	library(tidyr)
	library(ggplot2)

	# Every file below is read from / written to this directory via relative paths.
	# NOTE(review): the path is machine-specific; anyone else running the script
	# has to adjust it.
	setwd("~/Dropbox/DartmouthRace_Data")

	# Raw column names to pull from each yearly file. cleandata() picks one set
	# depending on the year:
	#   col  - years before 2010, except 2000
	#   col2 - 2010 onward (has separate UGDS_NHPI and UGDS_ASIAN columns)
	#   col3 - the 2000 file, whose columns use a "UG_" prefix instead of "UGDS_"
	col <- c(
	  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITENH", "UGDS_BLACKNH",
	  "UGDS_API", "UGDS_AIANOLD", "UGDS_HISPOLD"
	)
	col2 <- c(
	  "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
	  "UGDS_NHPI", "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN"
	)
	col3 <- c(
	  "INSTNM", "UG_NRA", "UG_UNKN", "UG_WHITENH", "UG_BLACKNH",
	  "UG_API", "UG_AIANOLD", "UG_HISPOLD"
	)

	# cleandata() reads one year's "Merged<year>_PP.csv" from the working directory
	# and returns it in a uniform layout: INSTNM, UGDS_NRA, UGDS_UNKN, UGDS_WHITE,
	# UGDS_BLACK, UGDS_AIAN, UGDS_HISP, UGDS_ASIAN2, followed by a `year` column
	# set to `arg1`. It relies on the global column vectors col, col2 and col3.
	cleandata <- function(arg1) {
	  # Column layout shared by every year (before `year` is appended).
	  unified <- c(
	    "INSTNM", "UGDS_NRA", "UGDS_UNKN", "UGDS_WHITE", "UGDS_BLACK",
	    "UGDS_AIAN", "UGDS_HISP", "UGDS_ASIAN2"
	  )

	  yearly <- read.csv(paste0("Merged", arg1, "_PP.csv"), stringsAsFactors = FALSE)

	  # Years before 2010 share the old layout; the 2000 file additionally uses
	  # its own column names (col3).
	  legacy <- arg1 < 2010
	  wanted <- if (arg1 == 2000) {
	    col3
	  } else if (legacy) {
	    col
	  } else {
	    col2
	  }
	  yearly <- yearly[, wanted]

	  # Coerce everything except the institution name to numeric.
	  yearly[-1] <- lapply(yearly[-1], as.numeric)

	  if (legacy) {
	    # Move the Asian/Pacific Islander column (6th) to the end, then apply the
	    # unified names.
	    yearly <- yearly[c(1, 2, 3, 4, 5, 7, 8, 6)]
	    names(yearly) <- unified
	  } else {
	    # From 2010 on, NHPI and Asian are separate columns; add them together so
	    # the category matches the earlier years.
	    yearly$UGDS_ASIAN2 <- yearly$UGDS_NHPI + yearly$UGDS_ASIAN
	    yearly <- yearly[unified]
	  }

	  # Tag each row with the year it came from and hand the frame back.
	  yearly$year <- arg1
	  yearly
	}

	## Read every year from 1996 to 2014 and stack the results into one data frame.
	# Each year is cleaned by cleandata() and the pieces are bound once, rather
	# than growing the data frame with rbind() inside a loop (which re-copies the
	# accumulated rows on every iteration). No empty template frame is needed, so
	# the 2014 file is read only once.
	years <- 1996:2014
	alldata <- do.call(rbind, lapply(years, cleandata))

	# The 2009 file seems to be disorganized, so that year is excluded until it has
	# been looked into. (The filter used to compare against 200, which matches no
	# year and therefore removed nothing.)
	alldata <- alldata[alldata$year != 2009, ]
	# Starting point for the national averages computed further down.
	national <- alldata


	## Create three data frames that contain the yearly demographic averages for
	## the Ivy League, Dartmouth College, and all institutions nationally.

	# yearly_means() drops rows containing any NA, removes the first column
	# (INSTNM), averages every remaining column within each year, and labels the
	# result with `label` in a Type column.
	# summarise_all(mean) replaces the deprecated summarise_each(funs(mean)).
	yearly_means <- function(data, label) {
	  out <- na.omit(data)[, -1] %>%
	    group_by(year) %>%
	    summarise_all(mean)
	  out$Type <- label
	  out
	}

	# Both upper-case and title-case spellings are listed (presumably the yearly
	# files differ in how institution names are capitalised).
	# NOTE(review): the University of Pennsylvania is not in this list (Dartmouth
	# is handled separately below) - confirm whether that omission is intended.
	ivy <- c("BROWN UNIVERSITY", "COLUMBIA UNIVERSITY IN THE CITY OF NEW YORK", "CORNELL UNIVERSITY-ENDOWED COLLEGES", "HARVARD UNIVERSITY",
	"PRINCETON UNIVERSITY", "YALE UNIVERSITY","Brown University", "Columbia University",
	"Cornell University", "Harvard University", "Princeton University", "Yale University")
	ivyLeague <- yearly_means(filter(alldata, INSTNM %in% ivy), "Ivy League")

	dart <- c("DARTMOUTH COLLEGE", "Dartmouth College")
	dartmouth <- yearly_means(filter(alldata, INSTNM %in% dart), "Dartmouth")

	# 2008 is left out of the national series: something is up with that year's
	# national data and it has not been checked yet.
	national <- yearly_means(national[national$year != 2008, ], "National")

	## Stack the three summaries, give the race columns readable names, and reshape
	## to long format: one row per year / Type / Race with its value in Percentage.
	# (A disabled step, `df <- df[,-3]`, used to sit between the bind and the rename.)
	df <- rbind(national, dartmouth, ivyLeague) %>%
	  rename(
	    International = UGDS_NRA,
	    Unknown = UGDS_UNKN,
	    White = UGDS_WHITE,
	    Black = UGDS_BLACK,
	    "American Indian or Alaskan Native" = UGDS_AIAN,
	    Hispanic = UGDS_HISP,
	    Asian = UGDS_ASIAN2
	  ) %>%
	  # Columns 2 to 8 are the seven race columns; year and Type stay as identifiers.
	  gather("Race", "Percentage", 2:8, convert = TRUE)

	# Disabled: explicit ordering of the Race levels.
	#df$Race <- factor(df$Race, levels=c("White", "Unknown", "Black", "Asian", "Hispanic", "International", "American Indian or Alaskan Native"))

	# Export for infogram
	write.csv(df, file = "RaceHigherEd.csv")

	## Line chart: one facet per race category, one coloured line per Type across
	## the years; the result is saved as a PNG.

	# Dark look applied to the whole figure.
	chart_theme <- theme(
	  plot.background = element_rect(fill = 'midnightblue'),
	  panel.grid.major = element_line(colour = "black", size = .3),
	  panel.grid.minor = element_line(colour = "black", size = .3),
	  panel.border = element_rect(fill = NA, colour = "black", size = 4),
	  axis.title.x = element_text(face = "bold", color = "gray87", size = 18),
	  axis.title.y = element_text(face = "bold", color = "gray87", size = 18),
	  plot.title = element_text(face = "bold", color = "gray90", size = 21),
	  axis.text = element_text(colour = "gray87", size = 14),
	  strip.text = element_text(size = 20),
	  legend.title = element_blank(),
	  aspect.ratio = 1
	)

	plot <- ggplot(data = df, mapping = aes(x = year, y = Percentage, color = Type)) +
	  geom_line(size = 2) +
	  geom_point(size = 4) +
	  chart_theme +
	  scale_color_manual(values = c("chartreuse4", "coral3", "cornflowerblue")) +
	  # Label every second year on the x axis.
	  scale_x_continuous(breaks = seq(1996, 2014, by = 2)) +
	  labs(
	    x = "Years",
	    y = "Percentage of Student Population",
	    title = "Race in Higher Education: Contextualizing Dartmouth's Demographics (1996-2014)"
	  ) +
	  # Each race gets its own panel with an independent y scale.
	  facet_wrap(~Race, ncol = 3, scales = "free_y")

	ggsave("HigherEdRace.png", plot)