######### CS112 ASSIGNMENT 1 CODE
######### RUN THE CODE BELOW IN R. R-STUDIO IS THE RECOMMENDED IDE. BOTH R AND R-STUDIO ARE FREE.
######### QUESTIONS SHOULD BE POSTED TO PIAZZA
######### THE ACTUAL ASSIGNMENT BEGINS AFTER THE PROVIDED CODE BELOW
######### Provided Code (some parts commented out because I didn't need them) #########
### Multilateral Development Institution Data
foo <- read.csv("https://tinyurl.com/yb4phxx8") # read in the data
# column names
names(foo)
# dimensions of the data set
dim(foo)
# quick look at the data structure
head(foo)
# one thing to be very careful with (in this data set) is the use of dates. 8 columns involve dates.
# take note of the columns representing calendar dates
#date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
# these columns need some tweaking--I want to address missing values, calling the blank (empty)
# elements "NA" instead of leaving them blank, and I wish to tell R these are "Date" objects.
#for(i in date.columns) # this "for loop" only loops through the "date.columns" -- no other columns.
#{
# identify which values are missing in the "i"th column of the foo data set
# which_values_are_missing <- which(as.character(foo[, i]) == "")
# those values that are missing (blank) in the "i"th column are replaced by <NA>
# because R knows how to handle "NA" -- NA means something special in R--blanks are handled
# more unpredictably (which is bad).
# foo[which_values_are_missing, i] <- NA
# last step--replace each of these columns (which is structured as a column of "factor" values)
# as a column of dates--i.e., convert them to an object of "class" = Date. They are dates, after all.
# And if you convert them to the Date class, R will know they are dates and you can manipulate
# dates in a simple, straightforward way. Otherwise, you won't be able to easily manipulate them
# arithmetically. E.g., subtracting one Date from another gives a duration in days.
# **By the way, if you don't understand what a "factor" is in R, you should Google it.**
# foo[, i] <- as.Date(as.character(foo[, i]))
#}
# Now R knows that these columns are comprised of dates
# Also, one additional helpful hint... How to eliminate rows with NAs...
# The "is.na" function--for more info, Google it or type ?is.na at the R command prompt in the console.
#which.have.NAs <- which(is.na(foo$Rating)) # for which rows is the claim "is.na" a TRUE claim?
# Then, if you wanted to, e.g., remove all those rows, retaining only the rows with ratings...
#new_foo <- foo[-which.have.NAs, ]
# Notice I called this tweaked data set "new_foo" instead of rewriting over the original data set...
# It's a bit safer to do this, in case I decide I want to quickly revert back to the original data set.
###########################################################################
### ASSIGNMENT 1 -- You may want to read ALL the questions before you begin.
### NOTE: FOR ALL QUESTIONS BELOW, ONLY CONSIDER PROJECTS WITH
### non-missing "Circulation.Date" >= 2008-01-01.
### EXCLUDE ALL OTHER PROJECTS FROM YOUR ANALYSIS.
### YOU MUST provide a link to your R code. ------ DON'T FORGET TO DO THIS!!!!!!!!!!!!
### Take note of the column names: i.e., you can type: names(foo)
### fyi: the column called "Rating" is the success rating at completion. 0 = lowest, 3 = highest.
#####Sho's data cleaning#####
#clean all columns and replace empty values with NA
for(i in names(foo)) # loop through all columns
{
  # identify which values are missing in the "i"th column of the foo data set
  which_values_are_missing <- which(as.character(foo[, i]) == "")
  # those values that are missing (blank) in the "i"th column are replaced by <NA>,
  # because R knows how to handle "NA" -- NA means something special in R -- blanks are
  # handled more unpredictably (which is bad).
  foo[which_values_are_missing, i] <- NA
}
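# Quick sanity check (optional sketch): count the NAs each column now holds,
# to confirm the blanks really were recoded.
colSums(is.na(foo))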
#clean date columns
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
for(i in date.columns) # this "for loop" only loops through the "date.columns" -- no other columns
{
  # replace each of these columns (currently structured as a column of "factor" values)
  # with a column of dates -- i.e., convert it to an object of "class" = Date. They are
  # dates, after all. Once converted to the Date class, R knows they are dates, so you can
  # manipulate them arithmetically in a simple, straightforward way (e.g., subtract one
  # Date from another to get a duration in days).
  # **By the way, if you don't understand what a "factor" is in R, you should Google it.**
  foo[, i] <- as.Date(as.character(foo[, i]))
}
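# Optional check: every date column should now have class "Date". Note that
# as.Date() above assumes the raw strings are in the default YYYY-MM-DD format;
# if they were not, a format= argument would be needed.
sapply(foo[date.columns], class)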
#only consider projects circulated 2008-01-01 and after
badfoo <- which(is.na(foo$CirculationDate)) #rows with a missing circulation date
new_foo <- foo[-badfoo, ]
new_foo <- new_foo[new_foo$CirculationDate >= "2008-01-01", ]
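#optional check: the earliest remaining CirculationDate should be on or after
#2008-01-01, and dim() shows how many projects survive the filter
range(new_foo$CirculationDate)
dim(new_foo)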
###########################
# (1) When projects are approved, they are approved for a certain period of time (until the time of
# "original completion date"). While projects are active, this "original" completion date is
# often pushed out (extended), and then there is a "revised" completion date.
# You have been told that project duration at approval is generally about
# 2 years (24 months). In other words, (purportedly) when projects are approved, the difference
# between the original project completion date and the approval date is (supposedly)
# approximately 24 months.
# (a) Is this claim true? Explain. (Remember, for this ENTIRE assignment, only consider
# projects with Circulation.Date >= 2008-01-01. This will be your only reminder...)
original_na <- which(is.na(new_foo$OriginalCompletionDate))
#identify which values are NA in the OriginalCompletionDate column
new_foo_1a <- new_foo[-original_na, ]
#subset new_foo_1a as a dataframe that excludes rows which have been identified above
original_duration <- c(new_foo_1a$OriginalCompletionDate - new_foo_1a$ApprovalDate)
mean(original_duration)
quantile(original_duration)
hist(as.numeric(original_duration), main="Histogram of project duration at approval",
     xlab="Project duration at approval (in days)")
# Has project duration at approval changed over time (consider projects circulated earlier
# and circulated later). Be sure to discuss mean durations, median durations, and the
# interquartile range of durations (using the "quantile" function).
# Approximate suggested length: 3-5 sentences
for(i in "CirculationDate") # this "for loop" only loops through the CirculationDate -- no other columns.
{
#convert date to numeric
new_foo_1a[, i] <- as.numeric(new_foo_1a[, i])
}
circ_quantiles <- quantile(new_foo_1a$CirculationDate) #where the earliest and latest 25% of projects fall
circ_quantiles
late <- which(new_foo_1a$CirculationDate >= circ_quantiles[4]) #rows in the latest 25% (75th percentile and up)
early <- which(new_foo_1a$CirculationDate <= circ_quantiles[2]) #rows in the earliest 25% (25th percentile and down)
new_foo_1a_late <- new_foo_1a[late, ] #new dataframe only with late rows
new_foo_1a_early <- new_foo_1a[early, ] #new dataframe only with early rows
original_duration_late <- c(new_foo_1a_late$OriginalCompletionDate - new_foo_1a_late$ApprovalDate)
#new duration only with late projects
original_duration_early <- c(new_foo_1a_early$OriginalCompletionDate - new_foo_1a_early$ApprovalDate)
#new duration only with early projects
mean(original_duration_early)
quantile(original_duration_early)
hist(as.numeric(original_duration_early), main="Histogram of early circulated projects' duration at approval",
     xlab="Project duration at approval (in days)")
mean(original_duration_late)
quantile(original_duration_late)
hist(as.numeric(original_duration_late), main="Histogram of late circulated projects' duration at approval",
     xlab="Project duration at approval (in days)")
# (b) How does original planned project duration differ from actual duration (if actual duration is
# measured as the duration between "ApprovalDate" and "RevisedCompletionDate"?) Once again, use
# means, medians, and interquartile ranges to explain your results.
# Approximate suggested length: 3-5 sentences
new_foo_1b <- new_foo_1a
#new_foo_1b is the same dataframe as new_foo_1a, since every row of RevisedCompletionDate is non-NA
revised_duration <- c(new_foo_1b$RevisedCompletionDate - new_foo_1b$ApprovalDate)
mean(revised_duration)
quantile(revised_duration)
hist(as.numeric(revised_duration), main="Histogram of actual project duration",
     xlab="Actual project duration (in days)")
# (2) What % of projects that have ratings were rated 0?
# What % were rated 1? What % were rated 2? What % were rated 3? Answer these questions using a table
# or a figure. Provide a title AND an explanatory sentence or two that provides the numerical % results
# rounded to the nearest percentage-point.
rating_na <- which(is.na(new_foo$Rating))
#identify which values are NA in the Rating column
new_foo_2 <- new_foo[-rating_na, ]
#remove the rows identified above
frequency <- table(new_foo_2$Rating)
#make a table of rating
newrow <- c(length(which(new_foo_2$Rating == 0)),
            length(which(new_foo_2$Rating == 1)),
            length(which(new_foo_2$Rating == 2)),
            length(which(new_foo_2$Rating == 3)))/length(new_foo_2$Rating)*100 #make a new row with percentages
frequency <- rbind(frequency, percentage=round(newrow))
#add new row to the table
frequency
ratingplot <- hist(new_foo_2$Rating, breaks=-0.5:3.5, plot=FALSE)
#build the histogram object without plotting it yet
ratingplot$density <- ratingplot$counts/sum(ratingplot$counts)*100
#make the y axis show percentages instead of raw frequencies
plot(ratingplot, freq=FALSE, main="Histogram of ratings (percentage)",
     ylab="Percentage of projects", xlab="Ratings")
# (3) Repeat problem 2, but this time exclude all PPTA projects. PPTA projects are more prone to
# negative ratings, because after a certain point in time only the low-rated PPTA projects required
# ratings. PPTA stands for "Project Preparatory Technical Assistance" and it is basically a project
# intended to set up a loan (often a very large multi-million-dollar loan). Only PPTAs that fail to
# "eventuate" to a loan are rated, which is why they are usually rated negatively.
ppta <- which(new_foo_2$Type == "PPTA")
#identify which values are "PPTA" in the Type column
new_foo_3 <- new_foo_2[-ppta, ]
#remove the rows identified above
re_frequency <- table(new_foo_3$Rating)
#make a table of rating
re_newrow <- c(length(which(new_foo_3$Rating == 0)),
               length(which(new_foo_3$Rating == 1)),
               length(which(new_foo_3$Rating == 2)),
               length(which(new_foo_3$Rating == 3)))/length(new_foo_3$Rating)*100
#make a new row with the percentages
re_frequency <- rbind(re_frequency, percentage=round(re_newrow))
#add new row to the table
re_frequency
re_ratingplot <- hist(new_foo_3$Rating, breaks=-0.5:3.5, plot=FALSE)
#build the histogram object without plotting it yet
re_ratingplot$density <- re_ratingplot$counts/sum(re_ratingplot$counts)*100
#make the y axis show percentages instead of raw frequencies
plot(re_ratingplot, freq=FALSE, main="Histogram of ratings excluding PPTA projects (percentage)",
     ylab="Percentage of projects", xlab="Ratings")
# (4) Identify the top 25% of projects by "RevisedAmount" and the bottom 25% of projects by
# "RevisedAmount". ("RevisedAmount" shows the final project budget.)
# Compare the ratings of these projects. Can you draw a causal conclusion about the effect of
# budget size on ratings? Why or why not?
# Hint: Compare the characteristics of the two project groupings,
# e.g., "Dept", "Division", "Cluster", "Country"
# Approximate suggested length: 3-5 sentences.
budget_quantiles <- quantile(new_foo$RevisedAmount) #budget cutoffs for the bottom and top 25%
budget_quantiles
top25 <- which(new_foo$RevisedAmount >= budget_quantiles[4]) #75th percentile and up
bottom25 <- which(new_foo$RevisedAmount <= budget_quantiles[2]) #25th percentile and down
budgettop <- new_foo[top25, ]
budgetbottom <- new_foo[bottom25, ]
budgetbottom <- budgetbottom[-which(is.na(budgetbottom$Rating)), ] #drop bottom-25% rows with missing ratings
topfrequency <- table(budgettop$Rating)
bottomfrequency <- table(budgetbottom$Rating)
topbudgetplot <- hist(budgettop$Rating, breaks=-0.5:3.5, plot=FALSE)
#build the histogram object without plotting it yet
topbudgetplot$density <- topbudgetplot$counts/sum(topbudgetplot$counts)*100
#make the y axis show percentages instead of raw frequencies
plot(topbudgetplot, freq=FALSE, main="Histogram of ratings with top 25% budget amount (percentage)",
     ylab="Percentage of projects", xlab="Ratings")
bottombudgetplot <- hist(budgetbottom$Rating, breaks=-0.5:3.5, plot=FALSE)
#build the histogram object without plotting it yet (breaks now match the other rating histograms)
bottombudgetplot$density <- bottombudgetplot$counts/sum(bottombudgetplot$counts)*100
#make the y axis show percentages instead of raw frequencies
plot(bottombudgetplot, freq=FALSE, main="Histogram of ratings with bottom 25% budget amount (percentage)",
     ylab="Percentage of projects", xlab="Ratings")
# (5) Imagine your manager asks you to apply Jeremy Howard's drivetrain model to the
# problem of optimal budget-setting to maximize project success (i.e., "Rating").
# In such a situation, what would be the:
# (a) decision problem or objective?
# (b) lever or levers?
# (c) ideal RCT design?
# (d) dependent variable(s) and independent variable(s) in the model?
# (e) And---Why would running RCTs and modeling/optimizing over RCT results be preferable
# to using (observational, non-RCT) "foo" data?
# Approximate suggested length: 1-3 sentences for each sub-question.