Last active
January 26, 2019 15:57
-
-
Save neila/039907a8820ac7e908d7959b47012ccc to your computer and use it in GitHub Desktop.
CS112 Assignment 1 Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######### RUN THE CODE BELOW IN R. R-STUDIO IS THE RECOMMENDED IDE. BOTH R AND R-STUDIO ARE FREE. | |
######### QUESTIONS SHOULD BE POSTED TO PIAZZA | |
######### THE ACTUAL ASSIGNMENT BEGINS ON LINE 71 | |
#########Provided Code (some parts commented out because I didn't want it)######### | |
### Multilateral Development Institution Data | |
foo <- read.csv("https://tinyurl.com/yb4phxx8") # read in the data | |
# column names | |
names(foo) | |
# dimensions of the data set | |
dim(foo) | |
# quick look at the data structure | |
head(foo) | |
# one thing to be very careful with (in this data set) is the use of dates. 8 columns involve dates. | |
# take note of the columns representing calendar dates | |
#date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25) | |
# these columns need some tweaking--I want to address missing values, calling the blank (empty) | |
# elements "NA" instead of leaving them blank, and I wish to tell R these are "Date" objects. | |
#for(i in date.columns) # this "for loop" only loops through the "date.columns" -- no other columns. | |
#{ | |
# identify which values are missing in the "i"th column of the foo data set | |
# which_values_are_missing <- which(as.character(foo[, i]) == "") | |
# those values that are missing (blank) in the "i"th column are replaced by <NA> | |
# because R knows how to handle "NA" -- NA means something special in R--blanks are handled | |
# more unpredictably (which is bad). | |
# foo[which_values_are_missing, i] <- NA | |
# last step--replace each of these columns (which is structured as a column of "factor" values) | |
# as a column of dates--i.e., convert them to an object of "class" = Date. They are dates, after all. | |
# And if you convert them to the Date class, R will know they are dates and you can manipulate | |
# dates in a simple, straightforward way. Otherwise, you won't be able to easily manipulate them | |
# arithmetically. E.g., for simple Date operations, see lines 48-58 below... | |
# **By the way, if you don't understand what a "factor" is in R, you should Google it.** | |
# foo[, i] <- as.Date(as.character(foo[, i])) | |
#} | |
# Now R knows that these columns are comprised of dates | |
# Also, one additional helpful hint... How to eliminate rows with NAs... | |
# The "is.na" function--for more info, Google it or type ?is.na at the R command prompt in the console. | |
#which.have.NAs <- which(is.na(foo$Rating == TRUE)) # for which rows is the claim "is.na" a TRUE claim? | |
# Then, if you wanted to, e.g., remove all those rows, retaining only the rows with ratings... | |
#new_foo <- foo[-which.have.NAs, ] | |
# Notice I called this tweaked data set "new_foo" instead of rewriting over the original data set... | |
# It's a bit safer to do this, in case I decide I want to quickly revert back to the original data set. | |
########################################################################### | |
### ASSIGNMENT 1 -- You may want to read ALL the questions before you begin. | |
### NOTE: FOR ALL QUESTIONS BELOW, ONLY CONSIDER PROJECTS WITH | |
### non-missing "Circulation.Date" >= 2008-01-01. | |
### EXCLUDE ALL OTHER PROJECTS FROM YOUR ANALYSIS. | |
### YOU MUST provide a link to your R code. ------ DON'T FORGET TO DO THIS!!!!!!!!!!!! | |
### Take note of the column names: i.e., you can type: names(foo) | |
### fyi: the column called "Rating" is the success rating at completion. 0 = lowest, 3 = highest. | |
#####Sho's data cleaning##### | |
#clean all columns and replace empty values with NA | |
for(i in names(foo)) # loop through all columns. | |
{ | |
# identify which values are missing in the "i"th column of the foo data set | |
which_values_are_missing <- which(as.character(foo[, i]) == "") | |
# those values that are missing (blank) in the "i"th column are replaced by <NA> | |
# because R knows how to handle "NA" -- NA means something special in R--blanks are handled | |
#more unpredictably (which is bad). | |
foo[which_values_are_missing, i] <- NA | |
} | |
#clean date columns | |
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25) | |
for(i in date.columns) # this "for loop" only loops through the "date.columns" -- no other columns. | |
{ | |
# replace each of these columns (which is structured as a column of "factor" values) | |
# as a column of dates--i.e., convert them to an object of "class" = Date. They are dates, after all. | |
# And if you convert them to the Date class, R will know they are dates and you can manipulate | |
# dates in a simple, straightforward way. Otherwise, you won't be able to easily manipulate them | |
# arithmetically. E.g., for simple Date operations, see lines 48-58 below... | |
# **By the way, if you don't understand what a "factor" is in R, you should Google it.** | |
foo[, i] <- as.Date(as.character(foo[, i])) | |
} | |
#only consider projects 2018-01-01 and after | |
badfoo <- which(is.na(foo$CirculationDate == TRUE)) | |
new_foo <- foo[-badfoo, ] | |
new_foo <- new_foo[new_foo$CirculationDate >= "2008-01-01", ] | |
########################### | |
# (1) When projects are approved, they are approved for a certain period of time (until the time of | |
# "original completion date"). While projects are active, this "original" completion date is | |
# often pushed out (extended), and then there is a "revised" completion date. | |
# You have been told that project duration at approval is generally about | |
# 2 years (24 months). In other words, (purportedly) when projects are approved, the difference | |
# between the original project completion date and the the approval date is (supposedly) | |
# approximately 24 months. | |
# (a) Is this claim true? Explain. (Remember, for this ENTIRE assignment, only consider | |
# projects with Circulation.Date >= 2008-01-01. This will be your only reminder...) | |
original_na <- which(is.na(new_foo$OriginalCompletionDate == TRUE)) | |
#identify which values are NA in the OriginalCompletionDate column | |
new_foo_1a <- new_foo[-original_na, ] | |
#subset new_foo_1a as a dataframe that excludes rows which have been identified above | |
original_duration <- c(new_foo_1a$OriginalCompletionDate - new_foo_1a$ApprovalDate) | |
mean(original_duration) | |
quantile(original_duration) | |
hist(as.numeric(original_duration),main="Histogram of project duration at approval", | |
xlab="Project duration at approval (in days)") | |
# Has project duration at approval changed over time (consider projects circulated earlier | |
# and circulated later). Be sure to discuss mean durations, median durations, and the | |
# interquartile range of durations (using the "quantile" function). | |
# Approximate suggested length: 3-5 sentences | |
for(i in "CirculationDate") # this "for loop" only loops through the CirculationDate -- no other columns. | |
{ | |
#convert date to numeric | |
new_foo_1a[, i] <- as.numeric(new_foo_1a[, i]) | |
} | |
quantile(c(new_foo_1a$CirculationDate)) #knowing where the earliest and latest 25% o projects are | |
late <- which(new_foo_1a$CirculationDate >= 16890.75) #specify rows that are late | |
early <- which(new_foo_1a$CirculationDate <= 14790.00) #specify rows that are early | |
new_foo_1a_late <- new_foo_1a[late, ] #new dataframe only with late rows | |
new_foo_1a_early <- new_foo_1a[early, ] #new dataframe only with early rows | |
original_duration_late <- c(new_foo_1a_late$OriginalCompletionDate - new_foo_1a_late$ApprovalDate) | |
#new duration only with late projects | |
original_duration_early <- c(new_foo_1a_early$OriginalCompletionDate - new_foo_1a_early$ApprovalDate) | |
#new duration only with early projects | |
mean(original_duration_early) | |
quantile(original_duration_early) | |
hist(as.numeric(original_duration_early),main="Histogram of early circulated projects' duration at approval", | |
xlab="Project duration at approval (in days)") | |
mean(original_duration_late) | |
quantile(original_duration_late) | |
hist(as.numeric(original_duration_late),main="Histogram of late circulated projects' duration at approval", | |
xlab="Project duration at approval (in days)") | |
# (b) How does original planned project duration differ from actual duration (if actual duration is | |
# measured as the duration between "ApprovalDate" and "RevisedCompletionDate"?) Once again, use | |
# means, medians, and interquartile ranges to explain your results. | |
# Approximate suggested length: 3-5 sentences | |
new_foo_1b <- new_foo_1a | |
#subset new_foo_1b as the same dataframe as new_foo_1a since all rows in RevisedCompletionDate are non NA | |
revised_duration <- c(new_foo_1b$RevisedCompletionDate - new_foo_1b$ApprovalDate) | |
mean(revised_duration) | |
quantile(revised_duration) | |
hist(as.numeric(revised_duration),main="Histogram of actual project duration", | |
xlab="Actual project duration (in days)") | |
# (2) What % of projects that have ratings were rated 0? | |
# What % were rated 1? What % were rated 2? What % were rated 3? Answer these questions using a table | |
# or a figure. Provide a title AND an explanatory sentence or two that provides the numerical % results | |
# rounded to the nearest percentage-point. | |
rating_na <- which(is.na(new_foo$Rating == TRUE)) | |
#identify which values are NA in the Rating column | |
new_foo_2 <- new_foo[-rating_na, ] | |
#remove the rows identified above | |
frequency <- table(new_foo_2$Rating) | |
#make a table of rating | |
newrow <- c(length(which(new_foo_2$Rating == 0)), | |
length(which(new_foo_2$Rating == 1)), | |
length(which(new_foo_2$Rating == 2)), | |
length(which(new_foo_2$Rating == 3)))/length(new_foo_2$Rating)*100 #make a new row with proportions | |
frequency <- rbind(frequency, percentage=lapply(newrow, round, 0)) | |
#add new row to the table | |
frequency | |
ratingplot = hist(new_foo_2$Rating, breaks=-0.5:3.5, plot=FALSE) | |
#make the histogram | |
ratingplot$density = ratingplot$counts/sum(ratingplot$counts)*100 | |
#the y axis is in percentage instead of raw frequency | |
plot(ratingplot,freq=FALSE, main = "histogram of ratings (proportion)", | |
ylab = "density (in percent)", xlab="Ratings") | |
# (3) Repeat problem 2, but this time exclude all PPTA projects. PPTA projects are more prone to | |
# negative ratings, because after a certain point in time only the low-rated PPTA projects required | |
# ratings. PPTA stands for "Project Preparatory Technical Assistance" and it is basically a project | |
# intended to set up a loan (often a very large multi-million-dollar loan). Only PPTAs that fail to | |
# "eventuate" to a loan are rated, which is why they are usually rated negatively. | |
ppta <- which(new_foo_2$Type == "PPTA") | |
#identify which values are "PPTA" in the Type column | |
new_foo_3 <- new_foo_2[-ppta, ] | |
#remove the rows identified above | |
re_frequency <- table(new_foo_3$Rating) | |
#make a table of rating | |
re_newrow <- c(length(which(new_foo_3$Rating == 0)), | |
length(which(new_foo_3$Rating == 1)), | |
length(which(new_foo_3$Rating == 2)), | |
length(which(new_foo_3$Rating == 3)))/length(new_foo_3$Rating)*100 | |
#make a new row with the percentages | |
re_frequency <- rbind(re_frequency, percentage=lapply(re_newrow, round, 0)) | |
#add new row to the table | |
re_frequency | |
re_ratingplot = hist(new_foo_3$Rating, breaks=-0.5:3.5, plot=FALSE) | |
#make the histogram | |
re_ratingplot$density = re_ratingplot$counts/sum(re_ratingplot$counts)*100 | |
#the y axis is in percentage instead of raw frequency | |
plot(re_ratingplot,freq=FALSE, main = "histogram of ratings excluding PPTA projects (proportion)", | |
ylab = "density (in percent)", xlab="Ratings") | |
# (4) Identify the top 25% of projects by "Revised.Amount" and the bottom 25% of projects by | |
# "RevisedAmount". ("RevisedAmount" shows the final project budget.) | |
# Compare the ratings of these projects. Can you draw a causal conclusion about the effect of | |
# budget size on ratings? Why or why not? | |
# Hint: Compare the characteristics of the two project groupings, | |
# e.g., "Dept", "Division", "Cluster", "Country" | |
# Approximate suggested length: 3-5 sentences. | |
quantile(c(new_foo$RevisedAmount)) | |
top25 <- which(new_foo$RevisedAmount >= 1.000) | |
bottom25 <- which(new_foo$RevisedAmount <= 0.400) | |
budgettop <- new_foo[top25, ] | |
budgetbottom <- new_foo[bottom25, ] | |
budgetbottom <- budgetbottom[-which(is.na(budgetbottom$Rating == TRUE)), ] | |
topfrequency <- table(budgettop$Rating) | |
bottomfrequency <- table(budgetbottom$Rating) | |
topbudgetplot = hist(budgettop$Rating, breaks=-0.5:3.5, plot=FALSE) | |
#make the histogram | |
topbudgetplot$density = topbudgetplot$counts/sum(topbudgetplot$counts)*100 | |
#the y axis is in percentage instead of raw frequency | |
plot(topbudgetplot,freq=FALSE, main = "histogram of ratings with top 25% budget amount (proportion)", | |
ylab = "density (in percent)", xlab="Ratings") | |
bottombudgetplot = hist(budgetbottom$Rating, | |
breaks=(min(budgetbottom$Rating)-1):(max(budgetbottom$Rating)+0.5), plot=FALSE) | |
#make the histogram | |
bottombudgetplot$density = bottombudgetplot$counts/sum(bottombudgetplot$counts)*100 | |
#the y axis is in percentage instead of raw frequency | |
plot(bottombudgetplot,freq=FALSE, main = "histogram of ratings with bottom 25% budget amount (proportion)", | |
ylab = "density (in percent)", xlab="Ratings") | |
# (5) Imagine your manager asks you to apply Jeremy Howard's drivetrain model to the | |
# problem of optimal budget-setting to maximize project success (i.e., "Rating"). | |
# In such a situation, what would be the: | |
# (a) decision problem or objective? | |
# (b) lever or levers? | |
# (c) ideal RCT design? | |
# (d) dependent variable(s) and independent variable(s) in the modeler | |
# (e) And---Why would running RCTs and modeling/optimizing over RCT results be preferable | |
# to using (observational, non-RCT) "foo" data? | |
# Approximate suggested length: 1-3 sentences for each sub-question. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment