Last active
January 24, 2020 04:21
-
-
Save anglilian/3a7764c52c5c83474d425ab68630a8f9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##SETTING UP DATA | |
foo <- read.csv("https://tinyurl.com/yb4phxx8") | |
names(foo) | |
dim(foo) | |
head(foo) | |
date.columns <- c(11,12,14,15,16,17,18,25) | |
for (i in date.columns) { | |
which_values_are_missing <- which(as.character(foo[,i]) == "") | |
foo[which_values_are_missing, i] <- NA | |
foo[,i] <- as.Date(as.character(foo[,i])) | |
} | |
#Removing NA values from data | |
which.have.NAs <- which(is.na(foo$CirculationDate)) | |
new_foo <- foo[-which.have.NAs,] | |
#Removing rows that have a Circulation Date lower than the given value | |
withindate <- new_foo[which(new_foo$CirculationDate >= as.Date("2009-01-01")),] | |
##QUESTION 1 (a) | |
#removing rows from Original Completion Date that have NA Values | |
completion.have.NAs <- which(is.na(withindate$OriginalCompletionDate)) | |
completion_foo <- withindate[-completion.have.NAs,] | |
#subtracting Approval Date from Original Completion Date and converting the time difference to months | |
time <- (completion_foo$OriginalCompletionDate - completion_foo$ApprovalDate) | |
#average project duration | |
mean(time) | |
##QUESTION 1(b) | |
#changing circulation dates to only have year | |
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y") | |
#creating a list of unique years from the available Circulation Date data | |
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y") | |
years <- unique(completion_foo$CirculationDate) | |
#calculating the delay and converting the unit to months | |
difference <- (completion_foo$RevisedCompletionDate- completion_foo$OriginalCompletionDate) | |
#converting the elements in the list to be numeric | |
difference <- as.numeric(difference) | |
completion_foo$difference <- difference | |
#Calculating the mean, median and IQR of the data in each year group | |
mean <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), mean) | |
median <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), median) | |
IQR <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), IQR) | |
#plotting the data on a line graph | |
plot(mean , col= "red", type= "b", pch = 15,ylim = c(300, 700), | |
xlab= "Circulation Year", ylab = "Delay (months)", | |
) | |
lines (median, col="green", type = "o", pch=17) | |
lines (IQR, col="blue", type = "o", pch = 16) | |
legend ("bottomleft", pch=c(15,16,17), | |
col= c('red', 'blue', 'green'), | |
legend = c("Mean Delay", "Median Delay", "IQR of Delays")) | |
grid (nx=NA, ny= NULL) | |
##Question 1(c) | |
#calculating the duration between Approval Date and Revised Completion Date | |
duration <- (completion_foo$RevisedCompletionDate- completion_foo$ApprovalDate) | |
#converting the elements in the list to be numeric | |
duration <- as.numeric(duration) | |
#Adding the duration as a column in the data set | |
completion_foo$duration <- duration | |
new_years <- unique(completion_foo$RevisedCompletionDate) | |
#Calculating the mean, median and IQR according to year | |
new_mean <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), mean) | |
new_median <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), median) | |
new_IQR <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), IQR) | |
#plotting the data onto a line graph | |
plot(new_mean , col= "red", type= "b", pch = 15, ylim = c(500, 1400), | |
xlab= "Circulation Year", ylab = "Delay (months)") | |
lines (new_median, col="green", type = "o", pch=17) | |
lines (new_IQR, col="blue", type = "o", pch = 16) | |
legend ("bottomleft", pch=c(15,16,17), | |
col= c('red', 'blue', 'green'), cex = 0.5, | |
legend = c("Mean Delay", "Median Delay", "IQR of Delays")) | |
axis(side = 1, at = c(0,1)) | |
grid (nx= NA, ny = NULL) | |
##QUESTION 2 | |
#removing the entries from Rating which have no values | |
rating.have.NAs <- which(is.na(withindate$Rating)) | |
rating_foo <- withindate[-rating.have.NAs,] | |
#Excluding all ratings before 2010 | |
present_foo <- rating_foo[which(rating_foo$RevisedCompletionDate >= 2010),] | |
#Putting ratings and their frequency into a table | |
ratings <- table(present_foo$Rating) | |
cbind(ratings,prop.table(ratings)) #adding column of percentage of ratings | |
##QUESTION 3 | |
#Only taking PATA project data | |
PATA_foo <- present_foo[which(present_foo$Type=="PATA"),] | |
#Putting PATA Project ratings and their frequency into a table | |
PATA <- table(PATA_foo$Rating) | |
cbind(PATA, prop.table(PATA)) #adding column of percentage of ratings | |
##Question 4 | |
#sorts the data frame based on the revised amount in ascending order | |
rating_foo$RevisedAmount <- sort(rating_foo$RevisedAmount) | |
#The difference in percentage ratings of the top and bottom 10% | |
bottom <- rating_foo[which(rating_foo$RevisedAmount <= quantile(rating_foo$RevisedAmount, 0.1)),] | |
top<- rating_foo[which(rating_foo$RevisedAmount >= quantile(rating_foo$RevisedAmount, 0.9)),] | |
(table(top$Rating) /length(top$Rating)*100) - | |
(table(bottom$Rating) /length(bottom$Rating)*100) | |
#computing the distribution of divisions among the top and bottom 10% of projects | |
division_bottom <- rating_foo$Division[1: tenpercent] | |
division_top <- rating_foo$Division[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)] | |
divisiontop_t <- table(division_top) | |
divisionbottom_t <- table(division_bottom) | |
rbind(divisiontop_t, divisionbottom_t) | |
#computing the distribution of countries among the top and bottom 10% of projects | |
country_bottom <- rating_foo$Country[1: tenpercent] | |
country_top <- rating_foo$Country[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)] | |
countrytop_t <- table(country_top) | |
countrybottom_t <- table(country_bottom) | |
rbind(countrytop_t, countrybottom_t) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment