Skip to content

Instantly share code, notes, and snippets.

@anglilian
Last active January 24, 2020 04:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anglilian/3a7764c52c5c83474d425ab68630a8f9 to your computer and use it in GitHub Desktop.
Save anglilian/3a7764c52c5c83474d425ab68630a8f9 to your computer and use it in GitHub Desktop.
##SETTING UP DATA
foo <- read.csv("https://tinyurl.com/yb4phxx8")
names(foo)
dim(foo)
head(foo)
date.columns <- c(11,12,14,15,16,17,18,25)
for (i in date.columns) {
which_values_are_missing <- which(as.character(foo[,i]) == "")
foo[which_values_are_missing, i] <- NA
foo[,i] <- as.Date(as.character(foo[,i]))
}
#Removing NA values from data
which.have.NAs <- which(is.na(foo$CirculationDate))
new_foo <- foo[-which.have.NAs,]
#Removing rows that have a Circulation Date lower than the given value
withindate <- new_foo[which(new_foo$CirculationDate >= as.Date("2009-01-01")),]
##QUESTION 1 (a)
#removing rows from Original Completion Date that have NA Values
completion.have.NAs <- which(is.na(withindate$OriginalCompletionDate))
completion_foo <- withindate[-completion.have.NAs,]
#subtracting Approval Date from Original Completion Date and converting the time difference to months
time <- (completion_foo$OriginalCompletionDate - completion_foo$ApprovalDate)
#average project duration
mean(time)
##QUESTION 1(b)
#changing circulation dates to only have year
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")
#creating a list of unique years from the available Circulation Date data
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")
years <- unique(completion_foo$CirculationDate)
#calculating the delay and converting the unit to months
difference <- (completion_foo$RevisedCompletionDate- completion_foo$OriginalCompletionDate)
#converting the elements in the list to be numeric
difference <- as.numeric(difference)
completion_foo$difference <- difference
#Calculating the mean, median and IQR of the data in each year group
mean <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), mean)
median <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), median)
IQR <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), IQR)
#plotting the data on a line graph
plot(mean , col= "red", type= "b", pch = 15,ylim = c(300, 700),
xlab= "Circulation Year", ylab = "Delay (months)",
)
lines (median, col="green", type = "o", pch=17)
lines (IQR, col="blue", type = "o", pch = 16)
legend ("bottomleft", pch=c(15,16,17),
col= c('red', 'blue', 'green'),
legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
grid (nx=NA, ny= NULL)
##Question 1(c)
#calculating the duration between Approval Date and Revised Completion Date
duration <- (completion_foo$RevisedCompletionDate- completion_foo$ApprovalDate)
#converting the elements in the list to be numeric
duration <- as.numeric(duration)
#Adding the duration as a column in the data set
completion_foo$duration <- duration
new_years <- unique(completion_foo$RevisedCompletionDate)
#Calculating the mean, median and IQR according to year
new_mean <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), mean)
new_median <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), median)
new_IQR <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), IQR)
#plotting the data onto a line graph
plot(new_mean , col= "red", type= "b", pch = 15, ylim = c(500, 1400),
xlab= "Circulation Year", ylab = "Delay (months)")
lines (new_median, col="green", type = "o", pch=17)
lines (new_IQR, col="blue", type = "o", pch = 16)
legend ("bottomleft", pch=c(15,16,17),
col= c('red', 'blue', 'green'), cex = 0.5,
legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
axis(side = 1, at = c(0,1))
grid (nx= NA, ny = NULL)
##QUESTION 2
#removing the entries from Rating which have no values
rating.have.NAs <- which(is.na(withindate$Rating))
rating_foo <- withindate[-rating.have.NAs,]
#Excluding all ratings before 2010
present_foo <- rating_foo[which(rating_foo$RevisedCompletionDate >= 2010),]
#Putting ratings and their frequency into a table
ratings <- table(present_foo$Rating)
cbind(ratings,prop.table(ratings)) #adding column of percentage of ratings
##QUESTION 3
#Only taking PATA project data
PATA_foo <- present_foo[which(present_foo$Type=="PATA"),]
#Putting PATA Project ratings and their frequency into a table
PATA <- table(PATA_foo$Rating)
cbind(PATA, prop.table(PATA)) #adding column of percentage of ratings
##Question 4
#sorts the data frame based on the revised amount in ascending order
rating_foo$RevisedAmount <- sort(rating_foo$RevisedAmount)
#The difference in percentage ratings of the top and bottom 10%
bottom <- rating_foo[which(rating_foo$RevisedAmount <= quantile(rating_foo$RevisedAmount, 0.1)),]
top<- rating_foo[which(rating_foo$RevisedAmount >= quantile(rating_foo$RevisedAmount, 0.9)),]
(table(top$Rating) /length(top$Rating)*100) -
(table(bottom$Rating) /length(bottom$Rating)*100)
#computing the distribution of divisions among the top and bottom 10% of projects
division_bottom <- rating_foo$Division[1: tenpercent]
division_top <- rating_foo$Division[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
divisiontop_t <- table(division_top)
divisionbottom_t <- table(division_bottom)
rbind(divisiontop_t, divisionbottom_t)
#computing the distribution of countries among the top and bottom 10% of projects
country_bottom <- rating_foo$Country[1: tenpercent]
country_top <- rating_foo$Country[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
countrytop_t <- table(country_top)
countrybottom_t <- table(country_bottom)
rbind(countrytop_t, countrybottom_t)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment