# Last active
# September 26, 2020 03:49
# -
# -
# Save Anna-Nevm/f3e7b8c113493442487f06b4f955a4b6 to your computer and use it in GitHub Desktop.
# Analyzing projects sponsored by multilateral development institutions
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# PREPROCESSING THE DATA ----
# Read the raw project table from the remote CSV.
foo <- read.csv("https://tinyurl.com/yb4phxx8")

# Positions of the columns that hold calendar dates.
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

# Convert every date column in one pass:
#   1. work on the character form (the column may arrive as factor or text),
#   2. turn empty strings into NA so they are treated as missing values
#      rather than as unparsable text,
#   3. parse what remains as Date.
foo[date.columns] <- lapply(foo[date.columns], function(column) {
  text <- as.character(column)
  # `!is.na(text)` keeps the mask free of NA when a cell is already missing.
  text[!is.na(text) & text == ""] <- NA
  as.Date(text)
})
# ASSIGNMENT ----
# Row positions whose Circulation Date is missing (kept for reference).
NAs_in_Circulation_Date <- which(is.na(foo$CirculationDate))

# Drop those rows with a logical mask. Negative indexing with
# `foo[-which(...), ]` would return ZERO rows whenever nothing is missing,
# because `-integer(0)` is an empty index.
no_NAs_foo <- foo[!is.na(foo$CirculationDate), ]

# Keep only projects circulated on or after 1 January 2009.
circulation_cutoff <- as.Date("2009-01-01")
assignment_data <- no_NAs_foo[no_NAs_foo$CirculationDate >= circulation_cutoff, ]
# QUESTION 1 ----
# QUESTION 1-a: originally planned project duration.

# Days between approval and the originally planned completion date
# (Date - Date yields a difftime measured in days).
difference_OC_and_AD <- assignment_data$OriginalCompletionDate -
  assignment_data$ApprovalDate

# Keep only projects where both dates are known. A logical mask is used
# instead of `x[-which(is.na(x))]`, which silently returns an EMPTY vector
# when no value is missing.
difference_OC_and_AD_noNAs <-
  difference_OC_and_AD[!is.na(difference_OC_and_AD)]

# How many projects were dropped because either date was missing.
number_of_omitted_projects <-
  length(difference_OC_and_AD) - length(difference_OC_and_AD_noNAs)
# Author's recorded result: 16 projects omitted.

# Printed as a fraction of all projects (author recorded ~0.0096, i.e. 0.96%).
number_of_omitted_projects / length(difference_OC_and_AD)

# The difference vector is of class "difftime".
class(difference_OC_and_AD_noNAs)

# Convert to a plain numeric count of days so that every summary below
# returns an ordinary number without a units attribute.
Planned_Project_Duration <- as.numeric(difference_OC_and_AD_noNAs)
class(Planned_Project_Duration)

# Descriptive statistics. Conversions assume 30-day months and 12-month
# years. Figures in the comments are the author's recorded results.
mean(Planned_Project_Duration)        # days;   recorded 651.117 (~21.7 months)
median(Planned_Project_Duration)      # days;   recorded 600 (~20 months)
sd(Planned_Project_Duration) / 30 / 12  # YEARS;  recorded ~0.9 (330 days)
IQR(Planned_Project_Duration) / 30    # MONTHS; recorded 12.85
# QUESTION 1-b: distribution of project delays and whether the delay
# changed over time.

# Delay in days = revised completion date - original completion date.
# This must be defined BEFORE it is summarised; previously it was created
# only after its first use, so a fresh run of the script failed.
project_delay <- as.numeric(
  assignment_data$RevisedCompletionDate - assignment_data$OriginalCompletionDate
)
str(project_delay)

# Drop missing delays with a logical mask (`x[-which(is.na(x))]` would
# return an empty vector if nothing were missing).
project_delay_with_no_NAs <- project_delay[!is.na(project_delay)]

# Descriptive statistics. Conversions assume 30-day months and 12-month
# years. Figures in the comments are the author's recorded results.
mean(project_delay_with_no_NAs)       # days;  recorded 568 (~18.9 months)
median(project_delay_with_no_NAs)     # days;  recorded 457 (~15.2 months)
quantile(project_delay_with_no_NAs)   # days;  recorded range 3837 (max - min)
IQR(project_delay_with_no_NAs) / 30 / 12  # YEARS; recorded ~1.43 (516 days)

# Scatter plot of delay against circulation date.
# plot() returns NULL invisibly; the assignment is kept only so the name
# `regression_plot` continues to exist for any later code that refers to it.
regression_plot <- plot(
  assignment_data$CirculationDate, project_delay,
  cex = 0.5, pch = 16,
  xlab = "Circulation Date", ylab = "Project Delay",
  col.lab = "darkorchid4", font.lab = 2
)

# Simple linear model of delay on circulation date. Rows with a missing
# delay or date are dropped explicitly via na.action = na.omit.
regression <- lm(
  project_delay ~ assignment_data$CirculationDate,
  na.action = na.omit
)

# Overlay the fitted line on the scatter plot.
abline(regression, col = "darkorchid4", lwd = 3)

# Author's recorded results from the summary: p-value 0.00772, slope -0.03.
summary(regression)
# QUESTION 1-c: actual project duration, for comparison with the plan.

# Actual duration in days = revised completion date - approval date.
actual_duration <- as.numeric(
  assignment_data$RevisedCompletionDate - assignment_data$ApprovalDate
)

# Descriptive statistics. `na.rm = TRUE` makes these robust to a missing
# date, consistent with the NA filtering applied in 1-a and 1-b: without
# it mean/median/sd return NA and quantile/IQR stop with an error.
# Conversions assume 30-day months and 12-month years. Figures in the
# comments are the author's recorded results.
mean(actual_duration, na.rm = TRUE) / 30 / 12    # YEARS;  recorded 3.4 (1218 d)
median(actual_duration, na.rm = TRUE) / 30 / 12  # YEARS;  recorded 3.11 (1120 d)
quantile(actual_duration, na.rm = TRUE) / 30     # MONTHS; recorded range 144
IQR(actual_duration, na.rm = TRUE) / 30          # MONTHS; recorded 21 (643 d)
sd(actual_duration, na.rm = TRUE) / 30           # MONTHS

# Gap in months between the recorded mean actual duration (40.6) and the
# recorded mean planned duration (21.7, rounded to 22 by the author).
40.6 - 22
# QUESTION 2 ----
# Share of each rating among projects whose revised completion date is on
# or after 1 January 2010.

# Row positions of the completed projects. which() drops rows where the
# completion date is missing. The filter is computed once and reused
# instead of being repeated in every percentage.
completed_rows <- which(
  assignment_data$RevisedCompletionDate >= as.Date("2010-01-01")
)

# Denominator for the percentages below.
all_ratings <- length(completed_rows)
all_ratings  # author's recorded result: 1274 completed projects

# Ratings of exactly those projects.
completed_ratings <- assignment_data$Rating[completed_rows]

# Percentage of completed projects with each rating 0..3. A missing rating
# is never counted in a numerator but stays in the denominator, exactly as
# with the original which()-based counts.
rating_levels <- 0:3
rating_share <- vapply(
  rating_levels,
  function(level) {
    sum(completed_ratings == level, na.rm = TRUE) / all_ratings * 100
  },
  numeric(1)
)

percentage_zero <- rating_share[[1]]
percentage_one <- rating_share[[2]]
percentage_two <- rating_share[[3]]
percentage_three <- rating_share[[4]]

# Figures in the comments are the author's recorded results.
percentage_zero   # % of projects rated 0: 2.1%
percentage_one    # % of projects rated 1: 11.5%
percentage_two    # % of projects rated 2: 72%
percentage_three  # % of projects rated 3: 14.2%
# QUESTION 3 ----
# Share of each rating among PATA-type projects whose revised completion
# date is on or after 1 January 2010.

# Row positions of completed PATA projects. which() drops rows where the
# date or the type is missing. The filter is computed once and reused.
pata_rows <- which(
  assignment_data$RevisedCompletionDate >= as.Date("2010-01-01") &
    assignment_data$Type == "PATA"
)

# Denominator for the percentages below.
PATA_projects <- length(pata_rows)
PATA_projects  # author's recorded result: 274 PATA projects

# Ratings of exactly those projects.
pata_ratings <- assignment_data$Rating[pata_rows]

# Percentage of completed PATA projects with each rating 0..3. A missing
# rating is never counted in a numerator but stays in the denominator,
# exactly as with the original which()-based counts.
pata_rating_share <- vapply(
  0:3,
  function(level) {
    sum(pata_ratings == level, na.rm = TRUE) / PATA_projects * 100
  },
  numeric(1)
)

percentage_zero_PATA <- pata_rating_share[[1]]
percentage_one_PATA <- pata_rating_share[[2]]
percentage_two_PATA <- pata_rating_share[[3]]
percentage_three_PATA <- pata_rating_share[[4]]

# Figures in the comments are the author's recorded results.
percentage_zero_PATA   # % of projects rated 0: 1.09%
percentage_one_PATA    # % of projects rated 1: 8.03%
percentage_two_PATA    # % of projects rated 2: 71.89%
percentage_three_PATA  # % of projects rated 3: 18.6%
# QUESTION 4 ----
# Compare the rating distribution of the 10% best-funded projects with
# that of the 10% least-funded projects (by RevisedAmount).

# Projects sorted from largest to smallest revised amount. Previously this
# used order(x), which sorts ASCENDING, so `descending`/`top_ten` actually
# held the least-funded projects and `ascending`/`bottom_ten` the
# best-funded ones. Missing amounts sort last in both orderings.
descending <- assignment_data[
  order(assignment_data$RevisedAmount, decreasing = TRUE),
]
# seq_len() instead of 1:n so that an empty selection stays empty
# (1:0 would pick row 1 and then fail on row 0).
top_ten <- descending[seq_len(round(nrow(descending) * 0.1)), ]

# Projects sorted from smallest to largest revised amount.
ascending <- assignment_data[order(assignment_data$RevisedAmount), ]
bottom_ten <- ascending[seq_len(round(nrow(ascending) * 0.1)), ]

# Kernel density estimates of the ratings in each group. na.rm = TRUE
# because density() stops with an error on missing values otherwise.
d_plot <- density(top_ten$Rating, na.rm = TRUE)
a_plot <- density(bottom_ten$Rating, na.rm = TRUE)

# Rating distribution: blue = top 10% funded, pink = bottom 10% funded.
plot(
  d_plot,
  main = "", xlab = "Rating",
  col = "blue", col.lab = "darkorchid4", font.lab = 2, lwd = 1
)
lines(a_plot, col = "pink", lwd = 1)
legend(
  "topleft",
  legend = c("Top 10% funded", "Bottom 10% funded"),
  col = c("blue", "pink"), lwd = 1, bty = "n"
)
# Author's recorded group size: n = 166.

# Linear regressions of the rating on one categorical predictor at a time.
# Columns are named in the formula and resolved through `data =`, instead
# of mixing `assignment_data$col` with a data argument.
reg_Country <- lm(Rating ~ Country, data = assignment_data)
reg_Division <- lm(Rating ~ Division, data = assignment_data)
reg_Department <- lm(Rating ~ Dept, data = assignment_data)
reg_Cluster <- lm(Rating ~ Cluster, data = assignment_data)

# Figures in the comments are the author's recorded overall p-values.
summary(reg_Country)     # p-value: 2.2e-16
summary(reg_Division)    # p-value: 2.662e-09
summary(reg_Department)  # p-value: 6.831e-14
summary(reg_Cluster)     # p-value: 0.467
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment