Skip to content

Instantly share code, notes, and snippets.

@Anna-Nevm
Last active September 26, 2020 03:49
Show Gist options
  • Save Anna-Nevm/f3e7b8c113493442487f06b4f955a4b6 to your computer and use it in GitHub Desktop.
Save Anna-Nevm/f3e7b8c113493442487f06b4f955a4b6 to your computer and use it in GitHub Desktop.
Analyzing projects sponsored by multilateral development institutions
# ---- Preprocessing the data ----
# Load the project data set from the remote CSV file.
foo <- read.csv("https://tinyurl.com/yb4phxx8")

# Positions of the columns that hold calendar dates.
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

# Convert all date columns in one pass. Empty strings are turned into NA
# (a genuine missing-value marker, not a string or a number) and the
# remaining text is parsed into Date values.
foo[date.columns] <- lapply(foo[date.columns], function(raw_column) {
  date_text <- as.character(raw_column)
  date_text[which(date_text == "")] <- NA
  as.Date(date_text)
})
# ---- Assignment ----
# Row positions whose CirculationDate is missing (kept for reference).
NAs_in_Circulation_Date <- which(is.na(foo$CirculationDate))

# Drop the rows with a missing CirculationDate using a logical mask.
# Negative indexing with which() is unsafe here: when nothing is missing,
# which() returns integer(0) and foo[-integer(0), ] selects zero rows
# instead of all of them.
no_NAs_foo <- foo[!is.na(foo$CirculationDate), ]

# Keep only the projects whose CirculationDate is on or after Jan. 1, 2009.
# The cutoff is built as a Date so the comparison does not rely on implicit
# character-to-Date coercion.
circulation_cutoff <- as.Date("2009-01-01")
assignment_data <- no_NAs_foo[no_NAs_foo$CirculationDate >= circulation_cutoff, ]
# ---- Question 1 ----
# Question 1-a: originally planned project duration.

# Difference between OriginalCompletionDate and ApprovalDate. Subtracting two
# Date vectors yields a "difftime" measured in days.
difference_OC_and_AD <- assignment_data$OriginalCompletionDate -
  assignment_data$ApprovalDate

# Remove projects where either date is missing. A logical mask is used instead
# of -which(...): with no NAs, which() returns integer(0) and x[-integer(0)]
# would produce an empty vector rather than the full one.
difference_OC_and_AD_noNAs <- difference_OC_and_AD[!is.na(difference_OC_and_AD)]

# How many projects were dropped because a date was missing, and what
# proportion of all projects that is.
number_of_omitted_projects <- length(difference_OC_and_AD) -
  length(difference_OC_and_AD_noNAs) # author's run: 16 projects omitted
# Prints a proportion (author's run: about 0.0096, i.e. 0.96% of projects).
number_of_omitted_projects / length(difference_OC_and_AD)

# The difference vector has class "difftime".
class(difference_OC_and_AD_noNAs)

# Convert to a plain numeric number of days so the summaries below print as
# bare numbers; the unit is stated explicitly rather than left implicit.
Planned_Project_Duration <- as.numeric(difference_OC_and_AD_noNAs, units = "days")
class(Planned_Project_Duration)

# Descriptive statistics of the planned duration. Divisions by 30 and 12
# convert days to months and months to years (30-day months assumed).
# The values in the trailing comments are the author's recorded results.
mean(Planned_Project_Duration) # days; recorded: 651.117 days = 21.7 months = 1.8 years
median(Planned_Project_Duration) # days; recorded: 600 days = 20 months = 1.67 years
sd(Planned_Project_Duration) / 30 / 12 # years; recorded: 330 days = 11 months = 0.9 years
IQR(Planned_Project_Duration) / 30 # months; recorded: 12.85
# Question 1-b: distribution of project delays and whether the delay length
# changed over time.

# Project delay in days: RevisedCompletionDate minus OriginalCompletionDate.
# This must be defined before its first use; the summaries below previously
# referenced project_delay before it existed, which fails in a fresh session.
project_delay <- as.numeric(
  assignment_data$RevisedCompletionDate - assignment_data$OriginalCompletionDate
)
str(project_delay)

# Drop missing delays with a logical mask. -which(...) would return an empty
# vector if there were no NAs at all.
project_delay_with_no_NAs <- project_delay[!is.na(project_delay)]

# Descriptive statistics of the delay. Trailing comments are the author's
# recorded results; /30/12 converts days to years (30-day months assumed).
mean(project_delay_with_no_NAs) # days; recorded: 568 days = 18.9 months = 1.58 years
median(project_delay_with_no_NAs) # days; recorded: 457 days = 15.23 months = 1.27 years
quantile(project_delay_with_no_NAs) # days; recorded range: 3837 days = 128 months = 10.65 years
IQR(project_delay_with_no_NAs) / 30 / 12 # years; recorded: 516 days = 17.2 months = 1.43 years

# Scatter plot of project delay against the project's circulation date.
# NOTE: base plot() returns NULL, so regression_plot does not hold a plot
# object; the assignment is kept only so the name continues to exist.
regression_plot <- plot(
  assignment_data$CirculationDate, project_delay,
  cex = 0.5, pch = 16,
  xlab = "Circulation Date", ylab = "Project Delay",
  col.lab = "darkorchid4", font.lab = 2
)

# Simple linear model of delay on circulation date; rows with missing values
# are dropped via na.action = na.omit.
regression <- lm(project_delay ~ assignment_data$CirculationDate, na.action = na.omit)

# Overlay the fitted regression line on the scatter plot.
abline(regression, col = "darkorchid4", lwd = 3)

# Regression summary (author's recorded p-value: 0.00772, slope: -0.03).
summary(regression)
# Question 1-c: comparison between the actual and planned project durations.

# Actual duration in days: RevisedCompletionDate minus ApprovalDate.
actual_duration <- as.numeric(
  assignment_data$RevisedCompletionDate - assignment_data$ApprovalDate
)

# Descriptive statistics of the actual duration. na.rm = TRUE drops projects
# with a missing date, matching how missing values are excluded for the
# planned duration and the delay; without it mean/median/sd return NA and
# quantile()/IQR() stop with an error as soon as one value is missing.
# Divisions by 30 and 12 convert days to months and years (30-day months
# assumed). Trailing comments are the author's recorded results.
mean(actual_duration, na.rm = TRUE) / 30 / 12 # years; recorded: 1218 days = 40.6 months = 3.4 years
median(actual_duration, na.rm = TRUE) / 30 / 12 # years; recorded: 1120 days = 37.3 months = 3.11 years
quantile(actual_duration, na.rm = TRUE) / 30 # months; recorded range: 146 - 2 = 144 months
IQR(actual_duration, na.rm = TRUE) / 30 # months; recorded: 643 days = 21 months
sd(actual_duration, na.rm = TRUE) / 30 # months

# Hard-coded difference in months. NOTE(review): presumably the recorded mean
# actual duration (40.6 months) minus the rounded mean planned duration
# (about 22 months) -- confirm against the figures above.
40.6 - 22
# ---- Question 2 ----
# Flag the projects whose revised completion date is on or after Jan. 1, 2010.
completed_2010_onward <- assignment_data$RevisedCompletionDate >= "2010-01-01"

# Total number of such projects; used as the denominator for every percentage.
all_ratings <- sum(completed_2010_onward, na.rm = TRUE)
all_ratings # author's run: 1274 completed projects in total

# Percentage of the flagged projects that received the given rating.
# Rows where the flag or the rating comparison is NA are not counted.
rating_share <- function(rating_value) {
  matches <- completed_2010_onward & assignment_data$Rating == rating_value
  sum(matches, na.rm = TRUE) / all_ratings * 100
}

percentage_zero <- rating_share(0)
percentage_one <- rating_share(1)
percentage_two <- rating_share(2)
percentage_three <- rating_share(3)

# Author's recorded results are given in the trailing comments.
percentage_zero # % of projects rated 0: 2.1%
percentage_one # % of projects rated 1: 11.5%
percentage_two # % of projects rated 2: 72%
percentage_three # % of projects rated 3: 14.2%
# ---- Question 3 ----
# Flag PATA-type projects whose revised completion date is on or after
# Jan. 1, 2010.
pata_2010_onward <- assignment_data$RevisedCompletionDate >= "2010-01-01" &
  assignment_data$Type == "PATA"

# Number of such PATA projects; denominator for the percentages below.
PATA_projects <- length(which(pata_2010_onward))
PATA_projects # author's run: 274 PATA projects in total

# Percentage of the flagged PATA projects for each rating 0, 1, 2 and 3,
# computed in one pass (element k + 1 belongs to rating k).
pata_rating_percentages <- vapply(
  0:3,
  function(rating_value) {
    rated <- which(pata_2010_onward & assignment_data$Rating == rating_value)
    length(rated) / PATA_projects * 100
  },
  numeric(1)
)

percentage_zero_PATA <- pata_rating_percentages[1]
percentage_one_PATA <- pata_rating_percentages[2]
percentage_two_PATA <- pata_rating_percentages[3]
percentage_three_PATA <- pata_rating_percentages[4]

# Author's recorded results are given in the trailing comments.
percentage_zero_PATA # % of projects rated 0: 1.09%
percentage_one_PATA # % of projects rated 1: 8.03%
percentage_two_PATA # % of projects rated 2: 71.89%
percentage_three_PATA # % of projects rated 3: 18.6%
# ---- Question 4 ----
# Compare the rating distribution of the 10% most-funded and the 10%
# least-funded projects, ranked by RevisedAmount.
#
# order() sorts in increasing order by default, so the "descending" table has
# to request decreasing = TRUE. Previously the two orderings were swapped,
# which put the least-funded projects in top_ten and the most-funded ones in
# bottom_ten. Rows with a missing RevisedAmount are sorted last either way.
descending <- assignment_data[order(assignment_data$RevisedAmount, decreasing = TRUE), ]
ascending <- assignment_data[order(assignment_data$RevisedAmount), ]

# Size of a 10% slice. head() is used instead of 1:n so that a slice size of
# zero yields no rows (1:0 would silently select row 1).
ten_percent_count <- round(nrow(assignment_data) * 0.1)
top_ten <- head(descending, ten_percent_count)
bottom_ten <- head(ascending, ten_percent_count)

# Kernel density estimates of the ratings in each group, for the density plot.
# na.rm = TRUE keeps density() from stopping on a missing rating.
d_plot <- density(top_ten$Rating, na.rm = TRUE)
a_plot <- density(bottom_ten$Rating, na.rm = TRUE)

# Rating distribution of the top 10% funded projects (blue) and the 10% least
# funded projects (pink). Author's recorded group size: n = 166.
plot(
  d_plot,
  main = "", xlab = "Rating", col = "blue",
  col.lab = "darkorchid4", font.lab = 2, lwd = 1
)
lines(a_plot, col = "pink", lwd = 1)
legend(
  "topleft",
  legend = c("Top 10% by revised amount", "Bottom 10% by revised amount"),
  col = c("blue", "pink"), lwd = 1
)

# Linear regressions of Rating on other variables: country, division,
# department and cluster. Columns are named in the formula and resolved
# through `data`, rather than using assignment_data$... inside the formula,
# so coefficient labels are clean and predict() works on new data.
reg_Country <- lm(Rating ~ Country, data = assignment_data)
reg_Division <- lm(Rating ~ Division, data = assignment_data)
reg_Department <- lm(Rating ~ Dept, data = assignment_data)
reg_Cluster <- lm(Rating ~ Cluster, data = assignment_data)

# Model summaries; trailing comments are the author's recorded p-values.
summary(reg_Country) # p-value: 2.2e-16
summary(reg_Division) # p-value: 2.662e-09
summary(reg_Department) # p-value: 6.831e-14
summary(reg_Cluster) # p-value: 0.467
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment