# Anna-Nevm/R_competency .R

## R_competency .R
#______________________PREPROCESSING THE DATA______________________________________________________________


# Load the raw project data set.
foo <- read.csv("https://tinyurl.com/yb4phxx8")

# Positions of the columns that hold calendar dates.
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

# Convert all date columns in one pass. Within each column, empty strings are
# replaced by NA (a real missing-value marker, not a string or a number) and
# the remaining text is parsed into Date objects.
foo[date.columns] <- lapply(foo[date.columns], function(raw_column) {
  as_text <- as.character(raw_column)
  as_text[which(as_text == "")] <- NA
  as.Date(as_text)
})


#____________________ASSIGNMENT_____________________________________________________________________________________

# Keep only the projects whose Circulation Date is known.
# The positions of the missing dates are still recorded, but the rows are
# dropped with a logical mask: `foo[-which(...), ]` returns ZERO rows when no
# date is missing, because a negated empty index selects nothing.
NAs_in_Circulation_Date <- which(is.na(foo$CirculationDate))
no_NAs_foo <- foo[!is.na(foo$CirculationDate), ]

# Restrict the analysis to projects circulated on or after 1 January 2009.
# The cutoff is an explicit Date so the comparison does not rely on implicit
# character-to-Date coercion.
circulation_cutoff <- as.Date("2009-01-01")
assignment_data <- no_NAs_foo[no_NAs_foo$CirculationDate >= circulation_cutoff, ]


#__________________QUESTION 1______________________________________________________________________________________

#QUESTION 1-a: calculating the original planned project duration

# Planned duration = Original Completion Date - Approval Date.
# Subtracting two Date vectors yields a "difftime" measured in days.
difference_OC_and_AD <- assignment_data$OriginalCompletionDate -
  assignment_data$ApprovalDate

# Drop the projects whose planned duration is unknown (either date missing),
# so the summaries below are computed on complete values only.
# A logical mask is used instead of `x[-which(is.na(x))]`, which would return
# an EMPTY vector if there were no missing values at all.
difference_OC_and_AD_noNAs <- difference_OC_and_AD[!is.na(difference_OC_and_AD)]

# How many projects were dropped, and what fraction of all projects that is.
number_of_omitted_projects <- length(difference_OC_and_AD) -
  length(difference_OC_and_AD_noNAs) # 16 projects were omitted
number_of_omitted_projects / length(difference_OC_and_AD) # fraction 0.0096 = 0.96% of all projects

# The filtered vector is still of class "difftime".
class(difference_OC_and_AD_noNAs)

# Convert to a plain numeric vector of days; the unit is stated explicitly so
# the result does not depend on whatever unit the difftime happens to carry.
Planned_Project_Duration <- as.numeric(difference_OC_and_AD_noNAs, units = "days")
class(Planned_Project_Duration)

# Descriptive statistics of the planned duration (recorded results in comments).
mean(Planned_Project_Duration) # printed in days. Mean: 651.117 days = 21.7 months = 1.8 years
median(Planned_Project_Duration) # printed in days. Median: 600 days = 20 months = 1.67 years
sd(Planned_Project_Duration) / 30 / 12 # printed in years. Standard deviation: 330 days = 11 months = 0.9 years
IQR(Planned_Project_Duration) / 30 # printed in months. IQR: 12.85


#QUESTION 1-b: characterising project delays distribution, determining whether the length of project delay changed over time

# Project delay in days = Revised Completion Date - Original Completion Date.
# This must be created BEFORE the summaries below; the previous ordering used
# `project_delay` ahead of its definition, which fails in a fresh R session.
project_delay <- as.numeric(assignment_data$RevisedCompletionDate -
  assignment_data$OriginalCompletionDate)
str(project_delay)

# Summaries are computed on the known delays only. A logical mask is used
# because `x[-which(is.na(x))]` returns an empty vector when nothing is missing.
project_delay_with_no_NAs <- project_delay[!is.na(project_delay)]
mean(project_delay_with_no_NAs) # printed in days. Mean: 568 days = 18.9 months = 1.58 years
median(project_delay_with_no_NAs) # printed in days. Median: 457 days = 15.23 months = 1.27 years
quantile(project_delay_with_no_NAs) # printed in days. Range: 3837 days = 128 months = 10.65 years
IQR(project_delay_with_no_NAs) / 30 / 12 # printed in years. IQR = 516 days = 17.2 months = 1.43 years (75% quantile minus 25% quantile)

# Scatter plot of delay against circulation date. The result of plot() is not
# stored: plot() returns NULL, so there is nothing useful to keep.
plot (assignment_data$CirculationDate, project_delay, cex=0.5, pch = 16,
      xlab="Circulation Date", ylab="Project Delay",
      col.lab="darkorchid4", font.lab=2)

# Simple linear model of delay on circulation date; rows with a missing delay
# or date are dropped by na.omit.
regression <- lm(project_delay ~ assignment_data$CirculationDate, na.action = na.omit)
abline(regression, col="darkorchid4", lwd=3) # overlay the fitted regression line

# Recorded result: p-value of 0.00772 and slope of -.03 from the summary.
summary(regression)


# QUESTION 1-c: comparison between the actual and planned project durations

# Actual duration in days = Revised Completion Date - Approval Date.
# Projects with either date missing yield NA here.
actual_duration <- as.numeric(assignment_data$RevisedCompletionDate -
  assignment_data$ApprovalDate)

# Descriptive statistics of the actual duration. `na.rm = TRUE` is required:
# without it mean/median/sd return NA and quantile/IQR stop with an error as
# soon as a single duration is missing.
mean(actual_duration, na.rm = TRUE) / 30 / 12 # printed in years. Mean: 1218 days = 40.6 months = 3.4 years
median(actual_duration, na.rm = TRUE) / 30 / 12 # printed in years. Median: 1120 days = 37.3 months = 3.11 years
quantile(actual_duration, na.rm = TRUE) / 30 # printed in months. Range: 146 months - 2 months = 144 months
IQR(actual_duration, na.rm = TRUE) / 30 # printed in months. IQR: 643 days = 21 months
sd(actual_duration, na.rm = TRUE) / 30 # printed in months

# Average overrun in months: mean actual minus mean planned duration, computed
# from the data instead of the previously hard-coded `40.6-22`.
(mean(actual_duration, na.rm = TRUE) -
  mean(Planned_Project_Duration, na.rm = TRUE)) / 30


#__________________QUESTION 2______________________________________________________________________________________


# Flag the projects whose revised completion date is on or after 1 Jan 2010;
# their count is the denominator for every rating percentage below.
completed_since_2010 <- assignment_data$RevisedCompletionDate >= "2010-01-01"
all_ratings <- sum(completed_since_2010, na.rm = TRUE)
all_ratings # 1274 completed projects in total

# Percentage of those projects that received the given rating. Rows where the
# date or the rating is missing are not counted in the numerator.
share_with_rating <- function(rating_value) {
  matches <- completed_since_2010 & assignment_data$Rating == rating_value
  sum(matches, na.rm = TRUE) / all_ratings * 100
}

percentage_zero <- share_with_rating(0)
percentage_one <- share_with_rating(1)
percentage_two <- share_with_rating(2)
percentage_three <- share_with_rating(3)


percentage_zero # % of projects rated 0: 2.1%
percentage_one # % of projects rated 1: 11.5%
percentage_two # % of projects rated 2: 72%
percentage_three # % of projects rated 3: 14.2%


#__________________QUESTION 3_______________________________________________________________

# Flag the PATA projects whose revised completion date is on or after
# 1 Jan 2010; their count is the denominator for the percentages below.
pata_since_2010 <- assignment_data$RevisedCompletionDate >= "2010-01-01" &
  assignment_data$Type == "PATA"
PATA_projects <- sum(pata_since_2010, na.rm = TRUE)
PATA_projects # 274 PATA projects in total

# Percentage of those PATA projects per rating level 0..3, in that order.
pata_rating_shares <- vapply(
  0:3,
  function(rating_value) {
    matches <- pata_since_2010 & assignment_data$Rating == rating_value
    sum(matches, na.rm = TRUE) / PATA_projects * 100
  },
  numeric(1)
)
percentage_zero_PATA <- pata_rating_shares[1]
percentage_one_PATA <- pata_rating_shares[2]
percentage_two_PATA <- pata_rating_shares[3]
percentage_three_PATA <- pata_rating_shares[4]

percentage_zero_PATA # % of projects rated 0: 1.09%
percentage_one_PATA # % of projects rated 1: 8.03%
percentage_two_PATA # % of projects rated 2: 71.89%
percentage_three_PATA # % of projects rated 3: 18.6%


#__________________QUESTION 4__________________________________________________________

# Rank projects by revised funding amount and extract the two extremes.
# `descending` is now genuinely sorted from largest to smallest, so `top_ten`
# holds the 10% BEST-funded projects; previously the two orderings were
# swapped and `top_ten` actually contained the least-funded projects.
# Rows with a missing RevisedAmount sort last in both orderings.
descending <- assignment_data[order(assignment_data$RevisedAmount, decreasing = TRUE), ]
# seq_len() is used instead of 1:n so that n == 0 selects no rows rather than
# rows 1 and 0.
top_ten <- descending[seq_len(round(nrow(descending) * 0.1)), ]

ascending <- assignment_data[order(assignment_data$RevisedAmount), ]
bottom_ten <- ascending[seq_len(round(nrow(ascending) * 0.1)), ]

# Kernel density estimates of the ratings in each group; na.rm = TRUE because
# density() stops with an error if any rating is missing.
d_plot <- density(top_ten$Rating, na.rm = TRUE)
a_plot <- density(bottom_ten$Rating, na.rm = TRUE)

# Rating distribution for the top 10% funded projects (blue) and the 10% least
# funded projects (pink). ylim spans both curves so neither one is clipped.
plot(d_plot, main="", xlab="Rating", col="blue", col.lab="darkorchid4",
     font.lab=2, lwd = 1, ylim = range(d_plot$y, a_plot$y))
lines(a_plot, col="pink", lwd = 1)
legend("topleft", legend = c("Top 10% funded", "Bottom 10% funded"),
       col = c("blue", "pink"), lwd = 1, bty = "n")
#n=166


# Linear regressions of the rating on one other variable at a time - country,
# division, department, cluster. Columns are named in the formula and looked
# up through `data =`, so coefficient labels are not prefixed with the data
# frame name.
reg_Country <- lm(Rating ~ Country, data = assignment_data)
reg_Division <- lm(Rating ~ Division, data = assignment_data)
reg_Department <- lm(Rating ~ Dept, data = assignment_data)
reg_Cluster <- lm(Rating ~ Cluster, data = assignment_data)

# Recorded overall p-values from each summary.
summary(reg_Country) #p-value: 2.2e-16
summary(reg_Division) #p-value: 2.662e-09
summary(reg_Department) #p-value: 6.831e-14
summary(reg_Cluster) #p-value: 0.467
	#______________________PREPROCESSING THE DATA______________________________________________________________


	# Load the raw project data set.
	foo <- read.csv("https://tinyurl.com/yb4phxx8")

	# Positions of the columns that hold calendar dates.
	date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)

	# Convert all date columns in one pass. Within each column, empty strings are
	# replaced by NA (a real missing-value marker, not a string or a number) and
	# the remaining text is parsed into Date objects.
	foo[date.columns] <- lapply(foo[date.columns], function(raw_column) {
	  as_text <- as.character(raw_column)
	  as_text[which(as_text == "")] <- NA
	  as.Date(as_text)
	})



	#____________________ASSIGNMENT_____________________________________________________________________________________

	# Keep only the projects whose Circulation Date is known.
	# The positions of the missing dates are still recorded, but the rows are
	# dropped with a logical mask: `foo[-which(...), ]` returns ZERO rows when no
	# date is missing, because a negated empty index selects nothing.
	NAs_in_Circulation_Date <- which(is.na(foo$CirculationDate))
	no_NAs_foo <- foo[!is.na(foo$CirculationDate), ]

	# Restrict the analysis to projects circulated on or after 1 January 2009.
	# The cutoff is an explicit Date so the comparison does not rely on implicit
	# character-to-Date coercion.
	circulation_cutoff <- as.Date("2009-01-01")
	assignment_data <- no_NAs_foo[no_NAs_foo$CirculationDate >= circulation_cutoff, ]




	#__________________QUESTION 1______________________________________________________________________________________

	#QUESTION 1-a: calculating the original planned project duration

	# Planned duration = Original Completion Date - Approval Date.
	# Subtracting two Date vectors yields a "difftime" measured in days.
	difference_OC_and_AD <- assignment_data$OriginalCompletionDate -
	  assignment_data$ApprovalDate

	# Drop the projects whose planned duration is unknown (either date missing),
	# so the summaries below are computed on complete values only.
	# A logical mask is used instead of `x[-which(is.na(x))]`, which would return
	# an EMPTY vector if there were no missing values at all.
	difference_OC_and_AD_noNAs <- difference_OC_and_AD[!is.na(difference_OC_and_AD)]

	# How many projects were dropped, and what fraction of all projects that is.
	number_of_omitted_projects <- length(difference_OC_and_AD) -
	  length(difference_OC_and_AD_noNAs) # 16 projects were omitted
	number_of_omitted_projects / length(difference_OC_and_AD) # fraction 0.0096 = 0.96% of all projects

	# The filtered vector is still of class "difftime".
	class(difference_OC_and_AD_noNAs)

	# Convert to a plain numeric vector of days; the unit is stated explicitly so
	# the result does not depend on whatever unit the difftime happens to carry.
	Planned_Project_Duration <- as.numeric(difference_OC_and_AD_noNAs, units = "days")
	class(Planned_Project_Duration)

	# Descriptive statistics of the planned duration (recorded results in comments).
	mean(Planned_Project_Duration) # printed in days. Mean: 651.117 days = 21.7 months = 1.8 years
	median(Planned_Project_Duration) # printed in days. Median: 600 days = 20 months = 1.67 years
	sd(Planned_Project_Duration) / 30 / 12 # printed in years. Standard deviation: 330 days = 11 months = 0.9 years
	IQR(Planned_Project_Duration) / 30 # printed in months. IQR: 12.85




	#QUESTION 1-b: characterising project delays distribution, determining whether the length of project delay changed over time

	# Project delay in days = Revised Completion Date - Original Completion Date.
	# This must be created BEFORE the summaries below; the previous ordering used
	# `project_delay` ahead of its definition within this section.
	project_delay <- as.numeric(assignment_data$RevisedCompletionDate -
	  assignment_data$OriginalCompletionDate)
	str(project_delay)

	# Summaries are computed on the known delays only. A logical mask is used
	# because `x[-which(is.na(x))]` returns an empty vector when nothing is missing.
	project_delay_with_no_NAs <- project_delay[!is.na(project_delay)]
	mean(project_delay_with_no_NAs) # printed in days. Mean: 568 days = 18.9 months = 1.58 years
	median(project_delay_with_no_NAs) # printed in days. Median: 457 days = 15.23 months = 1.27 years
	quantile(project_delay_with_no_NAs) # printed in days. Range: 3837 days = 128 months = 10.65 years
	IQR(project_delay_with_no_NAs) / 30 / 12 # printed in years. IQR = 516 days = 17.2 months = 1.43 years (75% quantile minus 25% quantile)

	# Scatter plot of delay against circulation date. The result of plot() is not
	# stored: plot() returns NULL, so there is nothing useful to keep.
	plot (assignment_data$CirculationDate, project_delay, cex=0.5, pch = 16,
	      xlab="Circulation Date", ylab="Project Delay",
	      col.lab="darkorchid4", font.lab=2)

	# Simple linear model of delay on circulation date; rows with a missing delay
	# or date are dropped by na.omit.
	regression <- lm(project_delay ~ assignment_data$CirculationDate, na.action = na.omit)
	abline(regression, col="darkorchid4", lwd=3) # overlay the fitted regression line

	# Recorded result: p-value of 0.00772 and slope of -.03 from the summary.
	summary(regression)





	# QUESTION 1-c: comparison between the actual and planned project durations

	# Actual duration in days = Revised Completion Date - Approval Date.
	# Projects with either date missing yield NA here.
	actual_duration <- as.numeric(assignment_data$RevisedCompletionDate -
	  assignment_data$ApprovalDate)

	# Descriptive statistics of the actual duration. `na.rm = TRUE` is required:
	# without it mean/median/sd return NA and quantile/IQR stop with an error as
	# soon as a single duration is missing.
	mean(actual_duration, na.rm = TRUE) / 30 / 12 # printed in years. Mean: 1218 days = 40.6 months = 3.4 years
	median(actual_duration, na.rm = TRUE) / 30 / 12 # printed in years. Median: 1120 days = 37.3 months = 3.11 years
	quantile(actual_duration, na.rm = TRUE) / 30 # printed in months. Range: 146 months - 2 months = 144 months
	IQR(actual_duration, na.rm = TRUE) / 30 # printed in months. IQR: 643 days = 21 months
	sd(actual_duration, na.rm = TRUE) / 30 # printed in months

	# Average overrun in months: mean actual minus mean planned duration, computed
	# from the data instead of the previously hard-coded `40.6-22`.
	(mean(actual_duration, na.rm = TRUE) -
	  mean(Planned_Project_Duration, na.rm = TRUE)) / 30



	#__________________QUESTION 2______________________________________________________________________________________


	# Flag the projects whose revised completion date is on or after 1 Jan 2010;
	# their count is the denominator for every rating percentage below.
	completed_since_2010 <- assignment_data$RevisedCompletionDate >= "2010-01-01"
	all_ratings <- sum(completed_since_2010, na.rm = TRUE)
	all_ratings # 1274 completed projects in total

	# Percentage of those projects that received the given rating. Rows where the
	# date or the rating is missing are not counted in the numerator.
	share_with_rating <- function(rating_value) {
	  matches <- completed_since_2010 & assignment_data$Rating == rating_value
	  sum(matches, na.rm = TRUE) / all_ratings * 100
	}

	percentage_zero <- share_with_rating(0)
	percentage_one <- share_with_rating(1)
	percentage_two <- share_with_rating(2)
	percentage_three <- share_with_rating(3)


	percentage_zero # % of projects rated 0: 2.1%
	percentage_one # % of projects rated 1: 11.5%
	percentage_two # % of projects rated 2: 72%
	percentage_three # % of projects rated 3: 14.2%



	#__________________QUESTION 3_______________________________________________________________

	# Flag the PATA projects whose revised completion date is on or after
	# 1 Jan 2010; their count is the denominator for the percentages below.
	pata_since_2010 <- assignment_data$RevisedCompletionDate >= "2010-01-01" &
	  assignment_data$Type == "PATA"
	PATA_projects <- sum(pata_since_2010, na.rm = TRUE)
	PATA_projects # 274 PATA projects in total

	# Percentage of those PATA projects per rating level 0..3, in that order.
	pata_rating_shares <- vapply(
	  0:3,
	  function(rating_value) {
	    matches <- pata_since_2010 & assignment_data$Rating == rating_value
	    sum(matches, na.rm = TRUE) / PATA_projects * 100
	  },
	  numeric(1)
	)
	percentage_zero_PATA <- pata_rating_shares[1]
	percentage_one_PATA <- pata_rating_shares[2]
	percentage_two_PATA <- pata_rating_shares[3]
	percentage_three_PATA <- pata_rating_shares[4]

	percentage_zero_PATA # % of projects rated 0: 1.09%
	percentage_one_PATA # % of projects rated 1: 8.03%
	percentage_two_PATA # % of projects rated 2: 71.89%
	percentage_three_PATA # % of projects rated 3: 18.6%



	#__________________QUESTION 4__________________________________________________________

	# Rank projects by revised funding amount and extract the two extremes.
	# `descending` is now genuinely sorted from largest to smallest, so `top_ten`
	# holds the 10% BEST-funded projects; previously the two orderings were
	# swapped and `top_ten` actually contained the least-funded projects.
	# Rows with a missing RevisedAmount sort last in both orderings.
	descending <- assignment_data[order(assignment_data$RevisedAmount, decreasing = TRUE), ]
	# seq_len() is used instead of 1:n so that n == 0 selects no rows rather than
	# rows 1 and 0.
	top_ten <- descending[seq_len(round(nrow(descending) * 0.1)), ]

	ascending <- assignment_data[order(assignment_data$RevisedAmount), ]
	bottom_ten <- ascending[seq_len(round(nrow(ascending) * 0.1)), ]

	# Kernel density estimates of the ratings in each group; na.rm = TRUE because
	# density() stops with an error if any rating is missing.
	d_plot <- density(top_ten$Rating, na.rm = TRUE)
	a_plot <- density(bottom_ten$Rating, na.rm = TRUE)

	# Rating distribution for the top 10% funded projects (blue) and the 10% least
	# funded projects (pink). ylim spans both curves so neither one is clipped.
	plot(d_plot, main="", xlab="Rating", col="blue", col.lab="darkorchid4",
	     font.lab=2, lwd = 1, ylim = range(d_plot$y, a_plot$y))
	lines(a_plot, col="pink", lwd = 1)
	legend("topleft", legend = c("Top 10% funded", "Bottom 10% funded"),
	       col = c("blue", "pink"), lwd = 1, bty = "n")
	#n=166


	# Linear regressions of the rating on one other variable at a time - country,
	# division, department, cluster. Columns are named in the formula and looked
	# up through `data =`, so coefficient labels are not prefixed with the data
	# frame name.
	reg_Country <- lm(Rating ~ Country, data = assignment_data)
	reg_Division <- lm(Rating ~ Division, data = assignment_data)
	reg_Department <- lm(Rating ~ Dept, data = assignment_data)
	reg_Cluster <- lm(Rating ~ Cluster, data = assignment_data)

	# Recorded overall p-values from each summary.
	summary(reg_Country) #p-value: 2.2e-16
	summary(reg_Division) #p-value: 2.662e-09
	summary(reg_Department) #p-value: 6.831e-14
	summary(reg_Cluster) #p-value: 0.467