anglilian/CS112 R Assignment

## CS112 R Assignment
##SETTING UP DATA
foo <- read.csv("https://tinyurl.com/yb4phxx8")

names(foo)

dim(foo)
head(foo)
date.columns <- c(11,12,14,15,16,17,18,25)

for (i in date.columns) {
  which_values_are_missing <- which(as.character(foo[,i]) == "")
  foo[which_values_are_missing, i] <- NA
  foo[,i] <- as.Date(as.character(foo[,i]))
}

#Removing NA values from data
which.have.NAs <- which(is.na(foo$CirculationDate))
new_foo <- foo[-which.have.NAs,]

#Removing rows that have a Circulation Date lower than the given value
withindate <- new_foo[which(new_foo$CirculationDate >= as.Date("2009-01-01")),]

##QUESTION 1 (a)
#removing rows from Original Completion Date that have NA Values
completion.have.NAs <- which(is.na(withindate$OriginalCompletionDate))
completion_foo <- withindate[-completion.have.NAs,]

#subtracting Approval Date from Original Completion Date and converting the time difference to months
time <- (completion_foo$OriginalCompletionDate - completion_foo$ApprovalDate)

#average project duration
mean(time)


##QUESTION 1(b)
#changing circulation dates to only have year
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")

#creating a list of unique years from the available Circulation Date data
completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")
years <- unique(completion_foo$CirculationDate)

#calculating the delay and converting the unit to months
difference <- (completion_foo$RevisedCompletionDate- completion_foo$OriginalCompletionDate)

#converting the elements in the list to be numeric
difference <- as.numeric(difference)
completion_foo$difference <- difference

#Calculating the mean, median and IQR of the data in each year group
mean <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), mean)
median <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), median)
IQR <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), IQR)

#plotting the data on a line graph
plot(mean ,  col= "red", type= "b", pch = 15,ylim = c(300, 700),
     xlab= "Circulation Year", ylab = "Delay (months)",
     )
lines (median, col="green", type = "o", pch=17)
lines (IQR, col="blue", type = "o", pch = 16)
legend ("bottomleft", pch=c(15,16,17),
        col= c('red', 'blue', 'green'),
        legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
grid (nx=NA, ny= NULL)

##Question 1(c)
#calculating the duration between Approval Date and Revised Completion Date
duration <- (completion_foo$RevisedCompletionDate- completion_foo$ApprovalDate)

#converting the elements in the list to be numeric
duration <- as.numeric(duration)

#Adding the duration as a column in the data set
completion_foo$duration <- duration

new_years <- unique(completion_foo$RevisedCompletionDate)

#Calculating the mean, median and IQR according to year
new_mean <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), mean)
new_median <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), median)
new_IQR <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), IQR)

#plotting the data onto a line graph
plot(new_mean ,  col= "red", type= "b", pch = 15, ylim = c(500, 1400),
     xlab= "Circulation Year", ylab = "Delay (months)")
lines (new_median, col="green", type = "o", pch=17)
lines (new_IQR, col="blue", type = "o", pch = 16)
legend ("bottomleft", pch=c(15,16,17),
        col= c('red', 'blue', 'green'), cex = 0.5,
        legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
axis(side = 1, at = c(0,1))
grid (nx= NA, ny = NULL)

##QUESTION 2
#removing the entries from Rating which have no values
rating.have.NAs <- which(is.na(withindate$Rating))
rating_foo <- withindate[-rating.have.NAs,]

#Excluding all ratings before 2010
present_foo <- rating_foo[which(rating_foo$RevisedCompletionDate >= 2010),]

#Putting ratings and their frequency into a table
ratings <- table(present_foo$Rating)
cbind(ratings,prop.table(ratings)) #adding column of percentage of ratings

##QUESTION 3
#Only taking PATA project data
PATA_foo <- present_foo[which(present_foo$Type=="PATA"),]

#Putting PATA Project ratings and their frequency into a table
PATA <- table(PATA_foo$Rating)
cbind(PATA, prop.table(PATA)) #adding column of percentage of ratings

##Question 4
#sorts the data frame based on the revised amount in ascending order
rating_foo$RevisedAmount <- sort(rating_foo$RevisedAmount)

#The difference in percentage ratings of the top and bottom 10%
bottom <- rating_foo[which(rating_foo$RevisedAmount <= quantile(rating_foo$RevisedAmount, 0.1)),]
top<- rating_foo[which(rating_foo$RevisedAmount >= quantile(rating_foo$RevisedAmount, 0.9)),]
(table(top$Rating) /length(top$Rating)*100) -
  (table(bottom$Rating) /length(bottom$Rating)*100)

#computing the distribution of divisions among the top and bottom 10% of projects
division_bottom <- rating_foo$Division[1: tenpercent]
division_top <- rating_foo$Division[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
divisiontop_t <- table(division_top)
divisionbottom_t <- table(division_bottom)
rbind(divisiontop_t, divisionbottom_t)

#computing the distribution of countries among the top and bottom 10% of projects
country_bottom <- rating_foo$Country[1: tenpercent]
country_top <- rating_foo$Country[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
countrytop_t <- table(country_top)
countrybottom_t <- table(country_bottom)
rbind(countrytop_t, countrybottom_t)
	##SETTING UP DATA
	foo <- read.csv("https://tinyurl.com/yb4phxx8")

	names(foo)

	dim(foo)
	head(foo)
	date.columns <- c(11,12,14,15,16,17,18,25)

	for (i in date.columns) {
	which_values_are_missing <- which(as.character(foo[,i]) == "")
	foo[which_values_are_missing, i] <- NA
	foo[,i] <- as.Date(as.character(foo[,i]))
	}

	#Removing NA values from data
	which.have.NAs <- which(is.na(foo$CirculationDate))
	new_foo <- foo[-which.have.NAs,]

	#Removing rows that have a Circulation Date lower than the given value
	withindate <- new_foo[which(new_foo$CirculationDate >= as.Date("2009-01-01")),]

	##QUESTION 1 (a)
	#removing rows from Original Completion Date that have NA Values
	completion.have.NAs <- which(is.na(withindate$OriginalCompletionDate))
	completion_foo <- withindate[-completion.have.NAs,]

	#subtracting Approval Date from Original Completion Date and converting the time difference to months
	time <- (completion_foo$OriginalCompletionDate - completion_foo$ApprovalDate)

	#average project duration
	mean(time)


	##QUESTION 1(b)
	#changing circulation dates to only have year
	completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")

	#creating a list of unique years from the available Circulation Date data
	completion_foo$CirculationDate <- format(completion_foo$CirculationDate, format = "%Y")
	years <- unique(completion_foo$CirculationDate)

	#calculating the delay and converting the unit to months
	difference <- (completion_foo$RevisedCompletionDate- completion_foo$OriginalCompletionDate)

	#converting the elements in the list to be numeric
	difference <- as.numeric(difference)
	completion_foo$difference <- difference

	#Calculating the mean, median and IQR of the data in each year group
	mean <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), mean)
	median <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), median)
	IQR <- aggregate(completion_foo$difference, by =list(completion_foo$CirculationDate), IQR)

	#plotting the data on a line graph
	plot(mean , col= "red", type= "b", pch = 15,ylim = c(300, 700),
	xlab= "Circulation Year", ylab = "Delay (months)",
	)
	lines (median, col="green", type = "o", pch=17)
	lines (IQR, col="blue", type = "o", pch = 16)
	legend ("bottomleft", pch=c(15,16,17),
	col= c('red', 'blue', 'green'),
	legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
	grid (nx=NA, ny= NULL)

	##Question 1(c)
	#calculating the duration between Approval Date and Revised Completion Date
	duration <- (completion_foo$RevisedCompletionDate- completion_foo$ApprovalDate)

	#converting the elements in the list to be numeric
	duration <- as.numeric(duration)

	#Adding the duration as a column in the data set
	completion_foo$duration <- duration

	new_years <- unique(completion_foo$RevisedCompletionDate)

	#Calculating the mean, median and IQR according to year
	new_mean <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), mean)
	new_median <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), median)
	new_IQR <- aggregate(completion_foo$duration, by =list(completion_foo$CirculationDate), IQR)

	#plotting the data onto a line graph
	plot(new_mean , col= "red", type= "b", pch = 15, ylim = c(500, 1400),
	xlab= "Circulation Year", ylab = "Delay (months)")
	lines (new_median, col="green", type = "o", pch=17)
	lines (new_IQR, col="blue", type = "o", pch = 16)
	legend ("bottomleft", pch=c(15,16,17),
	col= c('red', 'blue', 'green'), cex = 0.5,
	legend = c("Mean Delay", "Median Delay", "IQR of Delays"))
	axis(side = 1, at = c(0,1))
	grid (nx= NA, ny = NULL)

	##QUESTION 2
	#removing the entries from Rating which have no values
	rating.have.NAs <- which(is.na(withindate$Rating))
	rating_foo <- withindate[-rating.have.NAs,]

	#Excluding all ratings before 2010
	present_foo <- rating_foo[which(rating_foo$RevisedCompletionDate >= 2010),]

	#Putting ratings and their frequency into a table
	ratings <- table(present_foo$Rating)
	cbind(ratings,prop.table(ratings)) #adding column of percentage of ratings

	##QUESTION 3
	#Only taking PATA project data
	PATA_foo <- present_foo[which(present_foo$Type=="PATA"),]

	#Putting PATA Project ratings and their frequency into a table
	PATA <- table(PATA_foo$Rating)
	cbind(PATA, prop.table(PATA)) #adding column of percentage of ratings

	##Question 4
	#sorts the data frame based on the revised amount in ascending order
	rating_foo$RevisedAmount <- sort(rating_foo$RevisedAmount)

	#The difference in percentage ratings of the top and bottom 10%
	bottom <- rating_foo[which(rating_foo$RevisedAmount <= quantile(rating_foo$RevisedAmount, 0.1)),]
	top<- rating_foo[which(rating_foo$RevisedAmount >= quantile(rating_foo$RevisedAmount, 0.9)),]
	(table(top$Rating) /length(top$Rating)*100) -
	(table(bottom$Rating) /length(bottom$Rating)*100)

	#computing the distribution of divisions among the top and bottom 10% of projects
	division_bottom <- rating_foo$Division[1: tenpercent]
	division_top <- rating_foo$Division[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
	divisiontop_t <- table(division_top)
	divisionbottom_t <- table(division_bottom)
	rbind(divisiontop_t, divisionbottom_t)

	#computing the distribution of countries among the top and bottom 10% of projects
	country_bottom <- rating_foo$Country[1: tenpercent]
	country_top <- rating_foo$Country[(length(rating_foo$RevisedAmount)- tenpercent): length(rating_foo$RevisedAmount)]
	countrytop_t <- table(country_top)
	countrybottom_t <- table(country_bottom)
	rbind(countrytop_t, countrybottom_t)