Bojne/CS112-Assignment-1.R

## CS112-Assignment-1.R
library("dplyr")
library("ggplot2")

RawData <- read.csv("https://tinyurl.com/yb4phxx8") # read in the Multilateral Development Institution Data

# {Step 0} Explore the data
names(RawData) # dimensions of the data set
dim(RawData) # quick look at the data structure

# {Step 1} Preprocess the data
PreProcess <- function(RawData){
  # Set NA, change the data to date type.
  date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
  for(i in date.columns) # loops through the "date.columns"
  {
    # Find missing values
    which_values_are_missing <- which(as.character(RawData[, i]) == "")
    # Replace them by NAs
    RawData[which_values_are_missing , i] <- NA
    # Turn values into dates
    RawData[, i] <- as.Date(as.character(RawData[, i]))
  }

  tidy_df = RawData %>%
    filter(!is.na(CirculationDate)) %>% # Drop col which has NA value in 'CirculationDate'
    filter(CirculationDate > "2009-01-01") # Drop col which date is before 2009.01.01 in 'CirculationDate'
  return(tidy_df)
}
tidy_df = PreProcess(RawData) # Name the processed dataframe as 'df'
dim(tidy_df) # dimensions of the data set, after Preprocessing
sapply(tidy_df, class) # Check column names and types

# {Step 2} Mutate the data to answer following questions
df = tidy_df %>%
  mutate('Delays'= as.numeric(RevisedCompletionDate - OriginalCompletionDate)) %>% # Delays between Orignial and Revised Completion Date
  mutate('DurationPlan'= as.numeric(RevisedCompletionDate - ApprovalDate)) %>% # Planed project duration
  mutate('DurationActual'= as.numeric(OriginalCompletionDate - ApprovalDate)) # Actual project duration

### Question 1: Project period
### 1.a
summary(df$DurationPlan / 30)
### 1.b

df1 = df %>% filter(!is.na(Delays))
df1 = df1 %>%
  mutate(EarlyCirculation=CirculationDate < median(df1$CirculationDate))
df1 %>% group_by(EarlyCirculation) %>%
  summarize(Mean = mean(Delays, na.rm = TRUE), Median = median(Delays, na.rm = TRUE))


### {Problem 2}
RatingTable <- function(df){
  df %>% group_by(Rating) %>%
    summarise(Count = n(), Percent= n()*100/length(df$Rating))
}


df2 = df %>% filter(RevisedCompletionDate > "2010-01-01")
RatingTable(df2)

### {Problem 3}
df3 = df %>%
  filter(RevisedCompletionDate > "2010-01-01") %>%
  filter(Type == 'PATA')
RatingTable(df3)

### {Problem 4}
N = round(dim(df)[1]/10)
df4_top10 = df %>% arrange(desc(RevisedAmount)) %>% slice(0:N)
df4_bot10 = df %>% arrange(RevisedAmount) %>% slice((n()-N):n())

RatingTable(df4_top10)
RatingTable(df4_bot10)
summary(df4_top10)
	library("dplyr")
	library("ggplot2")

	RawData <- read.csv("https://tinyurl.com/yb4phxx8") # read in the Multilateral Development Institution Data

	# {Step 0} Explore the data
	names(RawData) # dimensions of the data set
	dim(RawData) # quick look at the data structure

	# {Step 1} Preprocess the data
	PreProcess <- function(RawData){
	# Set NA, change the data to date type.
	date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
	for(i in date.columns) # loops through the "date.columns"
	{
	# Find missing values
	which_values_are_missing <- which(as.character(RawData[, i]) == "")
	# Replace them by NAs
	RawData[which_values_are_missing , i] <- NA
	# Turn values into dates
	RawData[, i] <- as.Date(as.character(RawData[, i]))
	}

	tidy_df = RawData %>%
	filter(!is.na(CirculationDate)) %>% # Drop col which has NA value in 'CirculationDate'
	filter(CirculationDate > "2009-01-01") # Drop col which date is before 2009.01.01 in 'CirculationDate'
	return(tidy_df)
	}
	tidy_df = PreProcess(RawData) # Name the processed dataframe as 'df'
	dim(tidy_df) # dimensions of the data set, after Preprocessing
	sapply(tidy_df, class) # Check column names and types

	# {Step 2} Mutate the data to answer following questions
	df = tidy_df %>%
	mutate('Delays'= as.numeric(RevisedCompletionDate - OriginalCompletionDate)) %>% # Delays between Orignial and Revised Completion Date
	mutate('DurationPlan'= as.numeric(RevisedCompletionDate - ApprovalDate)) %>% # Planed project duration
	mutate('DurationActual'= as.numeric(OriginalCompletionDate - ApprovalDate)) # Actual project duration

	### Question 1: Project period
	### 1.a
	summary(df$DurationPlan / 30)
	### 1.b

	df1 = df %>% filter(!is.na(Delays))
	df1 = df1 %>%
	mutate(EarlyCirculation=CirculationDate < median(df1$CirculationDate))
	df1 %>% group_by(EarlyCirculation) %>%
	summarize(Mean = mean(Delays, na.rm = TRUE), Median = median(Delays, na.rm = TRUE))


	### {Problem 2}
	RatingTable <- function(df){
	df %>% group_by(Rating) %>%
	summarise(Count = n(), Percent= n()*100/length(df$Rating))
	}


	df2 = df %>% filter(RevisedCompletionDate > "2010-01-01")
	RatingTable(df2)

	### {Problem 3}
	df3 = df %>%
	filter(RevisedCompletionDate > "2010-01-01") %>%
	filter(Type == 'PATA')
	RatingTable(df3)

	### {Problem 4}
	N = round(dim(df)[1]/10)
	df4_top10 = df %>% arrange(desc(RevisedAmount)) %>% slice(0:N)
	df4_bot10 = df %>% arrange(RevisedAmount) %>% slice((n()-N):n())

	RatingTable(df4_top10)
	RatingTable(df4_bot10)
	summary(df4_top10)