Skip to content

Instantly share code, notes, and snippets.

@Bojne
Created September 21, 2019 13:55
Show Gist options
  • Save Bojne/63bc2d3a078a4d3b9f9652c2bda8b084 to your computer and use it in GitHub Desktop.
Save Bojne/63bc2d3a078a4d3b9f9652c2bda8b084 to your computer and use it in GitHub Desktop.
library("dplyr")
library("ggplot2")
RawData <- read.csv("https://tinyurl.com/yb4phxx8") # read in the Multilateral Development Institution Data
# {Step 0} Explore the data
names(RawData) # dimensions of the data set
dim(RawData) # quick look at the data structure
# {Step 1} Preprocess the data
PreProcess <- function(RawData){
# Set NA, change the data to date type.
date.columns <- c(11, 12, 14, 15, 16, 17, 18, 25)
for(i in date.columns) # loops through the "date.columns"
{
# Find missing values
which_values_are_missing <- which(as.character(RawData[, i]) == "")
# Replace them by NAs
RawData[which_values_are_missing , i] <- NA
# Turn values into dates
RawData[, i] <- as.Date(as.character(RawData[, i]))
}
tidy_df = RawData %>%
filter(!is.na(CirculationDate)) %>% # Drop col which has NA value in 'CirculationDate'
filter(CirculationDate > "2009-01-01") # Drop col which date is before 2009.01.01 in 'CirculationDate'
return(tidy_df)
}
tidy_df = PreProcess(RawData) # Name the processed dataframe as 'df'
dim(tidy_df) # dimensions of the data set, after Preprocessing
sapply(tidy_df, class) # Check column names and types
# {Step 2} Mutate the data to answer following questions
df = tidy_df %>%
mutate('Delays'= as.numeric(RevisedCompletionDate - OriginalCompletionDate)) %>% # Delays between Orignial and Revised Completion Date
mutate('DurationPlan'= as.numeric(RevisedCompletionDate - ApprovalDate)) %>% # Planed project duration
mutate('DurationActual'= as.numeric(OriginalCompletionDate - ApprovalDate)) # Actual project duration
### Question 1: Project period
### 1.a
summary(df$DurationPlan / 30)
### 1.b
df1 = df %>% filter(!is.na(Delays))
df1 = df1 %>%
mutate(EarlyCirculation=CirculationDate < median(df1$CirculationDate))
df1 %>% group_by(EarlyCirculation) %>%
summarize(Mean = mean(Delays, na.rm = TRUE), Median = median(Delays, na.rm = TRUE))
### {Problem 2}
RatingTable <- function(df){
df %>% group_by(Rating) %>%
summarise(Count = n(), Percent= n()*100/length(df$Rating))
}
df2 = df %>% filter(RevisedCompletionDate > "2010-01-01")
RatingTable(df2)
### {Problem 3}
df3 = df %>%
filter(RevisedCompletionDate > "2010-01-01") %>%
filter(Type == 'PATA')
RatingTable(df3)
### {Problem 4}
N = round(dim(df)[1]/10)
df4_top10 = df %>% arrange(desc(RevisedAmount)) %>% slice(0:N)
df4_bot10 = df %>% arrange(RevisedAmount) %>% slice((n()-N):n())
RatingTable(df4_top10)
RatingTable(df4_bot10)
summary(df4_top10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment