Skip to content

Instantly share code, notes, and snippets.

@abmathewks
Created September 24, 2020 00:45
Show Gist options
  • Save abmathewks/e68a6371b6f15ff976c61c293349902d to your computer and use it in GitHub Desktop.
Save abmathewks/e68a6371b6f15ff976c61c293349902d to your computer and use it in GitHub Desktop.
### EXECUTE ANALYSIS WITH AVERAGES AND IDEALIZED SLOTS
### PREDICT FOR 2020
#################################################################################################
### SET PARAMETERS
data_path = "K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/data/NP_Supply_Data_With_2020.csv"
use_these_packages <- c("dplyr", "ggplot2", "data.table", "lubridate", "stringr", "boot")
how_far_back_weeks = 5
plot_results = FALSE
save_output = TRUE
#################################################################################################
### PRELIMINARIES
options(scipen = 999)
library(logger)
log_threshold(TRACE)
log_info("Script initialized....")
#################################################################################################
### IMPORT PACKAGES
new_packages <- use_these_packages[!(use_these_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) {
install.packages(new_packages)
sapply(use_these_packages, require, character.only = TRUE)
} else {
sapply(use_these_packages, require, character.only = TRUE)
}
#################################################################################################
### SET DATE PARAMETERS
curr_year = year(Sys.Date())
months_active = month(Sys.Date())-1
#################################################################################################
### IMPORT RAW DATA
log_info("Loading data")
dat_capacity = fread(data_path)
log_info("The dataset includes {nrow(dat_capacity)} rows")
log_info("The dataset includes data on {length(unique(dat_capacity$StaffResourceID))} staff members")
log_info("The dataset includes data from {min(as.Date(dat_capacity$WorkDay))} to {max(as.Date(dat_capacity$WorkDay))}")
#dat_capacity[1:4]
#dim(dat_capacity)
#str(dat_capacity)
#length(unique(dat_capacity$StaffResourceID))
#length(unique(dat_capacity$ZoneName))
#colSums(is.na(dat_capacity))
#################################################################################################
### MODIFY VARIABLES
log_info("Cleaning data for analysis")
dat_capacity[, WorkDay_date := ymd(WorkDay)]
dat_capacity[, Slots_num := suppressWarnings( as.numeric(Slots) )]
dat_capacity[, UsedSlots_num := suppressWarnings( as.numeric(UsedSlots) )]
dat_capacity[, slot_utilization := UsedSlots_num/Slots_num]
dat_capacity[, WorkDay_day := weekdays.Date(WorkDay_date)]
dat_capacity[, WorkDay_week := floor_date(WorkDay_date, "weeks")]
#################################################################################################
### ORDER DAYA BY NP AND DATE
dat_capacity = dat_capacity[order(StaffResourceID, ZoneName, WorkDay_date)]
#################################################################################################
### FILTER NP'S WITH NO RESOURCE ID
dat_capacity = dat_capacity[!StaffResourceID == "NULL",]
dat_capacity = dat_capacity[!ZoneName == "NULL",]
#dat_capacity = dat_capacity[any(JobTitle %like% "NP"),]
dat_capacity = dat_capacity[grepl('NP', JobTitle),]
#dat_capacity[, .N, by = JobTitle][, JobTitle]
#################################################################################################
### FILTER FULL TIME NP's
dat_capacity = dat_capacity[NPType == "FT",]
#################################################################################################
### CREATE DICTIONARY FOR EACH NP
log_info("Create NP dictionary")
np_dict = dat_capacity[NPType =="FT" &
year(WorkDay_date) %in% c(curr_year),
.N,
keyby = .(StaffResourceID, ZoneName)]
np_dict = np_dict[order(StaffResourceID, -N)][
, .(StaffResourceID, ZoneName, N)]
np_dict[, row_id := .I] # Adding ID column
#np_dict
#################################################################################################
### CREATE DICTIONARY FOR MONTHS
log_info("Create monthly dictionary")
date_dict = data.table(WorkDay = seq(ymd(floor_date(Sys.Date(), "year")) + months(1),
ymd(paste0(curr_year, "12", "31", sep="-")), by = "1 weeks"))
#date_dict
#################################################################################################
### CREATE LISTS TO STORE OUTPUT FROM ANALYSIS
log_info("Create data frames to store output from analysis")
summary_output <- data.table()
np_count_output <- data.table()
#################################################################################################
### FUNCTION FOR ERROR BANDS
my.mean = function(x, indices) {
return( mean( x[indices] ) )
}
#################################################################################################
### LOOP THROUGH NP DICT AND WEEK DICT
log_info("Start looping through each NP and date")
#each_np = 2
for(each_np in 1:nrow(np_dict)){
curr_row_dict = np_dict[each_np,]
#curr_row_dict
log_info("Executing analysis for {curr_row_dict$row_id}")
filter_criteria_1 = paste0(colnames(curr_row_dict)[1], " == ", curr_row_dict[, get(colnames(curr_row_dict)[1])] )
filter_criteria_2 = paste0(colnames(curr_row_dict)[2], " == '", curr_row_dict[, get(colnames(curr_row_dict)[2])], "' " )
filter_criteria = paste0(filter_criteria_1, " & ", filter_criteria_2)
log_info("Executing analysis for {filter_criteria}")
#filter_criteria = paste0(filter_criteria_1)
#dat_capacity %>% filter_(filter_criteria)
dat_capacity2 = dat_capacity[eval(parse(text=filter_criteria)),]
#dat_capacity2
#months_active = 12
if( !length(unique(month(dat_capacity2$WorkDay_date))) %in% c(months_active) ) {
log_info("NP was not active for every month to date in the current year.")
next
}
log_info("Filtering resulted in the following number of rows: {nrow(dat_capacity2)}")
#each_date = 14
for(each_date in 1:nrow(date_dict)){
#print(week_dict[each_date,])
curr_date_dict = ymd( date_dict[each_date, WorkDay] )
#curr_date_dict
if(!curr_date_dict <= Sys.Date()) next
log_info("Executing date: {curr_date_dict}")
if( nrow(dat_capacity2) >= 1){
service_location_name = dat_capacity2[.N, Service_location]
summary_output_estimates.tmp = dat_capacity2[WorkDay_date %in% seq(curr_date_dict - weeks(how_far_back_weeks), curr_date_dict, by = "days"),
.(Slots_mean = mean(Slots_num, na.rm = TRUE)),
by = .(StaffResourceID, ZoneName)][
, each_date := curr_date_dict][
, Slot_estimate_yearly := Slots_mean * (52*5)
][, how_far_back_weeks_val := how_far_back_weeks][
, Service_location_name := service_location_name
]
#summary_output_estimates.tmp[]
eb_dat = dat_capacity2[WorkDay_date %in% seq(curr_date_dict - weeks(how_far_back_weeks), curr_date_dict, by = "days"),]
#eb_dat
if(length(unique(eb_dat$Slots_num)) >= 3 & nrow(eb_dat) >= 5){
time.boot = boot(eb_dat[,Slots_num], my.mean, 10000)
summary_output_estimates.tmp[, Slot_estimate_yearly_lo := boot.ci(time.boot, conf = 0.8)$normal[2] * (52*5)]
summary_output_estimates.tmp[, Slot_estimate_yearly_hi := boot.ci(time.boot, conf = 0.8)$normal[3] * (52*5)]
} else {
#summary_output_estimates.tmp[, Slot_estimate_yearly_lo := NA]
#summary_output_estimates.tmp[, Slot_estimate_yearly_hi := NA]
summary_output_estimates.tmp[, Slot_estimate_yearly_lo := (Slot_estimate_yearly*0.80)]
summary_output_estimates.tmp[, Slot_estimate_yearly_hi := (Slot_estimate_yearly*1.20)]
}
summary_output_actuals.tmp = dat_capacity2[year(WorkDay_date) %in% year(curr_date_dict),
.(Slots_actuals_year = sum(Slots_num, na.rm = TRUE)),
by = .(StaffResourceID, ZoneName)]
#summary_output_actuals.tmp[]
summary_output.tmp = merge(summary_output_estimates.tmp,
summary_output_actuals.tmp,
by = c("StaffResourceID","ZoneName"))
summary_output = rbind(summary_output, summary_output.tmp)
log_info(' Loop iteration completed ')
} else {
log_info("Loop iteration skipped")
next
}
}
}
log_info("Final output from loop has {nrow(summary_output)} rows")
log_info("The final yearly estimates range from {min(summary_output$Slot_estimate_yearly)} and {max(summary_output$Slot_estimate_yearly)}")
#summary_output[1:2]
#dim(summary_output)
#################################################################################################
### SUMMARIZE OUTPUT
log_info(' Collecting data for actuals by date ')
#each_date = 3
for(each_date in 1:nrow(date_dict)){
#print(week_dict[each_date,])
curr_date_dict = ymd( date_dict[each_date, WorkDay] )
#curr_date_dict
if(!curr_date_dict <= Sys.Date()) next
if( nrow(dat_capacity2) >= 1){
np_count.tmp = dat_capacity[WorkDay_date %in% seq(curr_date_dict - weeks(how_far_back_weeks), curr_date_dict, by = "days"),
.(NP_Unique_window = uniqueN(StaffResourceID),
Slots_Actual_window = sum(Slots_num)),
by = ZoneName][, each_date := curr_date_dict]
#np_count.tmp[]
np_count_actual.tmp = dat_capacity[year(WorkDay_date) == year(curr_date_dict),
.(NP_Unique_total = uniqueN(StaffResourceID),
Slots_Actual_total = sum(Slots_num)),
by = ZoneName][, each_date := curr_date_dict]
#np_count_actual.tmp[]
np_count.tmp = merge(np_count.tmp,
np_count_actual.tmp,
by = c("ZoneName","each_date"))
np_count_output = rbind(np_count_output, np_count.tmp)
}
}
log_info("Final output from loop has {nrow(np_count_output)} rows")
log_info("Finished collecting data for actuals by date")
#np_count_output
###################
log_info("Merge all data together")
final_output = merge(
summary_output[, .(sum_slots_estimate_yearly = sum(Slot_estimate_yearly, na.rm = TRUE),
sum_slots_estimate_yearly_lo = sum(Slot_estimate_yearly_lo, na.rm = TRUE),
sum_slots_estimate_yearly_hi = sum(Slot_estimate_yearly_hi, na.rm = TRUE)),
by = .(each_date, ZoneName)][order(each_date)],
np_count_output,
by = c("each_date", "ZoneName"),
all.x = TRUE
)
#final_output[1:2]
#dim(final_output)
final_output[, abs_perc_diff := abs((sum_slots_estimate_yearly - Slots_Actual_total)/Slots_Actual_total)]
final_output = final_output[, .SD[which.max(as.Date(each_date))], by = ZoneName]
#final_output[1:2]
#dim(final_output)
#summary(final_output$sum_slots_estimate_yearly_lo)
#summary(final_output$sum_slots_estimate_yearly_hi)
#summary(final_output$abs_perc_diff)
#summary(final_output$sum_slots_estimate_yearly)
#################################################################################################
### VISUALIZE OUTPUT
#final_output
#final_output[,ZoneName]
# final_output[ZoneName %in% c("PA_Brookville",
# "PA_Clymer",
# "PA_PittsburghDurkin_FT",
# "PA_Phoenixville",
# "PA_Youngsville",
# "PA_PittsburghOpalach_FT")] %>%
# ggplot(., aes(x = each_date, y = sum_slots_estimate_yearly)) +
# geom_line(lwd = 2) +
# geom_line(aes(y = NP_Unique_window), lwd = 1.5, color = "red", linetype = "dashed") +
# geom_line(aes(y = NP_Unique_total), lwd = 1.5, color = "yellow", linetype = "dashed") +
# geom_line(aes(y = Slots_Actual_window), lwd = 1.5, color = "blue") +
# geom_line(aes(y = Slots_Actual_total), lwd = 1.5, color = "green") +
# ggtitle("2020 Estimates") +
# facet_wrap(~ZoneName)
#################################################################################################
### EXPORT RESULTS
if(save_output){
log_info("Export final results")
fwrite(final_output,
paste0("K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/output/",
"Supply_Estimates_Using_Averages_by_GeoZone_",
str_replace_all(Sys.Date(), "[^[:alnum:]]", ""),
".csv", sep=""))
}
log_info("Script completed...")
#################################################################################################
#################################################################################################
#################################################################################################
#final_output[1:2]
#final_output2 = final_output[, .(ZoneName, sum_slots_estimate_yearly)]
#fwrite(final_output2, "K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/output/Supply_Estimates_Using_Averages_by_GeoZone.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment