Created
September 24, 2020 00:45
-
-
Save abmathewks/e68a6371b6f15ff976c61c293349902d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### EXECUTE ANALYSIS WITH AVERAGES AND IDEALIZED SLOTS
### PREDICT FOR 2020
#################################################################################################
### SET PARAMETERS
# Input extract: one row per NP per work day (loaded with fread() below).
data_path <- "K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/data/NP_Supply_Data_With_2020.csv"
use_these_packages <- c("dplyr", "ggplot2", "data.table", "lubridate", "stringr", "boot")
how_far_back_weeks <- 5  # size of the trailing averaging window, in weeks
plot_results <- FALSE    # NOTE(review): never read -- the plotting section below is fully commented out
save_output <- TRUE      # when TRUE, final_output is written to CSV at the end
#################################################################################################
### PRELIMINARIES
# Keep large slot counts printed in full rather than scientific notation.
options(scipen = 999)
library(logger)
# TRACE threshold: emit every log_* call in this script.
log_threshold(TRACE)
log_info("Script initialized....")
#################################################################################################
### IMPORT PACKAGES
# Install whatever is missing, then attach every required package exactly once.
# (The original duplicated an identical sapply(require, ...) call in both
# branches of the if/else; attaching is independent of whether anything had to
# be installed, so it now happens unconditionally after the install step.)
new_packages <- use_these_packages[!(use_these_packages %in% installed.packages()[, "Package"])]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}
# vapply() pins the result to one logical per package (sapply() does not),
# and a failed attach now stops the script instead of failing silently later.
loaded_ok <- vapply(use_these_packages, require, logical(1), character.only = TRUE)
if (!all(loaded_ok)) {
  stop("Failed to attach package(s): ",
       paste(use_these_packages[!loaded_ok], collapse = ", "),
       call. = FALSE)
}
#################################################################################################
### SET DATE PARAMETERS
curr_year <- year(Sys.Date())
# Number of fully completed months so far this calendar year.
# NOTE(review): in January this evaluates to 0, which makes the
# "active every month to date" check in the main loop skip every NP --
# confirm that is the intended behavior for year-start runs.
months_active <- month(Sys.Date()) - 1
#################################################################################################
### IMPORT RAW DATA
log_info("Loading data")
dat_capacity <- fread(data_path)
log_info("The dataset includes {nrow(dat_capacity)} rows")
log_info("The dataset includes data on {length(unique(dat_capacity$StaffResourceID))} staff members")
log_info("The dataset includes data from {min(as.Date(dat_capacity$WorkDay))} to {max(as.Date(dat_capacity$WorkDay))}")
# Interactive sanity checks, kept for reference:
#dat_capacity[1:4]
#dim(dat_capacity)
#str(dat_capacity)
#length(unique(dat_capacity$StaffResourceID))
#length(unique(dat_capacity$ZoneName))
#colSums(is.na(dat_capacity))
#################################################################################################
### MODIFY VARIABLES
log_info("Cleaning data for analysis")
# Parse the work-day string into a Date, and coerce the slot columns to
# numeric. Non-numeric entries become NA; the coercion warnings are expected,
# hence suppressWarnings().
dat_capacity[, WorkDay_date := ymd(WorkDay)]
dat_capacity[, Slots_num := suppressWarnings(as.numeric(Slots))]
dat_capacity[, UsedSlots_num := suppressWarnings(as.numeric(UsedSlots))]
# Fraction of available slots actually used on that day.
dat_capacity[, slot_utilization := UsedSlots_num / Slots_num]
# Day-of-week label plus the week (via floor_date) each work day falls in.
dat_capacity[, WorkDay_day := weekdays.Date(WorkDay_date)]
dat_capacity[, WorkDay_week := floor_date(WorkDay_date, "weeks")]
#################################################################################################
### ORDER DATA BY NP AND DATE
dat_capacity <- dat_capacity[order(StaffResourceID, ZoneName, WorkDay_date)]
#################################################################################################
### FILTER NP'S WITH NO RESOURCE ID
# Drop rows whose resource ID or zone is the literal string "NULL", then keep
# only job titles that mention "NP".
dat_capacity <- dat_capacity[StaffResourceID != "NULL", ]
dat_capacity <- dat_capacity[ZoneName != "NULL", ]
#dat_capacity = dat_capacity[any(JobTitle %like% "NP"),]
dat_capacity <- dat_capacity[grepl("NP", JobTitle), ]
#dat_capacity[, .N, by = JobTitle][, JobTitle]
#################################################################################################
### FILTER FULL TIME NP's
dat_capacity <- dat_capacity[NPType == "FT", ]
#################################################################################################
### CREATE DICTIONARY FOR EACH NP
log_info("Create NP dictionary")
# One row per (NP, zone) pair seen this year, with its record count N.
np_dict <- dat_capacity[NPType == "FT" & year(WorkDay_date) %in% c(curr_year),
                        .N,
                        keyby = .(StaffResourceID, ZoneName)]
# Order by NP, most frequent zone first, then tag each row with a running ID.
np_dict <- np_dict[order(StaffResourceID, -N)][, .(StaffResourceID, ZoneName, N)]
np_dict[, row_id := .I]
#np_dict
#################################################################################################
### CREATE DICTIONARY FOR MONTHS
log_info("Create monthly dictionary")
# Weekly scoring dates from one month after Jan 1 of the current year through
# Dec 31. BUG FIX: the end date was built with paste0(curr_year, "12", "31",
# sep = "-") -- paste0() has no `sep` argument, so "-" was concatenated onto
# the string ("20201231-") and ymd() only parsed it by luck. make_date()
# constructs the date explicitly. floor_date() already returns a Date, so the
# redundant ymd() wrapper around it is dropped as well.
date_dict <- data.table(
  WorkDay = seq(floor_date(Sys.Date(), "year") + months(1),
                make_date(curr_year, 12, 31),
                by = "1 weeks")
)
#date_dict
#################################################################################################
### CREATE LISTS TO STORE OUTPUT FROM ANALYSIS
log_info("Create data frames to store output from analysis")
# Empty accumulators; the loops below rbind() their per-iteration results in.
summary_output <- data.table()
np_count_output <- data.table()
################################################################################################# | |
### FUNCTION FOR ERROR BANDS | |
my.mean = function(x, indices) { | |
return( mean( x[indices] ) ) | |
} | |
#################################################################################################
### LOOP THROUGH NP DICT AND WEEK DICT
log_info("Start looping through each NP and date")
for (each_np in seq_len(nrow(np_dict))) {
  curr_row_dict <- np_dict[each_np, ]
  log_info("Executing analysis for {curr_row_dict$row_id}")
  # Filter on the dictionary values directly instead of assembling a filter
  # string and running eval(parse(text = ...)) -- same rows, no string eval.
  log_info("Executing analysis for StaffResourceID == {curr_row_dict$StaffResourceID} & ZoneName == '{curr_row_dict$ZoneName}'")
  dat_capacity2 <- dat_capacity[StaffResourceID == curr_row_dict$StaffResourceID &
                                  ZoneName == curr_row_dict$ZoneName, ]
  # Require activity in every month-to-date of the current year.
  if (!(length(unique(month(dat_capacity2$WorkDay_date))) %in% c(months_active))) {
    log_info("NP was not active for every month to date in the current year.")
    next
  }
  log_info("Filtering resulted in the following number of rows: {nrow(dat_capacity2)}")
  for (each_date in seq_len(nrow(date_dict))) {
    curr_date_dict <- ymd(date_dict[each_date, WorkDay])
    if (curr_date_dict > Sys.Date()) next  # only score dates up to today
    log_info("Executing date: {curr_date_dict}")
    if (nrow(dat_capacity2) >= 1) {
      service_location_name <- dat_capacity2[.N, Service_location]
      # Trailing window of calendar days ending at the scoring date; its mean
      # daily slot count is annualized to 52 weeks x 5 working days.
      window_days <- seq(curr_date_dict - weeks(how_far_back_weeks), curr_date_dict, by = "days")
      summary_output_estimates.tmp <- dat_capacity2[WorkDay_date %in% window_days,
                                                    .(Slots_mean = mean(Slots_num, na.rm = TRUE)),
                                                    by = .(StaffResourceID, ZoneName)][
        , each_date := curr_date_dict][
        , Slot_estimate_yearly := Slots_mean * (52 * 5)][
        , how_far_back_weeks_val := how_far_back_weeks][
        , Service_location_name := service_location_name]
      # Error bands: bootstrap an 80% normal CI for the window mean when there
      # is enough variation; otherwise fall back to a flat +/- 20% band.
      eb_dat <- dat_capacity2[WorkDay_date %in% window_days, ]
      if (length(unique(eb_dat$Slots_num)) >= 3 && nrow(eb_dat) >= 5) {
        time.boot <- boot(eb_dat[, Slots_num], my.mean, 10000)
        summary_output_estimates.tmp[, Slot_estimate_yearly_lo := boot.ci(time.boot, conf = 0.8)$normal[2] * (52 * 5)]
        summary_output_estimates.tmp[, Slot_estimate_yearly_hi := boot.ci(time.boot, conf = 0.8)$normal[3] * (52 * 5)]
      } else {
        summary_output_estimates.tmp[, Slot_estimate_yearly_lo := (Slot_estimate_yearly * 0.80)]
        summary_output_estimates.tmp[, Slot_estimate_yearly_hi := (Slot_estimate_yearly * 1.20)]
      }
      # Actual slots delivered in the calendar year of the scoring date.
      summary_output_actuals.tmp <- dat_capacity2[year(WorkDay_date) %in% year(curr_date_dict),
                                                  .(Slots_actuals_year = sum(Slots_num, na.rm = TRUE)),
                                                  by = .(StaffResourceID, ZoneName)]
      summary_output.tmp <- merge(summary_output_estimates.tmp,
                                  summary_output_actuals.tmp,
                                  by = c("StaffResourceID", "ZoneName"))
      # NOTE: rbind-in-a-loop is O(n^2); fine at this scale, but collect into a
      # list and rbindlist() once if the dictionaries grow large.
      summary_output <- rbind(summary_output, summary_output.tmp)
      log_info(' Loop iteration completed ')
    } else {
      log_info("Loop iteration skipped")
      next
    }
  }
}
log_info("Final output from loop has {nrow(summary_output)} rows")
log_info("The final yearly estimates range from {min(summary_output$Slot_estimate_yearly)} and {max(summary_output$Slot_estimate_yearly)}")
#summary_output[1:2]
#dim(summary_output)
#################################################################################################
### SUMMARIZE OUTPUT
log_info(' Collecting data for actuals by date ')
for (each_date in seq_len(nrow(date_dict))) {
  curr_date_dict <- ymd(date_dict[each_date, WorkDay])
  if (curr_date_dict > Sys.Date()) next  # only score dates up to today
  # BUG FIX: the guard previously tested nrow(dat_capacity2) -- a stale
  # leftover from the last iteration of the NP loop above -- but this loop
  # aggregates over the full dat_capacity table, so that is what must be
  # non-empty here.
  if (nrow(dat_capacity) >= 1) {
    # Unique NPs and actual slots inside the trailing window...
    np_count.tmp <- dat_capacity[WorkDay_date %in% seq(curr_date_dict - weeks(how_far_back_weeks), curr_date_dict, by = "days"),
                                 .(NP_Unique_window = uniqueN(StaffResourceID),
                                   Slots_Actual_window = sum(Slots_num)),
                                 by = ZoneName][, each_date := curr_date_dict]
    # ...and over the whole calendar year of the scoring date.
    np_count_actual.tmp <- dat_capacity[year(WorkDay_date) == year(curr_date_dict),
                                        .(NP_Unique_total = uniqueN(StaffResourceID),
                                          Slots_Actual_total = sum(Slots_num)),
                                        by = ZoneName][, each_date := curr_date_dict]
    np_count.tmp <- merge(np_count.tmp,
                          np_count_actual.tmp,
                          by = c("ZoneName", "each_date"))
    np_count_output <- rbind(np_count_output, np_count.tmp)
  }
}
log_info("Final output from loop has {nrow(np_count_output)} rows")
log_info("Finished collecting data for actuals by date")
#np_count_output
###################
log_info("Merge all data together")
# Zone-level totals of the per-NP yearly estimates, one row per scoring date.
zone_estimates <- summary_output[, .(sum_slots_estimate_yearly = sum(Slot_estimate_yearly, na.rm = TRUE),
                                     sum_slots_estimate_yearly_lo = sum(Slot_estimate_yearly_lo, na.rm = TRUE),
                                     sum_slots_estimate_yearly_hi = sum(Slot_estimate_yearly_hi, na.rm = TRUE)),
                                 by = .(each_date, ZoneName)][order(each_date)]
final_output <- merge(zone_estimates,
                      np_count_output,
                      by = c("each_date", "ZoneName"),
                      all.x = TRUE)
#final_output[1:2]
#dim(final_output)
# Absolute percentage gap between the estimate and the actual yearly total.
final_output[, abs_perc_diff := abs((sum_slots_estimate_yearly - Slots_Actual_total) / Slots_Actual_total)]
# Keep only the most recent scoring date within each zone.
final_output <- final_output[, .SD[which.max(as.Date(each_date))], by = ZoneName]
#final_output[1:2]
#dim(final_output)
#summary(final_output$sum_slots_estimate_yearly_lo)
#summary(final_output$sum_slots_estimate_yearly_hi)
#summary(final_output$abs_perc_diff)
#summary(final_output$sum_slots_estimate_yearly)
################################################################################################# | |
### VISUALIZE OUTPUT | |
#final_output | |
#final_output[,ZoneName] | |
# final_output[ZoneName %in% c("PA_Brookville", | |
# "PA_Clymer", | |
# "PA_PittsburghDurkin_FT", | |
# "PA_Phoenixville", | |
# "PA_Youngsville", | |
# "PA_PittsburghOpalach_FT")] %>% | |
# ggplot(., aes(x = each_date, y = sum_slots_estimate_yearly)) + | |
# geom_line(lwd = 2) + | |
# geom_line(aes(y = NP_Unique_window), lwd = 1.5, color = "red", linetype = "dashed") + | |
# geom_line(aes(y = NP_Unique_total), lwd = 1.5, color = "yellow", linetype = "dashed") + | |
# geom_line(aes(y = Slots_Actual_window), lwd = 1.5, color = "blue") + | |
# geom_line(aes(y = Slots_Actual_total), lwd = 1.5, color = "green") + | |
# ggtitle("2020 Estimates") + | |
# facet_wrap(~ZoneName) | |
#################################################################################################
### EXPORT RESULTS
if (save_output) {
  log_info("Export final results")
  # BUG FIX: paste0() has no `sep` argument -- the original's sep = "" was
  # silently concatenated into the path as an empty string. Also,
  # format(Sys.Date(), "%Y%m%d") yields the same compact date stamp that
  # str_replace_all(Sys.Date(), "[^[:alnum:]]", "") produced, without the
  # stringr round-trip.
  out_file <- paste0("K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/output/",
                     "Supply_Estimates_Using_Averages_by_GeoZone_",
                     format(Sys.Date(), "%Y%m%d"),
                     ".csv")
  fwrite(final_output, out_file)
}
log_info("Script completed...")
################################################################################################# | |
################################################################################################# | |
################################################################################################# | |
#final_output[1:2] | |
#final_output2 = final_output[, .(ZoneName, sum_slots_estimate_yearly)] | |
#fwrite(final_output2, "K:/Data Excellence/4 - Reducing Direct Clinical Costs/ATM/NP_Supply/NP_Supply_Estimation_V3/output/Supply_Estimates_Using_Averages_by_GeoZone.csv") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment