Skip to content

Instantly share code, notes, and snippets.

@mrecos
Created October 3, 2022 18:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrecos/057926133a8df2b789b5f1cf7ce14ad2 to your computer and use it in GitHub Desktop.
Save mrecos/057926133a8df2b789b5f1cf7ce14ad2 to your computer and use it in GitHub Desktop.
Comparing models and MODEL RANK
library(readr)
library(tidyverse)
library(glue)
library(DataRobotColors)
# Load the holdout prediction export. Base read.csv() is used, so column
# names are made syntactic (e.g. `Cross.Validation.Prediction`), which is the
# spelling the rest of this script refers to.
pred <- read.csv("DATA/Predictions/[MDH]_DRPrimaryV3_10pcnt_SF_20221003_eXtreme_Gradient_Boosted_Trees_Classifier_with_Ear_(57)_79.93_IF_-_MDH_Reduced_v2_holdout.csv")

# Rank each traveler's rows from highest to lowest cross-validation
# prediction: pred_order == 1 is the row with the highest prediction for
# that TRAVELERID.
pred_rank <- pred %>%
  # arrange() ignores grouping, so TRAVELERID is listed explicitly to keep
  # each traveler's rows contiguous and sorted by descending prediction.
  arrange(TRAVELERID, desc(Cross.Validation.Prediction)) %>%
  group_by(TRAVELERID) %>%
  # seq_len(n()) numbers the rows 1..n within each traveler.
  mutate(pred_order = seq_len(n())) %>%
  # Drop the grouping so it does not silently carry over into every object
  # derived from pred_rank further down the script.
  ungroup()
# Booked rows that the prediction-based ordering places within the first 20
# positions for their traveler.
top20_pred <- filter(pred_rank, BOOKED == 1 & pred_order <= 20)

# Booked rows whose MODEL_RANK is greater than 0 and at most 20. Rows with a
# non-positive (or missing) MODEL_RANK are excluded.
top20_MODEL_RANK <- filter(
  pred_rank,
  MODEL_RANK > 0 & MODEL_RANK <= 20 & BOOKED == 1
)
# Distinct travelers in the full prediction set, and among the rows that
# carry a positive MODEL_RANK.
all_trav_pred <- length(unique(pred$TRAVELERID))
all_trav_RANK <- length(unique(filter(pred, MODEL_RANK > 0)$TRAVELERID))

# Number of booked rows captured by each top-20 selection (row counts of the
# filtered frames, despite the `_trav` suffix).
top20_pred_trav <- nrow(top20_pred)
top20_RANK_trav <- nrow(top20_MODEL_RANK)

# Denominator for every percentage below: total booked rows in `pred`.
# na.rm = TRUE keeps one missing BOOKED value from turning all percentages
# into NA; the numerators are built with filter(BOOKED == 1), which already
# drops rows where BOOKED is NA, so both sides now treat NA the same way.
all_booked <- sum(pred$BOOKED, na.rm = TRUE)

# Share of all booked rows found in each top-20 selection, as a percentage
# rounded to two decimals.
pcnt_top_20_pred <- round((top20_pred_trav / all_booked) * 100, 2)
pcnt_top_20_RANK <- round((top20_RANK_trav / all_booked) * 100, 2)
# Count booked rows at each rank position for the prediction-based ordering
# and express each count as a percentage of all booked rows.
# data.frame(table(...)) yields columns Var1 (BOOKED value), Var2 (rank
# position, as a factor) and Freq (count).
p_order_pred <- data.frame(table(top20_pred$BOOKED, top20_pred$pred_order)) %>%
  filter(Var1 == 1) %>%
  rename(Booked = Var1, Pred_order = Var2) %>%
  mutate(pcnt_of_all_booked = (Freq / all_booked) * 100,
         model = "DR Prediction")

# Same tabulation using MODEL_RANK as the rank position.
p_order_RANK <- data.frame(table(top20_MODEL_RANK$BOOKED, top20_MODEL_RANK$MODEL_RANK)) %>%
  filter(Var1 == 1) %>%
  rename(Booked = Var1, Pred_order = Var2) %>%
  mutate(pcnt_of_all_booked = (Freq / all_booked) * 100,
         model = "MODEL RANK")

# Stack both summaries into one long frame keyed by `model`.
pred_compare <- rbind(p_order_pred, p_order_RANK)

# Pred_order is a factor here, and rbind() appends any level that occurs only
# in the second frame after the levels of the first. A rank position missing
# from one model would then sit out of numeric order on a discrete axis.
# Rebuild the factor with its levels sorted numerically.
pred_compare$Pred_order <- factor(
  pred_compare$Pred_order,
  levels = as.character(
    sort(unique(as.numeric(as.character(pred_compare$Pred_order))))
  )
)
# Bar chart, one facet per model: for each rank position, the percentage of
# all booked rows found at that position. The caption reports the overall
# top-20 capture percentage for each model.
ggplot(
  data = pred_compare,
  mapping = aes(x = Pred_order, y = pcnt_of_all_booked, fill = model)
) +
  # geom_col() draws bar heights straight from the y values.
  geom_col() +
  facet_wrap(~model) +
  scale_fill_DataRobot(palette = "DR_Diverging") +
  labs(
    title = "What percent of 'Booked' orders appear in the top 20 predictions",
    subtitle = "Project: [MDH] DRPrimaryV3_10pcnt SF 20221003, Model: 112, Blueprint: 57",
    caption = glue("{pcnt_top_20_pred}% of 'Booked' O/T pairs are in top 20 DataRobot recommended Orders\n{pcnt_top_20_RANK}% of 'Booked' O/T pairs are in top 20 MODEL RANK recommended Orders"),
    x = "Top 20 Recommendations (Predicted)",
    y = "% of All Booked O/T Pairs"
  ) +
  theme_bw() +
  # The facet strips already name the model, so the fill legend is hidden.
  theme(legend.position = "none")
# Compare the two orderings row by row: prediction-based position on x,
# MODEL_RANK on y, drawn as 2D density contours with faint points on top.
# Only rows with a positive MODEL_RANK and no missing values are used. A 25%
# random sample is taken first (no seed is set, so the sample differs between
# runs) and then restricted to positions/ranks of at most 100.
pred_rank %>%
  ungroup() %>%
  filter(MODEL_RANK > 0) %>%
  select(MODEL_RANK, pred_order) %>%
  na.omit() %>%
  sample_frac(0.25) %>%
  filter(pred_order <= 100, MODEL_RANK <= 100) %>%
  ggplot(mapping = aes(x = pred_order, y = MODEL_RANK)) +
  geom_density_2d() +
  # Alternative layer left disabled: geom_density_2d_filled(bins = 30)
  geom_point(alpha = 0.05) +
  # Alternative layer left disabled: geom_smooth()
  # Both axes are reversed and share the same limits, breaks and labels.
  scale_x_reverse(
    limits = c(100, 1),
    breaks = c(1, seq(10, 100, by = 10)),
    labels = c(1, seq(10, 100, by = 10)),
    expand = c(0, 0)
  ) +
  scale_y_reverse(
    limits = c(100, 1),
    breaks = c(1, seq(10, 100, by = 10)),
    labels = c(1, seq(10, 100, by = 10)),
    expand = c(0, 0)
  ) +
  # Equal scaling so one unit of rank spans the same distance on both axes.
  coord_equal() +
  labs(x = "DR Prediction Order", y = "MODEL RANK") +
  theme_bw() +
  theme(legend.position = "none")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment