Last active
September 9, 2017 20:34
-
-
Save pplonski/073012dd1be0ed3d3039f755bfa0efb8 to your computer and use it in GitHub Desktop.
Churn prediction with MLJAR and R-wrapper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This example shows how to use the MLJAR automatic machine learning service
# and its R wrapper for churn prediction.
# The example is based on data from https://github.com/WLOGSolutions/telco-customer-churn-in-r-and-h2o/tree/master/data
# Example by Dominik Krzemiński
library(mljar)
library(data.table)
library(pROC)
# Read and clean the dataset: drop the month/year columns, rows with missing
# values, and exact duplicate rows.
all_data <- fread("data/edw_cdr.csv")
all_data <- all_data[, !c("month", "year"), with = FALSE]
all_data <- all_data[complete.cases(all_data)]
all_data <- all_data[!duplicated(all_data)]

# Randomly assign each row to the training or test set (roughly 7:3).
# The assignment is re-drawn on every run; call set.seed() beforehand if a
# reproducible split is required.
all_data[, ind := factor(
  sample(0:1, size = .N, replace = TRUE, prob = c(0.3, 0.7)),
  levels = 0:1,
  labels = c("Test", "Train")
)]
# Recode the target: 1 becomes "churn", anything else "nochurn".
all_data[, churn := factor(ifelse(churn == 1, "churn", "nochurn"))]

# Assign features and labels to separate variables.
# Feature columns are picked with a logical %in% mask. Comparing names() to a
# length-2 vector with == recycles that vector element-wise, so it can miss
# one or both columns, and a negative which() on an empty match selects no
# columns at all.
is_feature <- !(names(all_data) %in% c("churn", "ind"))
train_df <- as.data.frame(all_data[ind == "Train"])
test_df <- as.data.frame(all_data[ind == "Test"])
x_tr <- train_df[, is_feature, drop = FALSE]
y_tr <- train_df[["churn"]]
x_ts <- test_df[, is_feature, drop = FALSE]
y_ts <- test_df[["churn"]]
# Train the models (automatic machine learning model tuning in MLJAR).
# NOTE(review): the two positional NULLs are presumably the optional
# validation features/labels -- confirm against the mljar documentation.
model <- mljar_fit(
  x_tr,
  y_tr,
  NULL,
  NULL,
  proj_title = "Churn",
  exp_title = "firstchurn",
  dataset_title = "churndata",
  algorithms = c("logreg", "xgb", "lgb", "etc", "rgfc", "knnc"),
  metric = "auc"
)

# Compute predictions for the held-out test features.
y_pr <- mljar_predict(model, x_ts, "Churn")
# Get the classification threshold from the ROC curve.
# roc(), coords() and auc() come from pROC, loaded at the top of the script.
my_roc <- roc(y_ts, y_pr$prediction)
# Depending on the pROC version, coords() returns a numeric vector or a data
# frame, and "best" may yield several tied thresholds. Reduce the result to a
# single number so the comparison below is well defined.
bestthr <- as.numeric(unlist(coords(my_roc, "best", ret = "threshold")))[1]

# Label the predictions in one vectorised step: scores above the threshold
# are "nochurn", all others "churn".
# NOTE(review): this assumes the score increases with the "nochurn" class --
# confirm against the direction reported by roc().
predicted <- ifelse(y_pr$prediction > bestthr, "nochurn", "churn")
y_pr_c <- data.frame(
  prediction = factor(predicted, levels = c("churn", "nochurn"))
)

# Compute metrics, with "churn" as the class of interest.
# Compare as character vectors so that differing factor level sets cannot
# break the == comparisons.
actual <- as.character(y_ts)
true_pos <- sum(actual == "churn" & predicted == "churn")
false_pos <- sum(actual == "nochurn" & predicted == "churn")
false_neg <- sum(actual == "churn" & predicted == "nochurn")
precision <- true_pos / (true_pos + false_pos)
recall <- true_pos / (true_pos + false_neg)

sprintf("Test accuracy: %.3f", mean(actual == predicted))
sprintf("Test recall: %.3f", recall)
sprintf("Test precision: %.3f", precision)
sprintf("Test AUC: %.3f", as.numeric(auc(my_roc)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment