Skip to content

Instantly share code, notes, and snippets.

@pplonski
Last active September 9, 2017 20:34
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pplonski/073012dd1be0ed3d3039f755bfa0efb8 to your computer and use it in GitHub Desktop.
Save pplonski/073012dd1be0ed3d3039f755bfa0efb8 to your computer and use it in GitHub Desktop.
Churn prediction with MLJAR and R-wrapper
# This is an example of how to use the MLJAR service for automatic machine learning, and its R wrapper, for churn prediction.
# Example is based on data from https://github.com/WLOGSolutions/telco-customer-churn-in-r-and-h2o/tree/master/data
# Example by Dominik Krzemiński
library(mljar)
library(data.table)
# Load the raw table, then (in this order) drop the "month"/"year" columns,
# rows containing missing values, and exact duplicate rows.
all_data <- fread("data/edw_cdr.csv")
all_data <- unique(na.omit(all_data[, !c("month", "year"), with = FALSE]))
# Tag each row as "Train" (probability 0.7) or "Test" (probability 0.3).
# The split is random; no seed is set here, so it differs between runs.
split_labels <- c("Test", "Train")
all_data[, ind := factor(
  sample(split_labels, size = .N, replace = TRUE, prob = c(0.3, 0.7)),
  levels = split_labels
)]
# Recode the target: 1 becomes "churn", any other value becomes "nochurn".
set(
  all_data,
  j = "churn",
  value = factor(ifelse(all_data$churn == 1, "churn", "nochurn"))
)
# Assign features and labels to separate variables.
# Feature columns are everything except the target ("churn") and the
# train/test marker ("ind"). A membership test (setdiff) is used on purpose:
# `names(x) == c("churn", "ind")` would recycle the two names position-wise,
# match the wrong columns, and select zero columns when nothing matches.
feature_cols <- setdiff(names(all_data), c("churn", "ind"))
train_df <- as.data.frame(all_data[ind == "Train"])
test_df <- as.data.frame(all_data[ind == "Test"])
# drop = FALSE keeps x_tr / x_ts as data frames even with a single feature.
x_tr <- train_df[, feature_cols, drop = FALSE]
y_tr <- train_df[["churn"]]
x_ts <- test_df[, feature_cols, drop = FALSE]
y_ts <- test_df[["churn"]]
# Run model training (automatic machine learning model tuning in MLJAR).
# The same project title is passed to both the fit and the predict call.
churn_project <- "Churn"
candidate_algorithms <- c("logreg", "xgb", "lgb", "etc", "rgfc", "knnc")
model <- mljar_fit(
  x_tr, y_tr, NULL, NULL,
  proj_title = churn_project,
  exp_title = "firstchurn",
  dataset_title = "churndata",
  algorithms = candidate_algorithms,
  metric = "auc"
)
# Compute predictions for the held-out test features
y_pr <- mljar_predict(model, x_ts, churn_project)
# Get the decision threshold from the ROC curve and turn scores into labels
library(pROC)
my_roc <- roc(y_ts, y_pr$prediction)
# coords() returns a numeric vector or a data.frame depending on the pROC
# version, and "best" can yield more than one threshold. Flatten the result
# and keep the first value so bestthr is always a single number.
bestthr <- as.numeric(unlist(coords(my_roc, "best", ret = "threshold")))[1]
# Scores above the threshold are labelled "nochurn", the rest "churn".
# NOTE(review): this assumes higher scores mean "nochurn"; confirm against
# my_roc$direction and the label encoding used by the MLJAR service.
y_pr_c <- data.frame(
  prediction = factor(
    ifelse(y_pr$prediction > bestthr, "nochurn", "churn"),
    # Reuse the levels of y_ts so the two factors can be compared with `==`.
    levels = levels(y_ts)
  )
)
# Compute metrics, treating "churn" as the positive class.
# Compare as character vectors so the result does not depend on the two
# factors having identical level sets.
actual <- as.character(y_ts)
predicted <- as.character(y_pr_c$prediction)
true_pos <- sum(actual == "churn" & predicted == "churn")
false_pos <- sum(actual == "nochurn" & predicted == "churn")
false_neg <- sum(actual == "churn" & predicted == "nochurn")
# Either ratio is NaN when its denominator is zero (no predicted / no actual
# "churn" rows).
precision <- true_pos / (true_pos + false_pos)
recall <- true_pos / (true_pos + false_neg)
# print() makes the lines appear under source() as well as at the top level.
print(sprintf("Test accuracy: %.3f", mean(actual == predicted)))
print(sprintf("Test recall: %.3f", recall))
print(sprintf("Test precision: %.3f", precision))
# The closing quote here was a typographic quote in the original, which left
# the string unterminated and made the whole script fail to parse.
print(sprintf("Test AUC: %.3f", as.numeric(auc(my_roc))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment