# pplonski/churn_prediction_mljar.R

## churn_prediction_mljar.R
# This is an example of how to use the MLJAR service for automatic machine learning, together with its R wrapper, for churn prediction.
# The example is based on data from https://github.com/WLOGSolutions/telco-customer-churn-in-r-and-h2o/tree/master/data
# Example by Dominik Krzemiński
library(mljar)
library(data.table)

# Load the raw data, then drop the unused date columns, rows containing
# missing values, and exact duplicate rows
all_data <- fread("data/edw_cdr.csv")[, !c("month", "year"), with = FALSE]
all_data <- unique(na.omit(all_data))

# Randomly label every row "Train" (probability 0.7) or "Test" (probability
# 0.3), which gives an approximate 7:3 split
all_data[, ind := factor(
  sample(c("Test", "Train"), size = .N, replace = TRUE, prob = c(0.3, 0.7)),
  levels = c("Test", "Train")
)]

# Recode the target column: 1 becomes "churn", any other value "nochurn"
set(all_data, j = "churn",
    value = factor(ifelse(all_data$churn == 1, "churn", "nochurn")))

# Assign features and labels to separate variables.
# Feature columns are all columns except the target ("churn") and the split
# marker ("ind"). `%in%` is used for the membership test: `==` against a
# length-2 vector is recycled along the column names and only matches when the
# two columns happen to sit at the right odd/even positions, and negating an
# empty `which()` result selects no columns at all.
is_feature <- !(names(all_data) %in% c("churn", "ind"))
train_rows <- as.data.frame(all_data[ind == "Train"])
test_rows <- as.data.frame(all_data[ind == "Test"])

# drop = FALSE keeps the features a data.frame even if one column remains
x_tr <- train_rows[, is_feature, drop = FALSE]
y_tr <- train_rows[["churn"]]
x_ts <- test_rows[, is_feature, drop = FALSE]
y_ts <- test_rows[["churn"]]

# Train the models (automatic machine learning model tuning in MLJAR),
# requesting the listed algorithms and the "auc" metric
project_title <- "Churn"
candidate_algorithms <- c("logreg", "xgb", "lgb", "etc", "rgfc", "knnc")
model <- mljar_fit(
  x_tr, y_tr, NULL, NULL,
  proj_title = project_title,
  exp_title = "firstchurn",
  dataset_title = "churndata",
  algorithms = candidate_algorithms,
  metric = "auc"
)

# Score the test-set features with the fitted model
y_pr <- mljar_predict(model, x_ts, project_title)

# Get threshold
library(pROC)
my_roc <- roc(y_ts, y_pr$prediction)

# coords() returns a numeric vector or a data.frame depending on the pROC
# version, and "best" may yield more than one threshold; reduce the result to
# a single number so the comparison below is well defined.
bestthr <- as.numeric(unlist(coords(my_roc, "best", ret = "threshold")))[1]

# Turn the predicted scores into class labels, stored in a `prediction` column.
# The levels are fixed so the factor has both classes even when every row is
# assigned to the same one.
# NOTE(review): this keeps the original mapping, i.e. scores above the
# threshold are labelled "nochurn" - confirm against the model's output.
y_pr_c <- data.frame(
  prediction = factor(
    ifelse(y_pr$prediction > bestthr, "nochurn", "churn"),
    levels = c("churn", "nochurn")
  )
)

# Compute metrics
# Labels are compared as character vectors so the checks work whether they are
# stored as strings or as factors (whose level sets may differ).
actual <- as.character(y_ts)
predicted <- as.character(y_pr_c$prediction)

# Confusion counts, treating "churn" as the positive class
true_pos <- sum(actual == "churn" & predicted == "churn")
false_pos <- sum(actual == "nochurn" & predicted == "churn")
false_neg <- sum(actual == "churn" & predicted == "nochurn")

precision <- true_pos / (true_pos + false_pos)
recall <- true_pos / (true_pos + false_neg)

sprintf("Test accuracy: %.3f", mean(actual == predicted))
sprintf("Test recall: %.3f", recall)
sprintf("Test precision: %.3f", precision)
sprintf("Test AUC: %.3f", auc(my_roc))
	# This is an example of how to use the MLJAR service for automatic machine learning, together with its R wrapper, for churn prediction.
	# The example is based on data from https://github.com/WLOGSolutions/telco-customer-churn-in-r-and-h2o/tree/master/data
	# Example by Dominik Krzemiński
	library(mljar)
	library(data.table)

	# Load the raw data, then drop the unused date columns, rows containing
	# missing values, and exact duplicate rows
	all_data <- fread("data/edw_cdr.csv")[, !c("month", "year"), with = FALSE]
	all_data <- unique(na.omit(all_data))

	# Randomly label every row "Train" (probability 0.7) or "Test" (probability
	# 0.3), which gives an approximate 7:3 split
	all_data[, ind := factor(
	  sample(c("Test", "Train"), size = .N, replace = TRUE, prob = c(0.3, 0.7)),
	  levels = c("Test", "Train")
	)]

	# Recode the target column: 1 becomes "churn", any other value "nochurn"
	set(all_data, j = "churn",
	    value = factor(ifelse(all_data$churn == 1, "churn", "nochurn")))

	# Assign features and labels to separate variables.
	# Feature columns are all columns except the target ("churn") and the split
	# marker ("ind"). `%in%` is used for the membership test: `==` against a
	# length-2 vector is recycled along the column names and only matches when the
	# two columns happen to sit at the right odd/even positions, and negating an
	# empty `which()` result selects no columns at all.
	is_feature <- !(names(all_data) %in% c("churn", "ind"))
	train_rows <- as.data.frame(all_data[ind == "Train"])
	test_rows <- as.data.frame(all_data[ind == "Test"])

	# drop = FALSE keeps the features a data.frame even if one column remains
	x_tr <- train_rows[, is_feature, drop = FALSE]
	y_tr <- train_rows[["churn"]]
	x_ts <- test_rows[, is_feature, drop = FALSE]
	y_ts <- test_rows[["churn"]]

	# Train the models (automatic machine learning model tuning in MLJAR),
	# requesting the listed algorithms and the "auc" metric
	project_title <- "Churn"
	candidate_algorithms <- c("logreg", "xgb", "lgb", "etc", "rgfc", "knnc")
	model <- mljar_fit(
	  x_tr, y_tr, NULL, NULL,
	  proj_title = project_title,
	  exp_title = "firstchurn",
	  dataset_title = "churndata",
	  algorithms = candidate_algorithms,
	  metric = "auc"
	)

	# Score the test-set features with the fitted model
	y_pr <- mljar_predict(model, x_ts, project_title)

	# Get threshold
	library(pROC)
	my_roc <- roc(y_ts, y_pr$prediction)

	# coords() returns a numeric vector or a data.frame depending on the pROC
	# version, and "best" may yield more than one threshold; reduce the result to
	# a single number so the comparison below is well defined.
	bestthr <- as.numeric(unlist(coords(my_roc, "best", ret = "threshold")))[1]

	# Turn the predicted scores into class labels, stored in a `prediction` column.
	# The levels are fixed so the factor has both classes even when every row is
	# assigned to the same one.
	# NOTE(review): this keeps the original mapping, i.e. scores above the
	# threshold are labelled "nochurn" - confirm against the model's output.
	y_pr_c <- data.frame(
	  prediction = factor(
	    ifelse(y_pr$prediction > bestthr, "nochurn", "churn"),
	    levels = c("churn", "nochurn")
	  )
	)

	# Compute metrics
	# Labels are compared as character vectors so the checks work whether they are
	# stored as strings or as factors (whose level sets may differ).
	actual <- as.character(y_ts)
	predicted <- as.character(y_pr_c$prediction)

	# Confusion counts, treating "churn" as the positive class
	true_pos <- sum(actual == "churn" & predicted == "churn")
	false_pos <- sum(actual == "nochurn" & predicted == "churn")
	false_neg <- sum(actual == "churn" & predicted == "nochurn")

	precision <- true_pos / (true_pos + false_pos)
	recall <- true_pos / (true_pos + false_neg)

	sprintf("Test accuracy: %.3f", mean(actual == predicted))
	sprintf("Test recall: %.3f", recall)
	sprintf("Test precision: %.3f", precision)
	sprintf("Test AUC: %.3f", auc(my_roc))