# This gist applies the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected hold-out
# sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and neural networks.
# The upside, of course, is that it estimates and generates predictions in a couple of seconds.
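# In outline: discretize every feature into deciles, one-hot encode the deciles, estimate smoothed
# per-class Bernoulli probabilities for each dummy, then score observations by their summed
# log-likelihoods--a naive-Bayes-style classifier implemented as a single matrix multiplication.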
library(tidyverse); library(reticulate); library(pROC)
# Download the data from Kaggle (requires the kaggle CLI; --unzip extracts creditcard.csv)
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff --unzip")
# Read it -- there's a row in there that's got a float in an integer position
credit_cards <- read_csv("~/Documents/kagglestuff/creditcard.csv")
credit_cards <- credit_cards %>% filter(complete.cases(.))
labels <- credit_cards$Class
# We're going to discretize each column
N <- 10
# Some functions we'll need
make_percentiles <- function(x) ordered(ntile(x, n = N)) # bucket a column into N quantile bins
softmax <- function(x) exp(x)/sum(exp(x)) # turn a vector of log likelihoods into probabilities
pd <- import("pandas") # we only need pandas for get_dummies, its one-hot encoder
X <- credit_cards %>%
  dplyr::select(-Class) %>%
  mutate_all(.funs = funs(make_percentiles)) %>%
  apply(2, pd$get_dummies) %>%
  as.data.frame()
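# If you'd rather avoid the Python dependency, a minimal pure-R sketch of the same one-hot
# encoding (my substitution, not part of the original gist) would be something like:
# one_hot <- function(f) sapply(levels(factor(f)), function(l) as.numeric(f == l))
# deciles <- credit_cards %>% dplyr::select(-Class) %>% mutate_all(.funs = funs(make_percentiles))
# X <- as.data.frame(lapply(deciles, one_hot))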
# Let's do some random cross-validation
if (dev.cur() > 1) dev.off() # close any open graphics device so the overlaid ROC plots start fresh
folds <- 50
aucs <- NULL
for(i in 1:folds) {
  training_rows <- sample(1:nrow(X), round(nrow(X)*((folds - 1)/folds)))
  training_X <- X[training_rows,] %>% as.matrix
  testing_X <- X[-training_rows,] %>% as.matrix
  training_labels <- labels[training_rows]
  testing_labels <- labels[-training_rows]
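  # Smoothed mean: mixing in a single pseudo-observation of 1/N keeps every estimated
  # probability strictly between 0 and 1, so the logs below never hit -Inf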
  almost_average <- function(x) mean(c(x, 1/N))
  X2 <- bind_cols(labels = training_labels, as_data_frame(training_X)) %>%
    group_by(labels) %>%
    summarise_all(.funs = funs(almost_average)) %>%
    select(-labels) %>%
    as.matrix
  t_probs <- log(X2)     # log P(dummy = 1 | class)
  n_probs <- log(1 - X2) # log P(dummy = 0 | class)
  # Approximate log likelihood of each test observation under each class
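  # Each entry is sum_j x_j*log(p_cj) + (1 - x_j)*log(1 - p_cj): independent Bernoulli
  # likelihoods for the dummies, i.e. the naive Bayes assumption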
  log_likelihoods <- testing_X %*% t(t_probs) + (1 - testing_X) %*% t(n_probs)
  predicted_probs <- t(apply(log_likelihoods, 1, softmax))
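  # Note: the softmax omits class priors log P(class). Adding them would shift every
  # observation's log-odds by the same constant, leaving the ranking (and hence AUC) unchanged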
  roc_1 <- roc(testing_labels, predicted_probs[,2]) # column 2 is the fraud class (group_by sorts labels 0, 1)
  aucs[i] <- roc_1$auc
  if (i > 1) par(new = TRUE) # overlay this fold's ROC curve on the previous folds'
  print(plot(roc_1, col = i))
}
data_frame(auc = aucs) %>%
  ggplot(aes(y = auc, x = 1)) +
  geom_violin(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")
mean(aucs) # average AUC across the 50 folds
data_frame(auc = aucs) %>%
  ggplot(aes(x = auc)) +
  geom_histogram(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")