# This gist applies the classifier defined in this post: http://modernstatisticalworkflow.blogspot.com/2018/03/1000-labels-and-4500-observations-have.html
# with this approximation: https://gist.github.com/khakieconomics/0325c054b1499d5037a1de5d1014645a
# to Kaggle's credit card fraud data--a fun rare-case problem. In a cross-validation exercise with 50 randomly selected hold-out
# sets, it appears to perform similarly to (or perhaps better than) others' attempts using random forests and neural networks.
# The upside, of course, is that it estimates and generates predictions in a couple of seconds.
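# In outline: discretize every feature into deciles, one-hot encode the deciles, estimate smoothed
# per-class Bernoulli probabilities for each dummy, then score observations by their summed
# log-likelihoods--a naive-Bayes-style classifier implemented as a single matrix multiplication.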
library(tidyverse); library(reticulate); library(pROC)
# Download the data from Kaggle (requires the kaggle CLI; --unzip extracts creditcard.csv)
system("kaggle datasets download -d mlg-ulb/creditcardfraud -p ~/Documents/kagglestuff --unzip")
# Read it -- there's a row in there that's got a float in an integer position
credit_cards <- read_csv("~/Documents/kagglestuff/creditcard.csv")
credit_cards <- credit_cards %>% filter(complete.cases(.))
labels <- credit_cards$Class
# We're going to discretize each column
N <- 10
# Some functions we'll need
make_percentiles <- function(x) ordered(ntile(x, n = N)) # bucket a column into N quantile bins
softmax <- function(x) exp(x)/sum(exp(x)) # turn a vector of log likelihoods into probabilities
pd <- import("pandas") # we only need pandas for get_dummies, its one-hot encoder
X <- credit_cards %>%
  dplyr::select(-Class) %>%
  mutate_all(.funs = funs(make_percentiles)) %>%
  apply(2, pd$get_dummies) %>%
  as.data.frame()
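# If you'd rather avoid the Python dependency, a minimal pure-R sketch of the same one-hot
# encoding (my substitution, not part of the original gist) would be something like:
# one_hot <- function(f) sapply(levels(factor(f)), function(l) as.numeric(f == l))
# deciles <- credit_cards %>% dplyr::select(-Class) %>% mutate_all(.funs = funs(make_percentiles))
# X <- as.data.frame(lapply(deciles, one_hot))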
# Let's do some random cross-validation
if (dev.cur() > 1) dev.off() # close any open graphics device so the overlaid ROC plots start fresh
folds <- 50
aucs <- NULL
for(i in 1:folds) {
  training_rows <- sample(1:nrow(X), round(nrow(X)*((folds - 1)/folds)))
  training_X <- X[training_rows,] %>% as.matrix
  testing_X <- X[-training_rows,] %>% as.matrix
  training_labels <- labels[training_rows]
  testing_labels <- labels[-training_rows]
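  # Smoothed mean: mixing in a single pseudo-observation of 1/N keeps every estimated
  # probability strictly between 0 and 1, so the logs below never hit -Inf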
  almost_average <- function(x) mean(c(x, 1/N))
  X2 <- bind_cols(labels = training_labels, as_data_frame(training_X)) %>%
    group_by(labels) %>%
    summarise_all(.funs = funs(almost_average)) %>%
    select(-labels) %>%
    as.matrix
  t_probs <- log(X2)     # log P(dummy = 1 | class)
  n_probs <- log(1 - X2) # log P(dummy = 0 | class)
  # Approximate log likelihood of each test observation under each class
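  # Each entry is sum_j x_j*log(p_cj) + (1 - x_j)*log(1 - p_cj): independent Bernoulli
  # likelihoods for the dummies, i.e. the naive Bayes assumption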
  log_likelihoods <- testing_X %*% t(t_probs) + (1 - testing_X) %*% t(n_probs)
  predicted_probs <- t(apply(log_likelihoods, 1, softmax))
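  # Note: the softmax omits class priors log P(class). Adding them would shift every
  # observation's log-odds by the same constant, leaving the ranking (and hence AUC) unchanged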
  roc_1 <- roc(testing_labels, predicted_probs[,2]) # column 2 is the fraud class (group_by sorts labels 0, 1)
  aucs[i] <- roc_1$auc
  if (i > 1) par(new = TRUE) # overlay this fold's ROC curve on the previous folds'
  print(plot(roc_1, col = i))
}
data_frame(auc = aucs) %>%
  ggplot(aes(y = auc, x = 1)) +
  geom_violin(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")
mean(aucs) # average AUC across the 50 folds
data_frame(auc = aucs) %>%
  ggplot(aes(x = auc)) +
  geom_histogram(alpha = 0.6) +
  labs(title = "Matrix multiplication + dplyr beats nets/forests",
       subtitle = "AUC from 50 randomly-selected holdout sets (1/50th of sample)")