@hoxo-m
Created April 4, 2019 15:41
AdaBoost: if you keep training after the training accuracy reaches 100%, the test accuracy keeps improving (Hastie et al. 2008)
# True labelling function ("nested spheres" example from Hastie et al. 2008):
# +1 if the squared norm exceeds the chi-squared median, -1 otherwise,
# so the two classes are roughly balanced under standard-normal features.
f <- function(X) {
  X <- as.matrix(X)
  limit <- qchisq(0.5, df = ncol(X))
  apply(X, 1, function(row) {
    if (sum(row^2) > limit) 1 else -1
  })
}
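# Quick sanity check (added for illustration, not in the original gist):
# the origin lies inside the median sphere (-1), a far-away point outside (+1).
stopifnot(f(rbind(c(0, 0), c(3, 3))) == c(-1, 1))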
# Problem size: D input dimensions, M boosting rounds,
# N_train training points, N_test test points.
D <- 2
M <- 2200
N_train <- 2000
N_test <- 10000

# Simulate standard-normal features and label them with the true function f.
set.seed(314)
X_train <- data.frame(matrix(rnorm(N_train * D), nrow = N_train))
y_train <- f(X_train)
X_test <- data.frame(matrix(rnorm(N_test * D), nrow = N_test))
y_test <- f(X_test)
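# Optional check (not in the original gist): the chi-squared median threshold
# should give roughly balanced classes in both samples.
table(y_train)
table(y_test)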
library(rpart)

# Discrete AdaBoost (AdaBoost.M1) with decision stumps (rpart, maxdepth = 1)
# as the weak learner, following Algorithm 10.1 in Hastie et al. (2008).
discrete_adaboost <- function(X, y, M) {
  N <- nrow(X)
  w <- rep(1 / N, N)               # uniform initial observation weights
  alpha_m <- double(M)             # weight of each weak learner
  Gm <- vector("list", length = M) # the fitted stumps
  X$y <- factor(y)
  for (i in seq_len(M)) {
    # Fit a stump to the weighted training data.
    Gm[[i]] <- rpart(y ~ ., data = X, weights = w, maxdepth = 1)
    y_hat <- predict(Gm[[i]], type = "class")
    # Weighted training error of this stump.
    err <- y != y_hat
    err_m <- sum(err * w) / sum(w)
    # Learner weight alpha_m = log((1 - err_m) / err_m); misclassified
    # observations get their weights multiplied by exp(alpha_m).
    alpha_m[i] <- log((1 - err_m) / err_m)
    w <- w * exp(alpha_m[i] * err)
  }
  list(alpha_m = alpha_m, Gm = Gm)
}
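# Illustrative usage (assumed, not in the original gist): a short run to
# inspect the learner weights; a larger alpha_m means the stump was more
# accurate on the weighted sample at that round.
small_fit <- discrete_adaboost(X_train, y_train, M = 5)
small_fit$alpha_m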
# Fit the full ensemble.
res <- discrete_adaboost(X_train, y_train, M = M)

# Accumulate the weighted votes of the first n_iter stumps and record the
# train/test accuracy of the boosted classifier sign(sum_i alpha_i * G_i(x))
# after each round.
compute_acc <- function(n_iter) {
  pred_train <- double(N_train)
  pred_test <- double(N_test)
  acc_train <- double(n_iter)
  acc_test <- double(n_iter)
  for (i in seq_len(n_iter)) {
    pred_train <- pred_train +
      res$alpha_m[[i]] * as.integer(as.character(predict(res$Gm[[i]], type = "class")))
    pred_test <- pred_test +
      res$alpha_m[[i]] * as.integer(as.character(predict(res$Gm[[i]], newdata = X_test, type = "class")))
    acc_train[[i]] <- mean(ifelse(pred_train >= 0, 1, -1) == y_train)
    acc_test[[i]] <- mean(ifelse(pred_test >= 0, 1, -1) == y_test)
  }
  data.frame(iter = 1:n_iter, train = acc_train, test = acc_test)
}
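# Example usage (illustrative, not in the original gist): accuracies for the
# first 10 boosting rounds, returned as a data frame with columns iter/train/test.
head(compute_acc(10))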
library(tidyverse)

# Reshape to long format so the train and test curves share one plot,
# keeping a label only at the last iteration for direct line labelling.
df <- compute_acc(M) %>%
  gather(train_or_test, acc, -iter) %>%
  mutate(label = ifelse(iter == max(iter), train_or_test, NA))

ggplot(df, aes(iter, acc, color = train_or_test)) +
  geom_line() +
  geom_label(aes(label = label), na.rm = TRUE) +
  scale_y_continuous(limits = c(0.97, NA),
                     labels = scales::percent_format(accuracy = 1)) +
  scale_color_discrete(guide = "none") +
  xlab("Iteration") + ylab("Accuracy")