# Gist: Stochastic Gradient Descent
# Created April 29, 2018
# (c) Kari Lavikka
######### Stochastic gradient descent "library" ##########
# Mini-batch stochastic gradient descent with early stopping.
#
# x, y                    training design matrix and response vector
# validation_x, _y        held-out data used for the early-stopping criterion
# loss_function           function(theta, x, y, lambda) -> scalar loss
# gradient_function       function(theta, x, y, lambda) -> gradient vector
# step_function           function(theta, gradient) -> step size (scalar or vector)
# lambda                  L2 regularization strength, forwarded to loss/gradient
# M                       mini-batch size
# normalize               if TRUE, use the unit-length gradient direction
# patience                stop after this many iterations without a new
#                         validation-loss minimum (original hard-coded 20)
# max_iter                hard cap on iterations, guards against an infinite
#                         loop when the validation loss keeps improving
#
# Side effects: prints the final theta and plots the loss traces.
# Returns theta invisibly.
gradient_descent <- function(x, y,
                             validation_x, validation_y,
                             loss_function, gradient_function,
                             step_function,
                             lambda, M,
                             normalize = FALSE,
                             patience = 20,
                             max_iter = 10000L) {
  # Randomized initial parameters yielded inconsistent results;
  # a vector of zeroes works better.
  theta <- rep(0, ncol(x))

  # Preallocate the loss traces (truncated to the actual length after the
  # loop) instead of growing them with c() on every iteration.
  training_losses <- numeric(max_iter)
  validation_losses <- numeric(max_iter)

  min_loss <- Inf
  iters_since_last_min <- 0
  iter <- 0

  while (iters_since_last_min < patience && iter < max_iter) {
    iter <- iter + 1

    # Draw a mini-batch; drop = FALSE keeps a one-row matrix when M == 1
    # so loss/gradient functions always receive a matrix.
    samples <- sample(nrow(x), M)
    batch_x <- x[samples, , drop = FALSE]
    batch_y <- y[samples]

    gradient <- gradient_function(theta, batch_x, batch_y, lambda)
    alpha <- step_function(theta, gradient)

    if (normalize) {
      gradient_norm <- sqrt(sum(gradient^2))
      # Guard against division by zero at an exact stationary point.
      if (gradient_norm > 0) {
        theta <- theta - alpha * gradient / gradient_norm
      }
    } else {
      theta <- theta - alpha * gradient
    }

    validation_loss <- loss_function(theta, validation_x, validation_y, lambda)
    validation_losses[iter] <- validation_loss
    training_losses[iter] <- loss_function(theta, batch_x, batch_y, lambda)

    # Early-stopping bookkeeping: reset the counter on every new minimum.
    if (validation_loss < min_loss) {
      min_loss <- validation_loss
      iters_since_last_min <- 0
    } else {
      iters_since_last_min <- iters_since_last_min + 1
    }
  }

  training_losses <- training_losses[seq_len(iter)]
  validation_losses <- validation_losses[seq_len(iter)]

  print(theta)
  plot(training_losses, type = "l", log = "y", xlab = "Iteration", ylab = "Loss", col = "red")
  lines(validation_losses, type = "l", col = "blue")

  invisible(theta)
}
# Regularized mean squared error: mean((x theta - y)^2) + lambda * |theta|^2.
#
# Accepts either a design matrix x (one observation per row) or a plain
# vector representing a single observation (which is what x[samples, ]
# produces when M == 1); the vector is promoted to a one-row matrix so a
# single matrix-multiplication path replaces the old row-wise apply().
compute_loss <- function(theta, x, y, lambda) {
  if (is.null(dim(x))) {
    x <- matrix(x, nrow = 1)
  }
  residuals <- as.vector(x %*% theta) - y
  mean(residuals^2) + lambda * sum(theta^2)
}
# Gradient of compute_loss with respect to theta:
#   2 * mean over rows of (x theta - y) * x  +  2 * lambda * theta.
#
# BUG FIX: the original single-observation branch returned
# `2 * residuals + 2 * lambda * theta`, omitting the factor x — the
# gradient of (theta . x - y)^2 is 2 * (theta . x - y) * x. Promoting a
# vector observation to a one-row matrix routes it through the correct
# matrix path.
compute_gradient <- function(theta, x, y, lambda) {
  if (is.null(dim(x))) {
    x <- matrix(x, nrow = 1)
  }
  residuals <- as.vector(x %*% theta) - y
  # x * residuals scales row i of x by residuals[i] (column-major recycling)
  2 * colMeans(x * residuals) + 2 * lambda * theta
}
# Build a deterministic step-size schedule closure:
#   eta_t = initial_eta / (1 + initial_eta * decay_speed * t),  t = 0, 1, 2, ...
# The returned function ignores theta and gradient; it only advances an
# internal iteration counter on each call.
create_deterministic_schedule <- function(initial_eta, decay_speed) {
  iteration <- 0
  function(theta, gradient) {
    eta <- initial_eta / (1 + initial_eta * decay_speed * iteration)
    iteration <<- iteration + 1
    eta
  }
}
# Build an AdaGrad step-size closure:
#   eta_t = initial_eta / (tau + sqrt(sum of squared past gradients)),
# accumulated per coordinate, so the returned step size is a vector with
# one entry per parameter.
create_ada_grad <- function(initial_eta, tau) {
  accumulated_sq <- 0
  function(theta, gradient) {
    accumulated_sq <<- accumulated_sq + gradient * gradient
    initial_eta / (tau + sqrt(accumulated_sq))
  }
}
######### Let's play with the "library" ##########
# ---- Load and split the Boston house price data ----
library(readr)
boston_house_prices <- read_csv("boston_house_prices.csv")

# Shuffle the rows before splitting into training and validation sets
boston_house_prices <- boston_house_prices[sample.int(nrow(boston_house_prices)), ]
training_data <- boston_house_prices[1:400, ]
validation_data <- boston_house_prices[-(1:400), ]

# Column 14 (MEDV) is the response; the remaining columns plus a constant
# BIAS column form the design matrix.
y <- training_data$MEDV
x <- as.matrix(cbind(training_data[-14], BIAS = 1))

validation_y <- validation_data$MEDV
validation_x <- as.matrix(cbind(validation_data[-14], BIAS = 1))
# ---- Experiments ----
# NOTE: the titles of the last four runs previously disagreed with the
# lambda values actually passed (e.g. "Lambda = 0" over a lambda = 100 run);
# the titles below now reflect the parameters used.
gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 6),
                 lambda = 0, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 0, Deterministic(1, 6)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 3),
                 lambda = 0, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 0, Deterministic(1, 3)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 1),
                 lambda = 100, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 100, Deterministic(1, 1)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 0.1),
                 lambda = 100, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 100, Deterministic(1, 0.1)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_ada_grad(1, 2),
                 lambda = 1, M = 1,
                 normalize = FALSE)
title("M = 1, Lambda = 1, AdaGrad(1, 2)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_ada_grad(1, 2),
                 lambda = 10, M = 25,
                 normalize = FALSE)
title("M = 25, Lambda = 10, AdaGrad(1, 2)")
# (end of gist)