@tuner
Created April 29, 2018 13:16
Stochastic Gradient Descent
# (c) Kari Lavikka
######### Stochastic gradient descent "library" ##########
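# Mini-batch stochastic gradient descent with ridge (L2) regularization and early
# stopping against a held-out validation set.
#
#   x, y                        training design matrix and response
#   validation_x, validation_y  held-out data used for early stopping
#   loss_function               loss_function(theta, x, y, lambda)
#   gradient_function           gradient_function(theta, x, y, lambda)
#   step_function               step_function(theta, gradient); returns the step size
#   lambda                      regularization strength
#   M                           mini-batch size
#   normalize                   if TRUE, only the direction of the gradient is used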
gradient_descent <- function(x, y,
                             validation_x, validation_y,
                             loss_function, gradient_function,
                             step_function,
                             lambda, M,
                             normalize = FALSE) {
  # Randomized initial parameters seem to yield inconsistent results
  #theta <- rnorm(ncol(x))
  # A vector of zeroes works better
  theta <- rep(0, ncol(x))

  training_losses <- numeric()
  validation_losses <- numeric()

  min_loss <- Inf
  iters_since_last_min <- 0

  # Early stopping: iterate until the validation loss has not improved for 20 iterations
  while (iters_since_last_min < 20) {
    # Draw a mini-batch of M rows without replacement
    samples <- sample(nrow(x), M)
    gradient <- gradient_function(theta, x[samples, ], y[samples], lambda)
    alpha <- step_function(theta, gradient)

    if (normalize) {
      # Use only the direction of the gradient, scaled by the step size
      theta <- theta - alpha * gradient / sqrt(sum(gradient^2))
    } else {
      theta <- theta - alpha * gradient
    }

    validation_loss <- loss_function(theta, validation_x, validation_y, lambda)
    validation_losses <- c(validation_losses, validation_loss)
    training_losses <- c(training_losses, loss_function(theta, x[samples, ], y[samples], lambda))

    if (validation_loss < min_loss) {
      min_loss <- validation_loss
      iters_since_last_min <- 0
    } else {
      iters_since_last_min <- iters_since_last_min + 1
    }
  }

  print(theta)

  # Training loss in red, validation loss in blue, on a log scale
  plot(training_losses, type = "l", log = "y", xlab = "Iteration", ylab = "Loss", col = "red")
  lines(validation_losses, type = "l", col = "blue")
}
# Ridge-regularized mean squared error
compute_loss <- function(theta, x, y, lambda) {
  if (!is.null(dim(x))) {
    residuals <- apply(x, 1, function(row) sum(theta * row)) - y
    mean(residuals^2) + lambda * sum(theta * theta)
  } else {
    # apply() doesn't work here when a single sampled row has been dropped to a plain vector
    residuals <- sum(theta * x) - y
    residuals^2 + lambda * sum(theta * theta)
  }
}
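# For a mini-batch of M rows the objective is
#   L(theta) = (1/M) * sum_i (theta . x_i - y_i)^2 + lambda * ||theta||^2
# so its gradient, implemented below, is
#   dL/dtheta = (2/M) * sum_i (theta . x_i - y_i) * x_i + 2 * lambda * theta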
compute_gradient <- function(theta, x, y, lambda) {
  if (!is.null(dim(x))) {
    residuals <- apply(x, 1, function(row) sum(theta * row)) - y
    2 * colMeans(x * residuals) + 2 * lambda * theta
  } else {
    # Single-row mini-batch: x has been dropped to a plain vector.
    # The gradient of the squared residual includes the feature vector x.
    residuals <- sum(theta * x) - y
    2 * residuals * x + 2 * lambda * theta
  }
}
# Deterministic decay schedule: eta_t = initial_eta / (1 + initial_eta * decay_speed * t)
create_deterministic_schedule <- function(initial_eta, decay_speed) {
  t <- -1
  function(theta, gradient) {
    t <<- t + 1
    initial_eta / (1 + initial_eta * decay_speed * t)
  }
}
# AdaGrad: per-parameter step sizes initial_eta / (tau + sqrt(accumulated squared gradients))
create_ada_grad <- function(initial_eta, tau) {
  s <- 0
  function(theta, gradient) {
    s <<- s + gradient^2
    initial_eta / (tau + sqrt(s))
  }
}
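# Any closure of the form function(theta, gradient) that returns a scalar or
# per-parameter step size can be passed as step_function. As a minimal sketch
# (a hypothetical helper, not used in the experiments below), a constant learning rate:
create_constant_step <- function(eta) {
  function(theta, gradient) eta
}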
######### Let's play with the "library" ##########

library(readr)

boston_house_prices <- read_csv("boston_house_prices.csv")

# Shuffle the rows and split into training and validation sets
boston_house_prices <- boston_house_prices[sample.int(nrow(boston_house_prices)), ]
training_data <- boston_house_prices[1:400, ]
validation_data <- boston_house_prices[-(1:400), ]

# MEDV (column 14) is the response; the remaining columns are the predictors
y <- training_data$MEDV
x <- training_data[-14]
x <- as.matrix(cbind(x, BIAS = 1))

validation_y <- validation_data$MEDV
validation_x <- validation_data[-14]
validation_x <- as.matrix(cbind(validation_x, BIAS = 1))
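# A constant BIAS column of ones is appended to both design matrices, so the last
# component of theta acts as the intercept.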
gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 6),
                 lambda = 0, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 0, Deterministic(1, 6)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 3),
                 lambda = 0, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 0, Deterministic(1, 3)")
gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 1),
                 lambda = 100, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 100, Deterministic(1, 1)")

gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_deterministic_schedule(1, 0.1),
                 lambda = 100, M = 25,
                 normalize = TRUE)
title("M = 25, Lambda = 100, Deterministic(1, 0.1)")
gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_ada_grad(1, 2),
                 lambda = 1, M = 1,
                 normalize = FALSE)
title("M = 1, Lambda = 1, AdaGrad(1, 2)")
gradient_descent(x, y, validation_x, validation_y, compute_loss, compute_gradient,
                 create_ada_grad(1, 2),
                 lambda = 10, M = 25,
                 normalize = FALSE)
title("M = 25, Lambda = 10, AdaGrad(1, 2)")