@georgeblck
Created February 7, 2017 10:45
Minimal character-level language model with a Vanilla Recurrent Neural Network, in R
###
### Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
### BSD License
### Re-written in R by @georgeblck
###
rm(list=ls(all=TRUE))
options(digits=10)
# Define the gradient-clipping function; use Rcpp for speed if it is installed
if ("Rcpp" %in% rownames(installed.packages())) {
    library(Rcpp)
    cppFunction('NumericVector rcpp_clip( NumericVector x, double a, double b){
        return clamp( a, x, b ) ;
    }')
    clip.mat <- function(mat) {
        matrix(apply(mat, 2, rcpp_clip, a = -5, b = 5),
               nrow = nrow(mat), ncol = ncol(mat))
    }
} else {
    clip <- function(vec, UB = 5, LB = -5) {
        pmax(LB, pmin(vec, UB))
    }
    clip.mat <- function(mat) {
        matrix(apply(mat, 2, clip), nrow = nrow(mat), ncol = ncol(mat))
    }
}
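## Optional sanity check (illustrative only, values chosen for the demo):
## elements outside [-5, 5] should be clamped to the interval bounds.
stopifnot(all(clip.mat(matrix(c(-10, 0, 10), 1, 3)) == c(-5, 0, 5)))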
# Data I/O
# Download the tiny-shakespeare data (requires internet access)
data <- readLines("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
data <- paste(data, collapse = "\n")
data <- strsplit(data, "")[[1]]
chars <- unique(data)
data_size <- length(data)
vocab_size <- length(chars)
char_to_ix <- rbind(chars, 1:vocab_size) # lookup table: row 1 = characters, row 2 = integer indices
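## Optional check (illustrative; demo_ids is just a throwaway name): the lookup
## table should round-trip, i.e. mapping characters to indices and back recovers them.
demo_ids <- match(data[1:3], char_to_ix[1, ])
stopifnot(all(char_to_ix[1, demo_ids] == data[1:3]))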
# hyperparameters
hidden_size <- 100   # size of the hidden layer of neurons
seq_length <- 25     # number of steps to unroll the RNN for
learning_rate <- 0.1
# model parameters
set.seed(12345)
Wxh <- matrix(rnorm(hidden_size * vocab_size),
              hidden_size, vocab_size) * 0.01    # input to hidden
Whh <- matrix(rnorm(hidden_size * hidden_size),
              hidden_size, hidden_size) * 0.01   # hidden to hidden
Why <- matrix(rnorm(hidden_size * vocab_size),
              vocab_size, hidden_size) * 0.01    # hidden to output
bh <- matrix(0, hidden_size, 1)                  # hidden bias
by <- matrix(0, vocab_size, 1)                   # output bias
weight.list <- list(Wxh = Wxh, Whh = Whh, Why = Why,
                    bh = bh, by = by)
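## Optional shape check (illustrative): each parameter should match the architecture,
## e.g. Wxh is hidden_size x vocab_size and the biases are column vectors.
print(sapply(weight.list, dim))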
lossFun <- function(inputs, targets, hprev, w) {
    ## inputs, targets are both vectors of integers (i.e. x, y)
    ## hprev is an Hx1 vector holding the initial hidden state
    ## w is the list of weights
    ## returns the loss, the gradients on the model parameters, and the last hidden state
    # Initialize variables
    loss <- 0
    len_input <- length(inputs)
    xs <- matrix(0, nrow = vocab_size, ncol = len_input)
    hs <- matrix(0, nrow = hidden_size, ncol = (len_input + 1))
    hs[, 1] <- hprev
    ys <- xs
    ps <- ys
    # forward pass
    for (t in 1:len_input) {
        # encode the input in a 1-of-k representation
        xs[inputs[t], t] <- 1
        hs[, (t + 1)] <- tanh(w$Wxh %*% xs[, t] +
                              w$Whh %*% hs[, (t - 1 + 1)] + w$bh)
        # unnormalized log probabilities for the next chars
        ys[, t] <- w$Why %*% hs[, (t + 1)] + w$by
        # probabilities for the next chars
        ps[, t] <- exp(ys[, t]) / sum(exp(ys[, t]))
        # softmax (cross-entropy) loss
        loss <- loss + (-1.0) * log(ps[targets[t], t])
    }
    # backward pass: compute gradients going backwards
    dWxh <- matrix(0, hidden_size, vocab_size)
    dWhh <- matrix(0, hidden_size, hidden_size)
    dWhy <- matrix(0, vocab_size, hidden_size)
    dbh <- matrix(0, hidden_size, 1)
    dby <- matrix(0, vocab_size, 1)
    dhnext <- 0 # gradient flowing back from the future hidden state
    for (t in len_input:1) {
        dy <- ps[, t]
        # backprop into y
        dy[targets[t]] <- dy[targets[t]] - 1
        dWhy <- dWhy + dy %*% t(hs[, t + 1])
        dby <- dby + dy
        # backprop into h
        dh <- t(w$Why) %*% dy + dhnext
        # backprop through the tanh nonlinearity
        dhraw <- (1 - hs[, t + 1]^2) * dh
        dbh <- dbh + dhraw
        dWxh <- dWxh + dhraw %*% t(xs[, t])
        dWhh <- dWhh + dhraw %*% t(hs[, (t - 1 + 1)])
        dhnext <- t(w$Whh) %*% dhraw
    }
    dweights.list <- list(dWxh = dWxh, dWhh = dWhh, dWhy = dWhy,
                          dbh = dbh, dby = dby)
    # clip the gradients to mitigate exploding gradients
    dweights.list <- lapply(dweights.list, clip.mat)
    return(list(loss = loss, dweights.list = dweights.list,
                hprev = hs[, len_input + 1]))
}
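## Optional sanity check (illustrative only; the demo_* names are throwaway):
## with freshly initialised weights every character is roughly equally likely,
## so the loss over one sequence should be close to log(vocab_size) * seq_length.
demo_inputs  <- match(data[1:seq_length], char_to_ix[1, ])
demo_targets <- match(data[2:(seq_length + 1)], char_to_ix[1, ])
demo_res     <- lossFun(demo_inputs, demo_targets, matrix(0, hidden_size, 1), weight.list)
cat("initial loss:", demo_res$loss, "~ expected:", log(vocab_size) * seq_length, "\n")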
sampled <- function(h, seed_ix, n, w) {
    ## samples a sequence of n integers from the model
    ## h is the memory state, seed_ix is the seed letter for the first time step
    ## w is the list of weights
    x <- matrix(0, vocab_size, 1)
    x[seed_ix, ] <- 1
    ixes <- NULL
    for (t in 1:n) {
        h <- tanh(w$Wxh %*% x + w$Whh %*% h + w$bh)
        y <- w$Why %*% h + w$by # use the output bias from the weight list, not the global `by`
        p <- exp(y) / sum(exp(y))
        ix <- sample(x = 1:vocab_size, size = 1,
                     prob = drop(p), replace = TRUE)
        x <- matrix(0, vocab_size, 1)
        x[ix, ] <- 1
        ixes <- c(ixes, ix)
    }
    return(ixes)
}
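## Quick illustration (demo_ix is just a throwaway name): draw 50 characters from
## the still untrained model, seeded with the first character of the data; the
## output is essentially random gibberish at this point.
demo_ix <- sampled(matrix(0, hidden_size, 1), match(data[1], char_to_ix[1, ]), 50, weight.list)
cat(paste(char_to_ix[1, demo_ix], collapse = ""), "\n")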
# memory variables for Adagrad
mWxh <- matrix(0, hidden_size, vocab_size)
mWhh <- matrix(0, hidden_size, hidden_size)
mWhy <- matrix(0, vocab_size, hidden_size)
mbh <- matrix(0, hidden_size, 1)
mby <- matrix(0, vocab_size, 1)
mweights.list <- list(mWxh = mWxh, mWhh = mWhh, mWhy = mWhy,
                      mbh = mbh, mby = mby)
# loss at iteration 0
smooth_loss <- (-1) * log(1/vocab_size) * seq_length
n <- 1   # iteration counter
p <- 1   # data pointer
while (TRUE) {
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if ((p + seq_length + 1) >= length(data) || n == 1) {
        cat("\nNew Epoch\n")
        hprev <- matrix(0, hidden_size, 1)   # reset RNN memory
        p <- 1                               # go from the start of the data
    }
    inputs <- apply(as.matrix(data[p:(p + seq_length - 1)]), 1,
                    function(x) grep(x, char_to_ix[1, ], fixed = TRUE))
    targets <- apply(as.matrix(data[(p + 1):(p + seq_length)]), 1,
                     function(x) grep(x, char_to_ix[1, ], fixed = TRUE))
    # sample from the model now and then
    if (n %% 1000 == 0) {
        sample_ix <- sampled(hprev, inputs[1], 200, weight.list)
        cat(paste(char_to_ix[1, sample_ix], collapse = ""))
    }
    # forward seq_length characters through the net and fetch the gradient
    results <- lossFun(inputs, targets, hprev, weight.list)
    smooth_loss <- smooth_loss * 0.999 + results$loss * 0.001
    if (n %% 1000 == 0) {
        cat("\nIteration ", n, ", loss:\t", smooth_loss)
    }
    hprev <- results$hprev
    # parameter update with Adagrad
    updated <- mapply(FUN = function(we, dwe, mem) {
        mem <- mem + dwe * dwe
        # adagrad update
        we <- we - (learning_rate * dwe) / sqrt(mem + 1e-8)
        return(list(we = we, mem = mem))
    }, we = weight.list, dwe = results$dweights.list,
       mem = mweights.list, SIMPLIFY = FALSE)
    weight.list <- lapply(updated, function(x) x$we)
    mweights.list <- lapply(updated, function(x) x$mem)
    # move the data pointer
    p <- p + seq_length
    # increment the iteration counter
    n <- n + 1
    # stop after 10,000 iterations
    if (n == 10001) break
}
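## Optional (illustrative; final_ix is just a throwaway name): after training,
## draw a longer sample from the final weights, seeded with the first character
## of the data, and print it as text.
final_ix <- sampled(hprev, match(data[1], char_to_ix[1, ]), 500, weight.list)
cat(paste(char_to_ix[1, final_ix], collapse = ""), "\n")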
# gradient checking function
gradCheck <- function(inputs, targets, hprev, w) {
    # How many checks per parameter?
    num_checks <- 10
    delta <- 1e-5
    dw <- lossFun(inputs, targets, hprev, w)$dweights.list
    doit <- mapply(FUN = function(weights, dweights, names) {
        s0 <- dim(weights)
        s1 <- dim(dweights)
        if (any(s0 != s1))
            cat("Error: dims don't match:\t", s0, "\t", s1, "\n")
        w.temp <- w
        cat("\nWeight:", names, "\n")
        for (i in 1:num_checks) {
            # pick a random parameter index to perturb
            ri <- sample(length(weights), 1)
            weights_vec <- as.vector(weights)
            old_val <- weights_vec[ri]
            # evaluate the cost at [x + delta] and [x - delta]
            weights_vec[ri] <- old_val + delta
            w.temp[[names]] <- matrix(weights_vec, nrow = nrow(weights), ncol = ncol(weights))
            cg0 <- lossFun(inputs, targets, hprev, w.temp)$loss
            weights_vec[ri] <- old_val - delta
            w.temp[[names]] <- matrix(weights_vec, nrow = nrow(weights), ncol = ncol(weights))
            cg1 <- lossFun(inputs, targets, hprev, w.temp)$loss
            # reset the old value for this parameter
            w.temp[[names]] <- weights
            # fetch both the numerical and the analytic gradient
            grad_analytical <- as.vector(dweights)[ri]
            grad_numerical <- (cg0 - cg1) / (2 * delta)
            rel_error <- abs(grad_analytical - grad_numerical) / abs(grad_numerical + grad_analytical)
            # rel_error should be on the order of 1e-7 or less
            cat("Numerical: ", grad_numerical, "|| Analytical: ", grad_analytical, "||-->\t", rel_error, "\n")
        }
    }, weights = w, dweights = dw, names = names(w))
}
set.seed(123456)
gradCheck(inputs, targets, hprev, weight.list)