# How to build your own NN classifier in R
# source: http://www.r-bloggers.com/build-your-own-neural-network-classifier-in-r/
# reference: http://junma5.weebly.com/data-blog/build-your-own-neural-network-classifier-in-r
# project:
# 1) build a simple NN with 2 fully-connected layers
# 2) use the NN to classify a 4-class 2D spiral dataset & visualize the decision boundary
# 3) train the NN on the MNIST dataset
# ref: Stanford CS231n course notes: http://cs231n.github.io/
# Build spiral dataset, 4 classes * 200 examples
library(ggplot2)
library(caret)
# if the caret package is not installed: install.packages('caret')
# caret = predictive-modeling library, providing:
# -- data splits
# -- pre-processing
# -- feature selection
# -- model tuning
# -- variable importance estimation
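# a minimal illustration (not from the original post) of caret's data
# splitting, which is used later for MNIST; createDataPartition returns
# row indices for a stratified split, e.g.:
#   idx <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
#   training <- iris[idx, ]; testing <- iris[-idx, ]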
N <- 200 # number of points per class
D <- 2 # dimensionality
K <- 4 # number of classes
X <- data.frame() # data matrix (rows = records)
y <- data.frame() # class labels
set.seed(308) # seed the random-number generator for reproducible results
for (j in (1:K)){ # for each class,
  r <- seq(0.05, 1, length.out = N) # radius = sequence of length N
  t <- seq((j-1)*4.7, j*4.7, length.out = N) + rnorm(N, sd = 0.3) # theta, with noise
  Xtemp <- data.frame(x = r*sin(t), y = r*cos(t))
  ytemp <- data.frame(matrix(j, N, 1))
  X <- rbind(X, Xtemp)
  y <- rbind(y, ytemp)
}
data <- cbind(X, y) # combined data frame
colnames(data) <- c(colnames(X), 'label')
# X & y are 800x2 & 800x1 data frames
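# quick check (optional; not in the original post): X should be (N*K) x D
stopifnot(nrow(X) == N*K, ncol(X) == D)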
x_min <- min(X[,1])-0.2
x_max <- max(X[,1])+0.2
y_min <- min(X[,2])-0.2
y_max <- max(X[,2])+0.2
# let's visualize the data:
ggplot(data) +
  geom_point(aes(x = x, y = y, color = as.character(label)), size = 2) +
  theme_bw(base_size = 15) +
  xlim(x_min, x_max) + ylim(y_min, y_max) +
  ggtitle('Spiral Data Visualization') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.text = element_blank(),
        axis.title = element_blank(), legend.position = 'none')
# construct the NN inputs:
# build a one-hot label matrix Y (800x4): for each row, the entry at the
# label's index is 1 (0 otherwise)
X <- as.matrix(X)
Y <- matrix(0, N*K, K)
for (i in 1:(N*K)){
  Y[i, y[i,]] <- 1
}
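# optional sanity check (not in the original post): every row of the
# one-hot matrix should contain exactly one 1
stopifnot(all(rowSums(Y) == 1))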
################################################################################
# NN construction
#
# uses the X & Y matrices; returns a list of 4 parameters
# (W, b, W2, b2 = the weights & biases of the two layers)
# step_size = the learning rate
# reg = the regularization strength (aka lambda)
#
# activation function: ReLU
# loss/cost function: softmax cross-entropy
#
# the implementation uses vectorized ops
#
################################################################################
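# for reference, the per-example loss implemented below is the softmax
# cross-entropy:  L_i = -log( exp(s[y_i]) / sum_j exp(s[j]) )
# and the total loss = mean(L_i) + 0.5*reg*(sum(W^2) + sum(W2^2))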
# %*% = matrix multiplication, * = element-wise product
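# e.g.: A <- matrix(1:4, 2); A %*% A is the 2x2 matrix product,
# while A * A squares each entry individually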
nnet <- function(X, Y, step_size = 0.5, reg = 0.001, h = 10, niteration){
  # get dims of input
  N <- nrow(X) # number of examples
  K <- ncol(Y) # number of classes
  D <- ncol(X) # dimensionality
  # initialize parameters randomly
  # rnorm = random numbers drawn from a normal distribution
  W <- 0.01 * matrix(rnorm(D*h), nrow = D)
  b <- matrix(0, nrow = 1, ncol = h)
  W2 <- 0.01 * matrix(rnorm(h*K), nrow = h)
  b2 <- matrix(0, nrow = 1, ncol = K)
  # gradient descent loop to update weights and biases
  for (i in 0:niteration){
    # hidden layer with ReLU activation;
    # matrix(rep(b,N), nrow = N, byrow = T) replicates the 1-x-h bias row
    # across all N examples (R does not broadcast automatically)
    hidden_layer <- pmax(0, X %*% W + matrix(rep(b,N), nrow = N, byrow = T))
    hidden_layer <- matrix(hidden_layer, nrow = N)
    # class scores
    scores <- hidden_layer %*% W2 + matrix(rep(b2,N), nrow = N, byrow = T)
    # compute and normalize class probabilities (softmax)
    exp_scores <- exp(scores)
    probs <- exp_scores / rowSums(exp_scores)
    # compute the loss: softmax cross-entropy plus L2 regularization
    correct_logprobs <- -log(probs)
    data_loss <- sum(correct_logprobs*Y)/N
    reg_loss <- 0.5*reg*sum(W*W) + 0.5*reg*sum(W2*W2)
    loss <- data_loss + reg_loss
    # report progress
    if (i %% 1000 == 0 | i == niteration){
      print(paste("iteration", i, ': loss', loss))}
    # compute the gradient on the scores:
    # for softmax cross-entropy, d(loss)/d(scores) = probs - Y, averaged over N
    dscores <- probs - Y
    dscores <- dscores/N
    # backpropagate the gradient to the output-layer parameters
    dW2 <- t(hidden_layer) %*% dscores
    db2 <- colSums(dscores)
    # next, backprop into the hidden layer
    dhidden <- dscores %*% t(W2)
    # backprop through the ReLU non-linearity
    # (gradient is zero wherever the activation was zero)
    dhidden[hidden_layer <= 0] <- 0
    # finally into W, b
    dW <- t(X) %*% dhidden
    db <- colSums(dhidden)
    # add the regularization gradient contribution
    dW2 <- dW2 + reg*W2
    dW <- dW + reg*W
    # update parameters
    W <- W - step_size*dW
    b <- b - step_size*db
    W2 <- W2 - step_size*dW2
    b2 <- b2 - step_size*db2
  }
  return(list(W, b, W2, b2))
}
################################################################################
# Prediction function
#
################################################################################
nnetPred <- function(X, para = list()){
  W <- para[[1]]
  b <- para[[2]]
  W2 <- para[[3]]
  b2 <- para[[4]]
  # pmax = parallel (element-wise) max on a pair of vectors
  # A %*% B = matrix multiplication
  N <- nrow(X) # number of examples
  hidden_layer <- pmax(0, X %*% W + matrix(rep(b,N), nrow = N, byrow = T))
  hidden_layer <- matrix(hidden_layer, nrow = N)
  scores <- hidden_layer %*% W2 + matrix(rep(b2,N), nrow = N, byrow = T)
  predicted_class <- apply(scores, 1, which.max)
  return(predicted_class)
}
nnet.model <- nnet(X, Y, step_size = 0.4, reg = 0.0002, h = 50, niteration = 6000)
predicted_class <- nnetPred(X, nnet.model)
print(paste('training accuracy:', mean(predicted_class == y[,1])))
################################################################################
# plot decision boundary
#
################################################################################
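# approach: predict a class for every point of a dense grid (spacing hs)
# covering the plot region, then tile the predictions underneath the
# training points so the class regions and their boundaries are visible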
hs <- 0.01 # grid spacing
grid <- as.matrix(expand.grid(seq(x_min, x_max, by = hs), seq(y_min, y_max, by = hs)))
Z <- nnetPred(grid, nnet.model)
ggplot() +
  geom_tile(aes(x = grid[,1], y = grid[,2], fill = as.character(Z)), alpha = 0.3, show.legend = F) +
  geom_point(data = data, aes(x = x, y = y, color = as.character(label)), size = 2) +
  theme_bw(base_size = 15) +
  ggtitle('Neural Network Decision Boundary') +
  coord_fixed(ratio = 0.8) +
  theme(axis.ticks = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), axis.text = element_blank(),
        axis.title = element_blank(), legend.position = 'none')
print('done')
################################################################################
# MNIST data & pre-processing
#
################################################################################
displayDigit <- function(X){
  m <- matrix(unlist(X), nrow = 28, byrow = T)
  # image() draws with the y-axis increasing upward, so rotate the pixel
  # matrix to display the digit upright
  m <- t(apply(m, 2, rev))
  image(m, col = grey.colors(255))
}
train <- read.csv("data/train.csv", header = TRUE, stringsAsFactors = F)
displayDigit(train[18,-1])
################################################################################
# pre-process: remove near-zero-variance columns & scale by max(X)
#
# data is split for cross-validation
################################################################################
nzv <- nearZeroVar(train)
nzv.nolabel <- nzv-1
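# column 1 of train is the label, which the feature matrices below drop,
# so the nearZeroVar indices are shifted down by 1 to index into them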
inTrain <- createDataPartition(y=train$label, p=0.7, list=F)
training <- train[inTrain, ]
CV <- train[-inTrain, ]
X <- as.matrix(training[, -1]) # data matrix (each row = single example)
N <- nrow(X) # number of examples
y <- training[, 1] # class labels
K <- length(unique(y)) # number of classes
X.proc <- X[, -nzv.nolabel]/max(X) # scale
D <- ncol(X.proc) # dimensionality
Xcv <- as.matrix(CV[, -1]) # data matrix (each row = single example)
ycv <- CV[, 1] # class labels
Xcv.proc <- Xcv[, -nzv.nolabel]/max(X) # scale CV data
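# note: the CV features are divided by max(X) from the *training* data so
# that both sets share exactly the same scaling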
Y <- matrix(0, N, K)
for (i in 1:N){
  Y[i, y[i]+1] <- 1 # digit labels run 0..9, so shift to 1-based columns
}
################################################################################
# model training & CV accuracy
#
################################################################################
nnet.mnist <- nnet(X.proc, Y, step_size = 0.3, reg = 0.0001, niteration = 3500)
predicted_class <- nnetPred(X.proc, nnet.mnist)
print(paste('training set accuracy:', mean(predicted_class == (y+1))))
predicted_class <- nnetPred(Xcv.proc, nnet.mnist)
print(paste('CV accuracy:', mean(predicted_class == (ycv+1))))
################################################################################
# randomly select an image & predict the label
#
################################################################################
Xtest <- Xcv[sample(1:nrow(Xcv), 1), ]
Xtest.proc <- as.matrix(Xtest[-nzv.nolabel], nrow = 1)
predicted_test <- nnetPred(t(Xtest.proc), nnet.mnist)
# subtract 1: class indices run 1..10 but digits run 0..9
print(paste('The predicted digit is:', predicted_test-1))
displayDigit(Xtest)