# ianjohns/neural_network.R

## neural_network.R
#cars_19 data set
#neural network with 2 hidden layers (7 neurons and 3 neurons)

#raw data
#https://www.fueleconomy.gov/feg/epadata/19data.zip

library(neuralnet)
library(caret)

#load("~/R_Cars_19/Data/cars_19.Rdata")
# ---- Data preparation ----
# Label for this model; it is not referenced by the code shown here.
title <- "Neural Network"

# Positions in cars_19 of the columns that get min-max scaled. Column 1 is
# later used as the response (fuel_economy_combined). The other columns
# (4, 6, 7, 9:12) are carried over unscaled -- presumably categorical; confirm
# against the data set.
num_cols <- c(1:3, 5, 8)
maxs <- vapply(cars_19[, num_cols], max, numeric(1))
mins <- vapply(cars_19[, num_cols], min, numeric(1))

# Min-max scaling: (x - min) / (max - min) for every selected column.
scaled <- as.data.frame(
  scale(cars_19[, num_cols], center = mins, scale = maxs - mins)
)
tmp <- data.frame(scaled, cars_19[, c(4, 6, 7, 9:12)])

# Build the design matrix of all predictors (model.matrix adds an intercept
# column and expands any factors into indicator columns).
n <- names(cars_19)
predictors <- n[!n %in% "fuel_economy_combined"]
f <- as.formula(
  paste("fuel_economy_combined ~", paste(predictors, collapse = " + "))
)
m <- model.matrix(f, data = tmp)

# Append the scaled response as the last column. Going through data.frame()
# makes the column names syntactic so they can be used in a formula later.
# The response is named when it is appended, so no hard-coded column index is
# needed to label it.
m <- as.matrix(data.frame(m, fuel_economy_combined = tmp[, 1]))

# ---- Single 75/25 holdout fit ----
set.seed(123)
n_obs <- nrow(cars_19)
# floor() spells out the truncation that sample() otherwise applies silently
# to a fractional size.
indices <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
train <- m[indices, ]
test <- m[-indices, ]

# Predictors are every column of m except the intercept (first column) and
# the response, selected by name rather than by fixed position.
n <- colnames(m)[-1]
predictors <- n[!n %in% "fuel_economy_combined"]
f <- as.formula(
  paste("fuel_economy_combined ~", paste(predictors, collapse = " + "))
)

# Two hidden layers (7 and 3 neurons) with a linear output unit (regression).
m1_nn <- neuralnet(
  f,
  data = train,
  hidden = c(7, 3),
  linear.output = TRUE
)

pred_nn <- predict(m1_nn, test)

# Undo the min-max scaling so errors are reported in the original units of
# fuel_economy_combined.
y_min <- min(cars_19$fuel_economy_combined)
y_range <- max(cars_19$fuel_economy_combined) - y_min
yhat <- pred_nn * y_range + y_min
y <- test[, "fuel_economy_combined"] * y_range + y_min

# RMSE, R-squared and MAE on the held-out rows.
postResample(yhat, y)

##################################
# Repeated random holdout (Monte Carlo cross-validation): 20 independent
# 75/25 splits. This is not k-fold CV -- each split is drawn afresh, so test
# sets can overlap between repetitions.

set.seed(123)
n_reps <- 20
n_obs <- nrow(cars_19)

# Constants for undoing the min-max scaling; invariant across repetitions.
y_min <- min(cars_19$fuel_economy_combined)
y_range <- max(cars_19$fuel_economy_combined) - y_min

# One slot per repetition; bound into a matrix after the loop instead of
# growing a matrix with rbind() on every pass.
stats_list <- vector("list", n_reps)

for (i in seq_len(n_reps)) {
  indices <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
  train_tmp <- m[indices, ]
  test_tmp <- m[-indices, ]

  # Same architecture and formula `f` as the single-split model.
  nn_tmp <- neuralnet(
    f,
    data = train_tmp,
    hidden = c(7, 3),
    linear.output = TRUE
  )

  pred_nn_tmp <- predict(nn_tmp, test_tmp)

  # Back-transform predictions and observations to the original units.
  yhat <- pred_nn_tmp * y_range + y_min
  y <- test_tmp[, "fuel_economy_combined"] * y_range + y_min
  stats_tmp <- postResample(yhat, y)
  stats_list[[i]] <- stats_tmp
  cat(i, "\n") # progress indicator
}

# One row per repetition; column names come from postResample().
stats <- do.call(rbind, stats_list)

# Column 1 of `stats` holds each split's RMSE. Square it to get per-split MSE,
# average those, then take the root for an overall RMSE.
mse_by_split <- stats[, 1]^2
mean(mse_by_split)        # avg mse  4.261991
mean(mse_by_split)^0.5    # avg rmse 2.064459

# Per-metric averages across splits. The RMSE entry here is a mean of roots,
# so use the value computed above instead.
colMeans(stats)
# RMSE Rsquared      MAE
# xxx  0.880502 1.466458

# Plot the network fitted in the last repetition.
plot(nn_tmp, rep = "best")
	#cars_19 data set
	#neural network with 2 hidden layers (7 neurons and 3 neurons)

	#raw data
	#https://www.fueleconomy.gov/feg/epadata/19data.zip

	library(neuralnet)
	library(caret)

	#load("~/R_Cars_19/Data/cars_19.Rdata")
	# NOTE(review): from here on the file repeats the script above (same
	# statements, tab-indented). Re-running it recomputes the same objects;
	# consider removing the duplicate.

	# ---- Data preparation ----
	# Label for this model; it is not referenced by the code shown here.
	title <- "Neural Network"

	# Positions in cars_19 of the columns that get min-max scaled. Column 1 is
	# later used as the response (fuel_economy_combined). The other columns
	# (4, 6, 7, 9:12) are carried over unscaled -- presumably categorical;
	# confirm against the data set.
	num_cols <- c(1:3, 5, 8)
	maxs <- vapply(cars_19[, num_cols], max, numeric(1))
	mins <- vapply(cars_19[, num_cols], min, numeric(1))

	# Min-max scaling: (x - min) / (max - min) for every selected column.
	scaled <- as.data.frame(
	  scale(cars_19[, num_cols], center = mins, scale = maxs - mins)
	)
	tmp <- data.frame(scaled, cars_19[, c(4, 6, 7, 9:12)])

	# Build the design matrix of all predictors (model.matrix adds an intercept
	# column and expands any factors into indicator columns).
	n <- names(cars_19)
	predictors <- n[!n %in% "fuel_economy_combined"]
	f <- as.formula(
	  paste("fuel_economy_combined ~", paste(predictors, collapse = " + "))
	)
	m <- model.matrix(f, data = tmp)

	# Append the scaled response as the last column. Going through data.frame()
	# makes the column names syntactic so they can be used in a formula later.
	# The response is named when it is appended, so no hard-coded column index
	# is needed to label it.
	m <- as.matrix(data.frame(m, fuel_economy_combined = tmp[, 1]))

	# ---- Single 75/25 holdout fit ----
	set.seed(123)
	n_obs <- nrow(cars_19)
	# floor() spells out the truncation that sample() otherwise applies silently
	# to a fractional size.
	indices <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
	train <- m[indices, ]
	test <- m[-indices, ]

	# Predictors are every column of m except the intercept (first column) and
	# the response, selected by name rather than by fixed position.
	n <- colnames(m)[-1]
	predictors <- n[!n %in% "fuel_economy_combined"]
	f <- as.formula(
	  paste("fuel_economy_combined ~", paste(predictors, collapse = " + "))
	)

	# Two hidden layers (7 and 3 neurons) with a linear output unit (regression).
	m1_nn <- neuralnet(
	  f,
	  data = train,
	  hidden = c(7, 3),
	  linear.output = TRUE
	)

	pred_nn <- predict(m1_nn, test)

	# Undo the min-max scaling so errors are reported in the original units of
	# fuel_economy_combined.
	y_min <- min(cars_19$fuel_economy_combined)
	y_range <- max(cars_19$fuel_economy_combined) - y_min
	yhat <- pred_nn * y_range + y_min
	y <- test[, "fuel_economy_combined"] * y_range + y_min

	# RMSE, R-squared and MAE on the held-out rows.
	postResample(yhat, y)

	##################################
	# Repeated random holdout (Monte Carlo cross-validation): 20 independent
	# 75/25 splits. This is not k-fold CV -- each split is drawn afresh, so test
	# sets can overlap between repetitions.

	set.seed(123)
	n_reps <- 20
	n_obs <- nrow(cars_19)

	# Constants for undoing the min-max scaling; invariant across repetitions.
	y_min <- min(cars_19$fuel_economy_combined)
	y_range <- max(cars_19$fuel_economy_combined) - y_min

	# One slot per repetition; bound into a matrix after the loop instead of
	# growing a matrix with rbind() on every pass.
	stats_list <- vector("list", n_reps)

	for (i in seq_len(n_reps)) {
	  indices <- sample(seq_len(n_obs), size = floor(0.75 * n_obs))
	  train_tmp <- m[indices, ]
	  test_tmp <- m[-indices, ]

	  # Same architecture and formula `f` as the single-split model.
	  nn_tmp <- neuralnet(
	    f,
	    data = train_tmp,
	    hidden = c(7, 3),
	    linear.output = TRUE
	  )

	  pred_nn_tmp <- predict(nn_tmp, test_tmp)

	  # Back-transform predictions and observations to the original units.
	  yhat <- pred_nn_tmp * y_range + y_min
	  y <- test_tmp[, "fuel_economy_combined"] * y_range + y_min
	  stats_tmp <- postResample(yhat, y)
	  stats_list[[i]] <- stats_tmp
	  cat(i, "\n") # progress indicator
	}

	# One row per repetition; column names come from postResample().
	stats <- do.call(rbind, stats_list)

	# Column 1 of `stats` holds each split's RMSE. Square it to get per-split
	# MSE, average those, then take the root for an overall RMSE.
	mse_by_split <- stats[, 1]^2
	mean(mse_by_split)        # avg mse 4.261991
	mean(mse_by_split)^0.5    # avg rmse 2.064459

	# Per-metric averages across splits. The RMSE entry here is a mean of roots,
	# so use the value computed above instead.
	colMeans(stats)
	# RMSE Rsquared MAE
	# xxx 0.880502 1.466458

	# Plot the network fitted in the last repetition.
	plot(nn_tmp, rep = "best")