# annecool37/random_forest.R

## random_forest.R
## ~~~~~~~~~~~~~ ##
## Random Forest ##
## ~~~~~~~~~~~~~ ##
set.seed(8)

# Split rf_model_df into training (80% of rows) and testing (the rest).
n_obs <- nrow(rf_model_df)
# Integer division makes the truncation of a fractional 80% explicit instead
# of relying on sample() to silently truncate a non-integer size.
n_train <- (8 * n_obs) %/% 10
# seq_len() stays correct for a zero-row input, where 1:0 would yield c(1, 0).
train_idx <- sample(seq_len(n_obs), n_train)
tree_train <- rf_model_df[train_idx, ]
tree_test <- rf_model_df[-train_idx, ]

# Search for the best mtry.
# The response is selected by name so the tuning step always uses the same
# response/predictor split as the formula in the fit below, even if the column
# order of rf_model_df changes.
response_col <- "participant_count"
predictor_cols <- setdiff(names(tree_train), response_col)
# Note: tuneRF() defaults to ntreeTry = 50, not the 500 trees used for the fit.
tuneRF(tree_train[, predictor_cols, drop = FALSE], tree_train[[response_col]],
       stepFactor = 1.5)
# Recorded output of the search:
#   mtry OOBError
# 3    3 292.5267
# 4    4 283.6540 <- this mtry is picked since it has the lowest OOBError
# 6    6 286.6532

# Re-seed so the fit is reproducible regardless of how much randomness the
# tuning step above consumed.
set.seed(0)
fit_500 <- randomForest(participant_count ~ ., data = tree_train,
                        importance = TRUE, mtry = 4)  # default: ntree = 500

# Check which variables are most important in the model.
importance(fit_500)
varImpPlot(fit_500, main = "Variable Importance Plot")

# Predict the number of participants for the held-out rows and report the
# mean squared error; the value is kept in test_mse so it can be reused.
prediction <- predict(fit_500, tree_test)
test_mse <- mean((as.numeric(prediction) - tree_test$participant_count)^2)
test_mse
# MSE is 225.7062
	## ~~~~~~~~~~~~~ ##
	## Random Forest ##
	## ~~~~~~~~~~~~~ ##
	set.seed(8)

	# Split rf_model_df into training (80% of rows) and testing (the rest).
	n_obs <- nrow(rf_model_df)
	# Integer division makes the truncation of a fractional 80% explicit instead
	# of relying on sample() to silently truncate a non-integer size.
	n_train <- (8 * n_obs) %/% 10
	# seq_len() stays correct for a zero-row input, where 1:0 would yield c(1, 0).
	train_idx <- sample(seq_len(n_obs), n_train)
	tree_train <- rf_model_df[train_idx, ]
	tree_test <- rf_model_df[-train_idx, ]

	# Search for the best mtry.
	# The response is selected by name so the tuning step always uses the same
	# response/predictor split as the formula in the fit below, even if the
	# column order of rf_model_df changes.
	response_col <- "participant_count"
	predictor_cols <- setdiff(names(tree_train), response_col)
	# Note: tuneRF() defaults to ntreeTry = 50, not the 500 trees used for the fit.
	tuneRF(tree_train[, predictor_cols, drop = FALSE], tree_train[[response_col]],
	       stepFactor = 1.5)
	# Recorded output of the search:
	#   mtry OOBError
	# 3    3 292.5267
	# 4    4 283.6540 <- this mtry is picked since it has the lowest OOBError
	# 6    6 286.6532

	# Re-seed so the fit is reproducible regardless of how much randomness the
	# tuning step above consumed.
	set.seed(0)
	fit_500 <- randomForest(participant_count ~ ., data = tree_train,
	                        importance = TRUE, mtry = 4)  # default: ntree = 500

	# Check which variables are most important in the model.
	importance(fit_500)
	varImpPlot(fit_500, main = "Variable Importance Plot")

	# Predict the number of participants for the held-out rows and report the
	# mean squared error; the value is kept in test_mse so it can be reused.
	prediction <- predict(fit_500, tree_test)
	test_mse <- mean((as.numeric(prediction) - tree_test$participant_count)^2)
	test_mse
	# MSE is 225.7062