Last active
August 20, 2016 19:12
-
-
Save annecool37/60cf33ebcb732e603cbb9ecea66c7a1d to your computer and use it in GitHub Desktop.
meetup_analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ~~~~~~~~~~~~~ ##
## Random Forest ##
## ~~~~~~~~~~~~~ ##
# Fits a random forest regression of participant_count on every other column
# of rf_model_df, then reports the mean squared error on a held-out split.
# NOTE(review): assumes the randomForest package is attached and rf_model_df
# is created earlier in the file -- confirm.

set.seed(8)

# subset training and testing datasets (8/10 of the rows go to training)
n_obs <- nrow(rf_model_df)
# seq_len() stays correct for a zero-row frame (1:0 would yield c(1, 0));
# floor() makes explicit the truncation sample() would otherwise do silently.
train_idx <- sample(seq_len(n_obs), floor(8 * n_obs / 10))
tree_train <- rf_model_df[train_idx, ]
# setdiff() instead of a negative index: x[-integer(0), ] selects *no* rows,
# so an empty train_idx would otherwise produce an empty test set too.
tree_test <- rf_model_df[setdiff(seq_len(n_obs), train_idx), ]

# find the best mtry
# The response is selected by name so the tuning step always uses the same
# target as the model formula below.
# NOTE(review): the original code used column position 2 for the response;
# presumably that is participant_count -- confirm.
is_response <- names(tree_train) == "participant_count"
tuneRF(
  tree_train[, !is_response, drop = FALSE],
  tree_train[["participant_count"]],
  stepFactor = 1.5
) # note that the default ntreeTry is 50 instead of 500 for tuneRF()
# mtry OOBError
# 3 3 292.5267
# 4 4 283.6540 <- this mtry is picked since it has lowest OOBError
# 6 6 286.6532

set.seed(0)
fit_500 <- randomForest(
  participant_count ~ .,
  data = tree_train,
  importance = TRUE,
  mtry = 4
) # default: ntree = 500

# check which variable is more important in our model
importance(fit_500)
varImpPlot(fit_500, main = "Variable Importance Plot")

# predict the number of participant with our random forest model
prediction <- predict(fit_500, tree_test)
# mean squared error of the predictions on the held-out rows
mean((as.numeric(prediction) - tree_test$participant_count)^2)
# MSE is 225.7062
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment