Created
November 13, 2018 02:59
-
-
Save estebanz01/541a2694df49fae2c87fd1d0ac3b562d to your computer and use it in GitHub Desktop.
Bike Sharing Dataset - Random Forest basic prediction with validation of variables used at training time
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I got this working with code from https://datascienceplus.com/random-forests-in-r/
# It is an excellent resource!
# Fix the RNG seed so the train/test split (and the forests fitted later)
# are reproducible between runs.
set.seed(50)

# Daily aggregate of the Bike Sharing dataset; the path is relative to the
# current working directory.
bikeperday <- read.csv("Bike-Sharing-Dataset/day.csv")

# Draw 75% of the row indices, without replacement, for the training split.
# The index vector is deliberately not called `sample`, so that it does not
# shadow base::sample().
n_rows <- nrow(bikeperday)
train_idx <- sample.int(n = n_rows,
                        size = floor(0.75 * n_rows),
                        replace = FALSE)

# Rows that were drawn form the training set; all remaining rows form the
# held-out test set.
train <- bikeperday[train_idx, ]
test <- bikeperday[-train_idx, ]
# Let's use random forests to create a basic prediction.
# library() stops with an error if the package is missing, whereas require()
# would only return FALSE and let the script fail later on.
library(randomForest)

# First, we try to decide the number of variables used at train time (`mtry`).
# One forest is fitted per candidate value and two errors are recorded:
#   oob.err  - out-of-bag MSE, estimated on the training data
#   test.err - MSE on the held-out `test` split
n_mtry <- 7L     # candidate mtry values are 1..n_mtry (one per predictor)
n_trees <- 800L  # forest size used while tuning

oob.err <- double(n_mtry)   # Out of Bag error, i.e. errors at train time
test.err <- double(n_mtry)  # Test error

# NOTE(review): `*` in a formula asks for every interaction term as well as
# the main effects. Tree ensembles pick up interactions on their own, so `+`
# is probably what is meant here - confirm before changing it.
# NOTE(review): if `mtry` is picked from `test.err`, the test error of the
# chosen model is no longer an unbiased estimate; prefer `oob.err`.
for (tries in seq_len(n_mtry)) {
  model <- randomForest(
    cnt ~ atemp * hum * windspeed * season * mnth * weathersit * workingday,
    data = train, mtry = tries, ntree = n_trees
  )

  # model$mse[k] is the OOB mean squared error of the forest built from the
  # first k trees, so the last entry is the error of the complete forest.
  oob.err[tries] <- model$mse[n_trees]

  # Mean squared error of the predictions on the held-out rows
  pred <- predict(model, test)
  test.err[tries] <- mean((test$cnt - pred)^2)

  varImpPlot(model)
  cat(paste("Try number", tries), " ")  # Show progress in console
}
# Draw both error curves against the number of predictors tried per split:
# column 1 (red) is the out-of-bag error, column 2 (blue) the test error.
err_colours <- c("red", "blue")
err_curves <- cbind(oob.err, test.err)

matplot(seq_len(nrow(err_curves)), err_curves,
        type = "b",
        pch = 19,
        col = err_colours,
        xlab = "Number of Predictors Considered at each Split",
        ylab = "Mean Squared Error")

# Legend entries are listed in the same order as the columns of `err_curves`.
legend("topright",
       legend = c("Out of Bag Error", "Test Error"),
       col = err_colours,
       pch = 19)
# Based on the tuning results, the final forest tries 3 variables per split.
# NOTE(review): `*` asks for all interaction terms in addition to the main
# effects; `+` is probably what is meant for a random forest - confirm.
rf_formula <- cnt ~ atemp * hum * windspeed * season * mnth *
  weathersit * workingday

# Fit on the training split only, then show the variable importance plot.
final_model <- randomForest(rf_formula, data = train, mtry = 3, ntree = 500)
varImpPlot(final_model)
# Let's do some prediction for a single hypothetical day.
# The continuous predictors must be on the same scale as the training data.
# Per the Readme shipped with the dataset, day.csv stores them normalised:
# atemp = Celsius / 50, hum = percent / 100, windspeed = speed / 67.
# Passing raw Celsius for `atemp` would put the query far outside the range
# the forest was trained on.
# NOTE(review): confirm the divisors against the Readme of the dataset copy
# actually in use.
newdata <- data.frame(atemp = 35.3 / 50, # 35.3 Celsius "feels like", normalised
                      season = 2,        # summer
                      hum = 0.8,         # 80 %
                      mnth = 6,          # June
                      windspeed = 0,     # No wind. Values are x / 67
                      weathersit = 1,    # Clear skies
                      workingday = 1)

# For a regression forest the prediction is the expected value of `cnt`.
pred <- predict(final_model, newdata)
print(paste("Approx Number of rented bikes: ", pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment