Skip to content

Instantly share code, notes, and snippets.

@estebanz01
Created November 13, 2018 02:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save estebanz01/541a2694df49fae2c87fd1d0ac3b562d to your computer and use it in GitHub Desktop.
Save estebanz01/541a2694df49fae2c87fd1d0ac3b562d to your computer and use it in GitHub Desktop.
Bike Sharing Dataset - Random Forest basic prediction with validation of variables used at training time
# I got this working with code from https://datascienceplus.com/random-forests-in-r/
# It is an excellent resource!
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(50)
bikeperday <- read.csv('Bike-Sharing-Dataset/day.csv')

# Split the rows: 75% (rounded down) are drawn without replacement for
# training, the remaining rows are held out for testing.
n_days <- nrow(bikeperday)
train_idx <- sample.int(n = n_days,
                        size = floor(0.75 * n_days),
                        replace = FALSE)
train <- bikeperday[train_idx, ]
# Select the hold-out rows by positive index. Negative indexing with an empty
# index vector (x[-integer(0), ]) returns zero rows instead of all rows, which
# would silently leave the test set empty for very small inputs.
test <- bikeperday[setdiff(seq_len(n_days), train_idx), ]
# Let's use random forests to create a basic prediction.
# library() stops right here if the package is missing; require() would only
# return FALSE and let the script fail later at the first randomForest() call.
library(randomForest)

# First, we try to decide the number of variables tried at each split (mtry).
# For every candidate value we record two mean squared errors:
#   oob.err  - out-of-bag error, i.e. the error estimated at train time
#   test.err - error on the held-out test set
n_trees <- 800
max_mtry <- 7 # one candidate per predictor variable named in the formula
oob.err <- double(max_mtry)  # Out of Bag error, i.e. errors at train time
test.err <- double(max_mtry) # Test error
for (tries in seq_len(max_mtry)) {
  # NOTE(review): randomForest's formula interface is believed to ignore
  # interaction terms, so `*` here likely behaves like `+` - confirm.
  model <- randomForest(cnt ~ atemp * hum * windspeed * season * mnth * weathersit * workingday,
                        data = train, mtry = tries, ntree = n_trees)
  # model$mse has one entry per tree grown so far; the last entry is the
  # error of the complete forest. Taking it with tail() keeps this line
  # correct if n_trees is changed.
  oob.err[tries] <- tail(model$mse, 1)
  pred <- predict(model, test)
  test.err[tries] <- with(test, mean((cnt - pred)^2)) # Mean squared error on the test set
  varImpPlot(model)
  cat("Try number", tries, "\n") # Show progress in console
}
# Draw both error curves against the number of predictors tried per split:
# out-of-bag error in red, test error in blue.
err_colours <- c("red", "blue")
matplot(
  x = seq_len(7),
  y = cbind(oob.err, test.err),
  type = "b",
  pch = 19,
  col = err_colours,
  xlab = "Number of Predictors Considered at each Split",
  ylab = "Mean Squared Error"
)
legend(
  "topright",
  legend = c("Out of Bag Error", "Test Error"),
  pch = 19,
  col = err_colours
)
# Fit the final forest on the training set.
# mtry is fixed at 3, the value picked after inspecting the error curves.
final_model <- randomForest(
  cnt ~ atemp * hum * windspeed * season * mnth * weathersit * workingday,
  data = train, mtry = 3, ntree = 500
)

# Plot the variable importance of the fitted forest.
varImpPlot(final_model)
# Let's do some prediction for a single hypothetical day.
# The new observation must be on the same scale as the training data.
# NOTE(review): per the dataset Readme, atemp is the feeling temperature in
# Celsius divided by 50 (hum is divided by 100, windspeed by 67) - confirm
# against your copy of the data. Passing raw Celsius would put atemp far
# outside the range the forest was trained on.
newdata <- data.frame(atemp = 35.3 / 50, # 35.3 Celsius, normalized
                      season = 2, # summer
                      hum = 0.8, # 80 %
                      mnth = 6, # June
                      windspeed = 0, # No wind. Values up to x/67
                      weathersit = 1, # Clear skies
                      workingday = 1)
pred <- predict(final_model, newdata)
print(paste('Approx Number of rented bikes: ', pred))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment