Skip to content

Instantly share code, notes, and snippets.

@phileas-condemine
Created February 3, 2020 14:31
Show Gist options
  • Save phileas-condemine/be25c4a88ff65e2ecb74a2a90c9382d4 to your computer and use it in GitHub Desktop.
Save phileas-condemine/be25c4a88ff65e2ecb74a2a90c9382d4 to your computer and use it in GitHub Desktop.
Benchmark glmnet svr xgboost sur un problème de régression
#------------------
# Data Preparation
#------------------
library(data.table)
library(ggplot2)
library(plotly)
#Read datasets
#Download the data from http://www.saedsayad.com/datasets/BikeRental.zip
train <- read.csv("data/BikeRental/bike_rental_train.csv")
test <- read.csv("data/BikeRental/bike_rental_test.csv")
#Rows and Cols
dim(train)
dim(test)
#Columns name
colnames(train)
colnames(test)
#Show
head(train)
head(test)
#Rows and Cols
dim(train)
dim(test)
#Columns name
colnames(train)
colnames(test)
#Show
head(train)
head(test)
#Scatter plot
pairs(~temp+humidity+windspeed+bike_rent_count, data=head(train,100), main="Scatterplot - train", col="darkgreen")
pairs(~temp+humidity+windspeed+bike_rent_count, data=head(test,100), main="Scatterplot - test", col="brown")
#--------------------------------------
# GLM
#--------------------------------------
library(glmnet)
library(dplyr)
benchmark_glmnet = cv.glmnet(train%>%select(-bike_rent_count)%>%as.matrix(),train$bike_rent_count)
pred.benchmark = predict(benchmark_glmnet,test%>%select(-bike_rent_count)%>%as.matrix())
pred.benchmark = pred.benchmark[,1]
err.benchmark <- test$bike_rent_count - pred.benchmark
rmse.benchmark <- sqrt(mean((err.benchmark^2)))
#--------------------------------------
# Support Vector Machines - Regression
#--------------------------------------
library(e1071)
#Train
model.SVR <- svm(bike_rent_count~., train)
summary(model.SVR)
#Residual plot
res.SVR = train$bike_rent_count-predict(model.SVR, newdata=train)
plot(train$temp, res.SVR, ylab="Residuals", xlab="Temperature", main="Residual Plot")
abline(0, 0)
#Q-Q plot
stdres.SVR = scale(res.SVR)
qqnorm(stdres.SVR, ylab="Standardized Residuals", xlab="Normal Scores", main="QQ Plot")
qqline(stdres.SVR)
#Test
pred.SVR <- predict(model.SVR, newdata=test)
err.SVR <- test$bike_rent_count - pred.SVR
rmse.SVR <- sqrt(mean((err.SVR^2)))
rmse.SVR
rmse.benchmark
#Errors histogram
hist(train$bike_rent_count, main="bike_rent_count", sub="values", xlab="Number", breaks=10, col="darkred")
hist(err.SVR, main="bike_rent_count", sub="(Actual-Predicted)", xlab="Error", breaks=10, col="darkred")
#--------------------------------------
# Comparaison divers noyaux et params
#--------------------------------------
run_svm = function(kernel, degree=3,gamma=1){
model.SVR <- svm(bike_rent_count~., train,kernel=kernel,degree=degree,gamma=gamma)
pred.SVR <- predict(model.SVR, newdata=test)
err.SVR <- test$bike_rent_count - pred.SVR
sqrt(mean((err.SVR^2)))
}
run_svm('radial')
# 154
vals = 1:10
fine_tune_rbf = pbapply::pblapply(vals,function(x){
gamma = 5^(x-5)
kernel = 'radial'
time <- system.time(perf <- run_svm(kernel,gamma = gamma))
res = data.table(time = time[3], error = perf, gamma = gamma, kernel = kernel)
print(res)
res
})
fine_tune_rbf = rbindlist(fine_tune_rbf)
fine_tune_rbf$id = vals
g <- ggplot(fine_tune_rbf,aes(x=gamma^2,y=time,fill=error))+xlab("gamma")+geom_col()+scale_x_continuous(trans="log10")
ggplotly(g)
run_svm('linear')
#160
run_svm('polynomial',2)
run_svm('polynomial',3)
run_svm('polynomial',4)
run_svm('polynomial',5)
#--------------------------------------
# Support Vector Machines - Regression
#--------------------------------------
library(gbm)
# On remarquera qu'on peut utiliser le paramètre "var.monotone"
gbm = gbm(bike_rent_count ~ .,
# var.monotone=c(1,-1,1),
data=train,shrinkage =.01,
n.trees = 500,
n.minobsinnode = 10,
interaction.depth = 4,
bag.fraction = .6,
train.fraction = .7,
verbose = T,
distribution = "poisson")
gbm.perf(gbm)
# On peut immédiatement examiner les dépendances partielles
plot(gbm,"temp")
plot(gbm,"humidity")
plot(gbm,"windspeed")
# Faisons un gridsearch sur la profondeur et la taille des feuilles
grid = expand.grid(minobs = c(10,20,50,100,200),
depth = c(1,2,3,4))
perf = pbapply::pbapply(grid,1,function(x){
time = system.time(gbm <- gbm(bike_rent_count ~ .,
var.monotone=c(1,-1,1),cv.folds=2,
data=train,shrinkage =.01,
n.trees = 1000,
n.minobsinnode = x['minobs'],
interaction.depth = x['depth'],
bag.fraction = .6,
train.fraction = .7,verbose = F,
distribution = "poisson"))
best_iter = gbm.perf(gbm, method = "cv")
pred.gbm = predict(gbm,newdata = test,type="response")
err.gbm <- test$bike_rent_count - pred.gbm
rmse = sqrt(mean((err.gbm^2)))
data.table(error = rmse, minobs=x['minobs'],depth=x['depth'],time=time[3],best_iter=best_iter)
})
perf = rbindlist(perf)
g <- ggplot(data=perf,aes(x=depth,size=log(minobs),y=error,color=time))+
geom_point()
ggplotly(g)
# Bilan : GBM > SVM > GLMNET
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment