Skip to content

Instantly share code, notes, and snippets.

@gowrishankarin
Last active February 4, 2017 17:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gowrishankarin/ad3889a381bed18d5a6be427c962a06c to your computer and use it in GitHub Desktop.
Save gowrishankarin/ad3889a381bed18d5a6be427c962a06c to your computer and use it in GitHub Desktop.
# https://www.kaggle.com/c/grupo-bimbo-inventory-demand/discussion/23863#136641
library(data.table)
library(xgboost)
# Load only the columns used below from the raw train and test CSV files.
train <- fread(
  "_data/train.csv",
  select = c(
    "Semana", "Cliente_ID", "Producto_ID",
    "Agencia_ID", "Ruta_SAK", "Demanda_uni_equil"
  )
)
test <- fread(
  "_data/test.csv",
  select = c(
    "Semana", "id", "Cliente_ID", "Producto_ID",
    "Agencia_ID", "Ruta_SAK"
  )
)

# Extra bindings to the tables exactly as loaded; nothing later in this
# script reads them.
trainPersist <- train
testPersist <- test

# Keep only training rows from weeks after week 7.
train <- train[Semana > 7, ]

# Give train the same columns as test: a dummy id, the demand column moved
# into `target`, and tst = 0 marking the row as training data.
train$id <- 0
train[, target := Demanda_uni_equil]
train[, Demanda_uni_equil := NULL]
train[, tst := 0]

# Test rows get a placeholder target of zero and tst = 1.
test$target <- 0
test[, tst := 1]

# Stack train and test into one table, then drop the separate tables.
data <- rbind(train, test)
rm(test)
rm(train)
# Optional lag features, currently disabled by the `if (FALSE)` guard.
# When enabled, each step shifts Semana forward by k weeks (k = 2..5) and
# merges the mean target per (Semana, Cliente_ID, Producto_ID) back onto
# `data` as target12 .. target15.
if (FALSE) {
  data1 <- data[, .(Semana = Semana + 2, Cliente_ID, Producto_ID, target)]
  data <- merge(
    data,
    data1[Semana > 8, .(target12 = mean(target)),
          by = .(Semana, Cliente_ID, Producto_ID)],
    all.x = TRUE, by = c("Semana", "Cliente_ID", "Producto_ID")
  )
  data1 <- data[, .(Semana = Semana + 3, Cliente_ID, Producto_ID, target)]
  data <- merge(
    data,
    data1[Semana > 8, .(target13 = mean(target)),
          by = .(Semana, Cliente_ID, Producto_ID)],
    all.x = TRUE, by = c("Semana", "Cliente_ID", "Producto_ID")
  )
  data1 <- data[, .(Semana = Semana + 4, Cliente_ID, Producto_ID, target)]
  data <- merge(
    data,
    data1[Semana > 8, .(target14 = mean(target)),
          by = .(Semana, Cliente_ID, Producto_ID)],
    all.x = TRUE, by = c("Semana", "Cliente_ID", "Producto_ID")
  )
  data1 <- data[, .(Semana = Semana + 5, Cliente_ID, Producto_ID, target)]
  data <- merge(
    data,
    data1[Semana > 8, .(target15 = mean(target)),
          by = .(Semana, Cliente_ID, Producto_ID)],
    all.x = TRUE, by = c("Semana", "Cliente_ID", "Producto_ID")
  )
  # data1 only exists when this branch runs, so it is removed here; an
  # unconditional rm(data1) warns that the object is not found.
  rm(data1)
}

# Keep only rows from weeks after week 8.
data <- data[Semana > 8, ]
# Frequency features: for each id column, count the rows per (id, Semana)
# and then average those weekly counts per id.
nAgencia_ID <- data[, .(nAgencia_ID = .N), by = .(Agencia_ID, Semana)][
  , .(nAgencia_ID = mean(nAgencia_ID, na.rm = TRUE)), by = Agencia_ID
]
nRuta_SAK <- data[, .(nRuta_SAK = .N), by = .(Ruta_SAK, Semana)][
  , .(nRuta_SAK = mean(nRuta_SAK, na.rm = TRUE)), by = Ruta_SAK
]
nCliente_ID <- data[, .(nCliente_ID = .N), by = .(Cliente_ID, Semana)][
  , .(nCliente_ID = mean(nCliente_ID, na.rm = TRUE)), by = Cliente_ID
]
nProducto_ID <- data[, .(nProducto_ID = .N), by = .(Producto_ID, Semana)][
  , .(nProducto_ID = mean(nProducto_ID, na.rm = TRUE)), by = Producto_ID
]

# Attach each frequency feature to the rows (left joins on the id column).
data <- merge(data, nAgencia_ID, by = "Agencia_ID", all.x = TRUE)
data <- merge(data, nRuta_SAK, by = "Ruta_SAK", all.x = TRUE)
data <- merge(data, nCliente_ID, by = "Cliente_ID", all.x = TRUE)
data <- merge(data, nProducto_ID, by = "Producto_ID", all.x = TRUE)

# Model the target on a log(x + 1) scale.
data[, target := log(target + 1)]

# Split back into train and test rows using the tst marker.
data_train <- data[tst == 0, ]
data_test <- data[tst == 1, ]

# Every column except the row id, the label and the train/test marker is a
# model feature.
features <- setdiff(names(data_train), c("id", "target", "tst"))
rm(data)
# Hold out 30000 randomly chosen training rows as a validation set.
# sample() is not seeded here, so the split differs between runs.
wltst <- sample(nrow(data_train), 30000)
dval <- xgb.DMatrix(
  data = data.matrix(data_train[wltst, features, with = FALSE]),
  label = data.matrix(data_train[wltst, target]),
  missing = NA
)
watchlist <- list(dval = dval)

# Train a gradient-boosted tree regressor on the remaining rows, evaluating
# RMSE on the held-out set and stopping early after 10 rounds without
# improvement.
clf <- xgb.train(
  params = list(
    objective = "reg:linear",
    booster = "gbtree",
    eta = 0.1,
    max_depth = 10,
    subsample = 0.85,
    colsample_bytree = 0.7
  ),
  data = xgb.DMatrix(
    data = data.matrix(data_train[-wltst, features, with = FALSE]),
    label = data.matrix(data_train[-wltst, target]),
    missing = NA
  ),
  nrounds = 75,
  # Was misspelled `verbost`, which is not an xgb.train argument and was
  # forwarded through `...` instead of setting the verbosity.
  verbose = 1,
  print_every_n = 5,
  early_stopping_rounds = 10,
  watchlist = watchlist,
  maximize = FALSE,
  eval_metric = "rmse"
)
# Predict the test rows of week 10.
data_test1 <- data_test[Semana == 10, ]
pred <- predict(
  clf,
  xgb.DMatrix(
    data.matrix(data_test1[, features, with = FALSE]),
    missing = NA
  )
)

# Map predictions back from the log(x + 1) scale.
res <- exp(round(pred, 5)) - 1

# Mean predicted demand per (Cliente_ID, Producto_ID), kept as `targetl1`
# for the week-11 step.
data_test_lag1 <- data_test1[, .(Cliente_ID, Producto_ID, targetl1 = res)][
  , .(targetl1 = mean(targetl1)), by = .(Cliente_ID, Producto_ID)
]

# Start the submission table with the week-10 predictions.
results <- data.frame(id = data_test1$id, Demanda_uni_equil = res)
#-------
# Predict the test rows of week 11.
data_test2 <- data_test[Semana == 11, ]

# Drop a stale targetl1 column only if one is present; deleting a column
# that does not exist makes data.table emit a warning.
if ("targetl1" %in% names(data_test2)) {
  data_test2[, targetl1 := NULL]
}

# Merge lagged values of the target variable (week-10 predictions) into the
# test set for the 11th week.
# NOTE(review): targetl1 only influences the prediction if `features`
# contains it; `features` is derived from the training columns earlier in
# the script -- confirm this is intended.
data_test2 <- merge(
  data_test2, data_test_lag1,
  all.x = TRUE, by = c("Cliente_ID", "Producto_ID")
)
pred <- predict(
  clf,
  xgb.DMatrix(
    data.matrix(data_test2[, features, with = FALSE]),
    missing = NA
  )
)

# Map predictions back from the log(x + 1) scale and append them to the
# week-10 results.
res <- exp(round(pred, 5)) - 1
res.df <- data.frame(id = data_test2$id, Demanda_uni_equil = res)
results <- rbind(results, res.df)
# Post-process the predictions (column 2): clamp negatives to zero and round
# to one decimal place.
results[results[, 2] < 0, 2] <- 0
results[, 2] <- round(results[, 2], 1)

# Store the ids (column 1) as integers and tag the column with the class
# name "int32".
# NOTE(review): presumably this affects how write.csv formats the ids --
# confirm.
results[, 1] <- as.integer(results[, 1])
class(results[, 1]) <- "int32"

# Raise the global `digits` option, then write the submission file without
# row names.
options(digits = 18)
write.csv(results, file = "results1.csv", row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment