MLSP Kaggle competition: 2nd place solution
# MLSP Kaggle competition
# 2nd place solution
# Author: Alexander V. Lebedev

# I. PREPARATORY STEP:

# Ia. Load the libraries:
library(caret)
library(e1071)
library(kernlab)
library(randomForest)  # needed for randomForest() and combine() used below
library(doMC)
library(foreach)

# Ib. Read the data:
trFC <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_FNC.csv')
trSBM <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_SBM.csv')
tr <- merge(trFC, trSBM, by='Id')

tstFC <- read.csv('/YOUR-PATH/Kaggle/SCH/Test/test_FNC.csv')
tstSBM <- read.csv('/YOUR-PATH/Kaggle/SCH/Test/test_SBM.csv')
tst <- merge(tstFC, tstSBM, by='Id')

y <- read.csv('/YOUR-PATH/Kaggle/SCH/Train/train_labels.csv')
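# Optional sanity check (not in the original gist): the later steps assume the
# merged test set exposes the same feature columns as the training set.
stopifnot(all(setdiff(colnames(tr), 'Id') %in% colnames(tst)))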
# II. ANALYSIS

# IIa. "Feature trimming"

# Registering 6 cores to speed up my computations:
registerDoMC(cores=6)

# Just converting a y-label vector into appropriate format:
y <- as.factor(paste('X.', y[,2], sep = ''))
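# Quick look at the class balance (optional check, not in the original gist):
print(table(y))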
# This step is important (I introduce a random vector into my feature set):
all <- cbind(tr, rnorm(1:dim(tr)[1]))
colnames(all)[412] <- 'rand'

# Now I train Random Forest with this (full) feature set:
rf.mod <- foreach(ntree=rep(2500, 6), .combine=combine, .multicombine=TRUE,
                  .packages='randomForest') %dopar% {
  randomForest(all[,2:412], y, ntree=ntree)
}
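# Optional check (assumption, not from the original gist): the combined forest
# should hold 6 x 2500 = 15000 trees. Note that randomForest::combine() drops
# the pooled OOB error estimates, so rf.mod$err.rate is not available here.
print(rf.mod$ntree)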
# Now, I am looking at the feature importances:
imp <- as.data.frame(rf.mod$importance[order(rf.mod$importance),])

# Everything below the importance of my "dummy" feature (random vector) can likely be ignored:
imp <- subset(imp, imp>imp['rand',])
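# A minimal visual check (not part of the original solution): plot the ranked
# importances and mark where the random dummy feature falls.
imp.all <- sort(rf.mod$importance[, 1], decreasing = TRUE)
plot(imp.all, type = 'h', xlab = 'Feature rank', ylab = 'Mean decrease in Gini')
abline(h = rf.mod$importance['rand', 1], col = 'red', lty = 2)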
# Saving the data in one rda-file for further analyses:
save('all', 'y', 'tst', 'imp', file = '/YOUR-PATH/Kaggle/SCH/Train/AllData.rda')

# Now, I reduce my feature set:
dat <- all[,rownames(imp)]
# IIb. Training the final model:

# I usually start from SVM and then proceed with ensemble methods
# (in this competition, the use of boosted trees did not result in superior performance, so I stopped there).
# I would have tried other algorithms and more sophisticated feature selection approaches
# (like SCAD-SVM and recursive feature elimination; see the sketch below), but I was not impressed with my intermediate results and gave up =)
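# A rough sketch of recursive feature elimination with caret (this was NOT part
# of the submitted pipeline; the fold count and subset sizes below are assumptions):
rfeCtrl <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
rfeFit <- rfe(all[, 2:411], y, sizes = c(10, 25, 50, 100), rfeControl = rfeCtrl)
print(rfeFit)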
# So, here is the model that secured the 2nd place:

# First, I estimate "sigma" (the inverse kernel width parameter for the RBF-SVM).
# Of note, sometimes I use a subset of my data, but here I used the whole training set due to its very limited size:
sigDist <- sigest(y ~ as.matrix(dat), data=dat, frac = 1)

# Creating a tune grid for further C-parameter selection:
svmTuneGrid <- data.frame(.sigma = sigDist[1], .C = 2^(-20:100))
# And... training the final RBF-SVM model with leave-one-out cross-validation
# (yes, it's as simple as that!):
svmFit <- train(dat, y,
                method = "svmRadial",
                preProc = c("center", "scale"),
                tuneGrid = svmTuneGrid,
                trControl = trainControl(method = "cv", number = 86, classProbs = TRUE))
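# Optional inspection (not in the original submission flow): the selected cost
# parameter and the best cross-validated accuracy.
print(svmFit$bestTune)
print(max(svmFit$results$Accuracy, na.rm = TRUE))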
# III. FINAL STEP

# Making predictions:
ttst <- tst[,rownames(imp)]
predTst <- predict(svmFit, ttst, type='prob')
predTst <- predTst[,2]
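# Note (not in the original gist): the probability columns are named after the
# factor levels, so column 2 is 'X.1' (patients); selecting it by name,
# predTst[, 'X.1'], would not depend on the column order.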
# Formatting submission:
pred <- cbind(as.integer(tst$Id), as.numeric(predTst))
colnames(pred) <- c('Id', 'Probability')
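# Quick sanity check before writing (optional, not in the original gist):
# one probability per test subject, all within [0, 1].
stopifnot(nrow(pred) == nrow(tst),
          all(pred[, 'Probability'] >= 0 & pred[, 'Probability'] <= 1))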
# Writing:
write.table(pred, file = '/YOUR-PATH/Kaggle/SCH/submissions/submission_rbfSVM_RFtrimmed.csv', sep=',', quote=F, row.names=F, fileEncoding = 'UTF-16LE')
For more details, see: https://github.com/alex-lebedev/Kaggle-MLSP-2014