Last active
January 10, 2017 14:48
-
-
Save RottenFruits/5c65606a7b1d82a55ec6fec062ca3ab0 to your computer and use it in GitHub Desktop.
「RでStacking ー第二回:実践編ー」の記事用です。http://qiita.com/Rotten_Fruits/items/694aa533c6c5d0aa0351
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###関数定義-------------------------------------------------- | |
#リストの中から正解率を取り出し平均値を算出、予測値を取り出す | |
procAccPredL <- function(acc_and_pred_list){ | |
acc <- mean(sapply(acc_and_pred_list[seq(1, length(acc_and_pred_list), 2)], sum)) | |
predL <- rbindlist(acc_and_pred_list[seq(2, length(acc_and_pred_list), 2)]) | |
predL <- predL[order(as.integer(predL$rowNo)), ] | |
return(list(acc, predL)) | |
} | |
#meta featureの作成 | |
#設定されたアルゴリズムとパラメータでクロスバリデーションした結果を返す | |
calcAccCreMetaFeature <- function(formula, train_df, numberOfFolds, argorithm, params){ | |
features <- all.vars(formula)[-1] | |
target <- all.vars(formula)[1] | |
if(features == "."){ | |
features_name <- names(train_df) | |
features_name <- features_name[features_name != "FoldID"] | |
features_name <- features_name[features_name != "rowNo"] | |
features <- features_name[features_name != target] | |
} | |
train_df$rowNo <- row.names(train_df) | |
if(numberOfFolds == 0){ | |
return() | |
}else{ | |
# 学習用と予測用にデータを分割 | |
testFold <- train_df[J(FoldID = numberOfFolds), on = "FoldID"] | |
trainFolds <- train_df[!J(FoldID = numberOfFolds), on = "FoldID"] # Exclude fold i from trainFolds | |
testFold$Pred <- selectModelTrain(features, target, trainFolds, testFold, argorithm, params) | |
predList <- list(testFold[, list(rowNo, FoldID, Pred)]) | |
# 正解率算出 | |
score <- mean(testFold$Pred == c(testFold[, target, with = FALSE])[[1]]) | |
return(procAccPredL(c(score, predList, Recall(formula, train_df, numberOfFolds - 1, argorithm, params)))) | |
} | |
} | |
#モデルの選択と学習 | |
selectModelTrain <- function(features, target, train_df, test_df, argorithm, params){ | |
if(argorithm == "knn"){ | |
Pred <- knn(train = train_df[, features, with = FALSE], test = test_df[, features, with = FALSE], cl = c(train_df[, target, with = FALSE])[[1]], k = params$k) | |
}else if(argorithm == "svm"){ | |
svm <- LiblineaR(data = train_df[, features, with = FALSE], target = c(train_df[, target, with = FALSE])[[1]], type = params$type, cost = params$cost) | |
Pred <- predict(svm, test_df[, features, with = FALSE])$predictions | |
}else if(argorithm == "rf"){ | |
rf <- randomForest(x = train_df[, features, with = FALSE], y = c(train_df[, target, with = FALSE])[[1]], data = train_df, ntree = 500, mtry = params$mtry) | |
Pred <- predict(rf, test_df) | |
} | |
return(Pred) | |
} | |
#テストデータ用のmeta data 作成 | |
calcAccCreMetaFeatureForTestData <- function(formula, test_df, train_df, argorithm, params){ | |
features <- all.vars(formula)[-1] | |
target <- all.vars(formula)[1] | |
if(features == "."){ | |
features_name <- names(test_df) | |
features <- features_name[features_name != target] | |
} | |
Pred <- selectModelTrain(features, target, train_df, test_df, argorithm, params) | |
# 正解率算出 | |
score <- mean(Pred == c(test_df[, target, with = FALSE])[[1]]) | |
return(list(score, Pred)) | |
} | |
#パラメータリストを与えて、正解率やmeta featureを返す | |
gridCalcAccMeta <- function(formula, train_df, numberOfFolds, argorithm, param_list){ | |
return_list <- list() | |
for(i in seq_len(nrow(param_list[["ParamGrid"]]))){ | |
res <- calcAccCreMetaFeature(formula, train_df, numberOfFolds, argorithm, param_list[["ParamGrid"]][i]) | |
print(paste("Params:", paste(paste(colnames(param_list[["ParamGrid"]][i]), param_list[["ParamGrid"]][i], collapse = " | "), | |
"|", "Score:", res[[1]]))) | |
#結果をリストに格納 | |
return_list[["Params"]][i] <- list(param_list[["ParamGrid"]][i]) | |
return_list[["Scores"]][i] <- res[[1]] | |
return_list[["MetaFeature"]][i] <- list(res[[2]]$Pred) | |
} | |
return(return_list) | |
} | |
#ベストなパラメータを取得 | |
getBestParas <- function(res_list){ | |
ret_list <- list() | |
best <- which.max(res_list[["Scores"]]) | |
ret_list[["BestParams"]] <- res_list[["Params"]][best] | |
ret_list[["BestScores"]] <- res_list[["Scores"]][best] | |
ret_list[["BestMeta"]] <- res_list[["MetaFeature"]][best] | |
return(ret_list) | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###ライブラリ読み込み-------------------------------------------------- | |
#前処理用 | |
library(data.table) | |
library(mltools) | |
library(dplyr) | |
#アルゴリズム用 | |
library(class) #k近傍法 | |
library(LiblineaR) #サポートベクターマシン | |
library(randomForest) #ランダムフォレスト | |
#データ用 | |
library(kernlab) #spamデータ用 | |
data(spam) | |
#関数読み込み | |
source("stacking_in_r_function.R") | |
###データを読み込んで学習データと検証データに分ける-------- | |
spam <- data.table(spam) | |
spam[, FoldID := folds(type, nfolds = 10, stratified = TRUE, seed = 123)] #stratified = TRUEで正例が各Foldで均等になるように分割 | |
spam_train <- spam %>% filter(FoldID <= 3) | |
spam_test <- spam %>% filter(FoldID > 3) | |
spam_train$FoldID <- NULL | |
spam_test$FoldID <- NULL | |
#学習データにmeta faeture------------------------------------------------------------ | |
#クロスバリデーション設定 | |
spam_train[, FoldID := folds(type, nfolds = 5, stratified = TRUE, seed = 123)] | |
numberOfFolds <- max(spam_train[, "FoldID"]) | |
#k近傍法 | |
knnCV <- list() | |
knnCV[["ParamGrid"]] <- CJ(k=seq(1, 30)) | |
res_knn <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "knn", knnCV) | |
knn_train_pred <- getBestParas(res_knn) | |
#svm | |
svmCV <- list() | |
svmCV[["ParamGrid"]] <- CJ(type = 1:5, cost = c(.01, .1, 1, 10, 100, 1000, 2000)) | |
res_svm <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "svm", svmCV) | |
svm_train_pred <- getBestParas(res_svm) | |
#radom forest | |
set.seed(123) | |
rfCV <- list() | |
rfCV[["ParamGrid"]] <- CJ(mtry = seq(15, 20)) | |
res_rf <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "rf", rfCV) | |
rf_train_pred <- getBestParas(res_rf) | |
#meta featureの結合 | |
spam_train$meta_knn <- knn_train_pred[["BestMeta"]][[1]] | |
spam_train$meta_svm <- svm_train_pred[["BestMeta"]][[1]] | |
spam_train$meta_rf <- rf_train_pred[["BestMeta"]][[1]] | |
spam_train$FoldID <- NULL | |
#検証モデルにmeta feature---------------------------------------------- | |
# knn | |
knn_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "knn", knn_train_pred[["BestParams"]][[1]]) | |
# svm | |
svm_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "svm", svm_train_pred[["BestParams"]][[1]]) | |
# rf | |
set.seed(123) | |
rf_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "rf", rf_train_pred[["BestParams"]][[1]]) | |
#meta feature結合 | |
spam_test$meta_knn <- knn_test_pred[[2]] | |
spam_test$meta_svm <- svm_test_pred[[2]] | |
spam_test$meta_rf <- rf_test_pred[[2]] | |
#検証データ予測--------------------------------------------- | |
set.seed(123) | |
stacking_model <- randomForest(type ~ ., data = spam_train) | |
pred_stacking <- predict(stacking_model, spam_test) | |
#正解率 | |
mean(spam_test$type == knn_test_pred[[2]]) | |
mean(spam_test$type == svm_test_pred[[2]]) | |
mean(spam_test$type == rf_test_pred[[2]]) | |
mean(spam_test$type == pred_stacking) | |
#混同行列 | |
table(spam_test$type, knn_test_pred[[2]]) | |
table(spam_test$type, svm_test_pred[[2]]) | |
table(spam_test$type, rf_test_pred[[2]]) | |
table(spam_test$type, pred_stacking) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment