Skip to content

Instantly share code, notes, and snippets.

@RottenFruits
Last active January 10, 2017 14:48
Show Gist options
  • Save RottenFruits/5c65606a7b1d82a55ec6fec062ca3ab0 to your computer and use it in GitHub Desktop.
「RでStacking ー第二回:実践編ー」の記事用です。http://qiita.com/Rotten_Fruits/items/694aa533c6c5d0aa0351
###Function definitions--------------------------------------------------
#Aggregate an interleaved list of <fold accuracy, fold predictions>:
#average the accuracies and bind the per-fold prediction tables into one
#table restored to the original row order.
#
#acc_and_pred_list: flat list whose odd positions hold scalar accuracy
#                   scores and whose even positions hold data.tables with
#                   columns rowNo / FoldID / Pred.
#Returns list(mean accuracy, predictions ordered by rowNo).
procAccPredL <- function(acc_and_pred_list){
  #Odd elements are scalar scores; unlist() them directly instead of the
  #original sapply(..., sum) round-trip (sum of a scalar is the scalar).
  acc <- mean(unlist(acc_and_pred_list[seq(1, length(acc_and_pred_list), 2)]))
  #Even elements are the per-fold prediction data.tables.
  predL <- rbindlist(acc_and_pred_list[seq(2, length(acc_and_pred_list), 2)])
  #Reorder by the original row number so meta features align with the
  #rows of the training data they were predicted for.
  predL <- predL[order(as.integer(predL$rowNo)), ]
  return(list(acc, predL))
}
#Create the meta feature for the training data.
#Runs cross-validation with the given algorithm and parameters and returns
#list(mean CV accuracy, out-of-fold predictions ordered by row number).
#
#formula:       e.g. type ~ . ; "." means every non-bookkeeping column.
#train_df:      data.table carrying a FoldID column.
#numberOfFolds: highest fold id; folds 1..numberOfFolds are evaluated.
#argorithm:     "knn", "svm" or "rf" (dispatched by selectModelTrain).
#params:        one row of the parameter grid for that algorithm.
calcAccCreMetaFeature <- function(formula, train_df, numberOfFolds, argorithm, params){
  features <- all.vars(formula)[-1]
  target <- all.vars(formula)[1]
  #identical() instead of ==: with explicit predictors `features` has
  #length > 1, and if() requires a scalar condition (error in R >= 4.2).
  if(identical(features, ".")){
    features_name <- names(train_df)
    features_name <- features_name[features_name != "FoldID"]
    features_name <- features_name[features_name != "rowNo"]
    features <- features_name[features_name != target]
  }
  #Remember each row's original position so predictions can be re-aligned.
  train_df$rowNo <- row.names(train_df)
  if(numberOfFolds == 0){
    return()
  }
  #Iterate over the folds instead of recursing.  The original recursive
  #version re-averaged at every level (mean(s_k, mean(s_{k-1}, ...))),
  #which weighted later folds more heavily once there were 3+ folds;
  #collecting one flat <score, predictions> list and aggregating once
  #yields the intended unweighted mean across folds.
  acc_and_pred <- vector("list", 2 * numberOfFolds)
  for(fold in seq_len(numberOfFolds)){
    #Split into the held-out fold and the remaining training folds.
    testFold <- train_df[J(FoldID = fold), on = "FoldID"]
    trainFolds <- train_df[!J(FoldID = fold), on = "FoldID"]
    testFold$Pred <- selectModelTrain(features, target, trainFolds, testFold, argorithm, params)
    #Accuracy on the held-out fold
    score <- mean(testFold$Pred == c(testFold[, target, with = FALSE])[[1]])
    acc_and_pred[[2 * fold - 1]] <- score
    acc_and_pred[[2 * fold]] <- testFold[, list(rowNo, FoldID, Pred)]
  }
  return(procAccPredL(acc_and_pred))
}
#Model selection and training.
#Fits the algorithm named by `argorithm` ("knn", "svm" or "rf") on
#train_df and returns its predictions for test_df.  `params` supplies the
#algorithm-specific hyperparameters (k / type+cost / mtry).
selectModelTrain <- function(features, target, train_df, test_df, argorithm, params){
  #Every branch needs the same design matrices and response, so pull them
  #out once up front.
  x_train <- train_df[, features, with = FALSE]
  x_test <- test_df[, features, with = FALSE]
  y_train <- c(train_df[, target, with = FALSE])[[1]]
  if(argorithm == "knn"){
    Pred <- knn(train = x_train, test = x_test, cl = y_train, k = params$k)
  }else if(argorithm == "svm"){
    model <- LiblineaR(data = x_train, target = y_train, type = params$type, cost = params$cost)
    Pred <- predict(model, x_test)$predictions
  }else if(argorithm == "rf"){
    model <- randomForest(x = x_train, y = y_train, data = train_df, ntree = 500, mtry = params$mtry)
    #randomForest's predict method picks the needed feature columns out of
    #the full test_df itself.
    Pred <- predict(model, test_df)
  }
  #An unknown `argorithm` leaves Pred unassigned and errors here, matching
  #the original behavior.
  return(Pred)
}
#Create the meta data for the test set.
#Fits the chosen algorithm once on the full training data (no
#cross-validation) and predicts the test set, returning
#list(test accuracy, predictions).
calcAccCreMetaFeatureForTestData <- function(formula, test_df, train_df, argorithm, params){
  features <- all.vars(formula)[-1]
  target <- all.vars(formula)[1]
  #identical() instead of ==: `features` can be a vector when the formula
  #lists explicit predictors, and if() requires a scalar condition
  #(error in R >= 4.2).
  if(identical(features, ".")){
    #"." means every column of the TEST frame except the target, so extra
    #training-only columns are ignored automatically.
    features_name <- names(test_df)
    features <- features_name[features_name != target]
  }
  Pred <- selectModelTrain(features, target, train_df, test_df, argorithm, params)
  #Accuracy on the test set
  score <- mean(Pred == c(test_df[, target, with = FALSE])[[1]])
  return(list(score, Pred))
}
#Grid search: given a parameter grid, return the CV accuracy and the meta
#feature (out-of-fold predictions) for every parameter combination.
#param_list[["ParamGrid"]] is a data.table with one row per combination.
gridCalcAccMeta <- function(formula, train_df, numberOfFolds, argorithm, param_list){
  grid <- param_list[["ParamGrid"]]
  return_list <- list()
  for(row_idx in seq_len(nrow(grid))){
    current_params <- grid[row_idx]
    res <- calcAccCreMetaFeature(formula, train_df, numberOfFolds, argorithm, current_params)
    #Progress log: "Params: <name value ...> | Score: <accuracy>"
    print(paste("Params:", paste(paste(colnames(current_params), current_params, collapse = " | "),
                                 "|", "Score:", res[[1]])))
    #Record this combination's parameters, score and meta feature.
    return_list[["Params"]][row_idx] <- list(current_params)
    return_list[["Scores"]][row_idx] <- res[[1]]
    return_list[["MetaFeature"]][row_idx] <- list(res[[2]]$Pred)
  }
  return(return_list)
}
#Pick the best parameter set.
#Returns the parameters, score and meta feature belonging to the highest
#score in res_list (the output of gridCalcAccMeta), each still wrapped in
#a length-1 container as callers unwrap them with [[1]].
getBestParas <- function(res_list){
  best_idx <- which.max(res_list[["Scores"]])
  #Single-bracket indexing preserves the list/vector wrappers.
  list(
    BestParams = res_list[["Params"]][best_idx],
    BestScores = res_list[["Scores"]][best_idx],
    BestMeta = res_list[["MetaFeature"]][best_idx]
  )
}
###Load libraries--------------------------------------------------
#Preprocessing
library(data.table)
library(mltools)
library(dplyr)
#Algorithms
library(class) #k-nearest neighbours
library(LiblineaR) #support vector machine
library(randomForest) #random forest
#Data
library(kernlab) #for the spam dataset
data(spam)
#Load the stacking helper functions
source("stacking_in_r_function.R")
###Load the data and split it into training and validation sets--------
spam <- data.table(spam)
spam[, FoldID := folds(type, nfolds = 10, stratified = TRUE, seed = 123)] #stratified = TRUE keeps the class ratio balanced across folds
spam_train <- spam %>% filter(FoldID <= 3) #folds 1-3 of 10 become the training set
spam_test <- spam %>% filter(FoldID > 3) #folds 4-10 of 10 become the validation set
spam_train$FoldID <- NULL
spam_test$FoldID <- NULL
#Meta features for the training data------------------------------------------------------------
#Cross-validation setup: fresh 5-fold assignment within the training set
spam_train[, FoldID := folds(type, nfolds = 5, stratified = TRUE, seed = 123)]
numberOfFolds <- max(spam_train[, "FoldID"])
#k-nearest neighbours
knnCV <- list()
knnCV[["ParamGrid"]] <- CJ(k=seq(1, 30)) #grid: k = 1..30
res_knn <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "knn", knnCV)
knn_train_pred <- getBestParas(res_knn)
#svm
svmCV <- list()
svmCV[["ParamGrid"]] <- CJ(type = 1:5, cost = c(.01, .1, 1, 10, 100, 1000, 2000)) #grid: LiblineaR solver type x cost
res_svm <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "svm", svmCV)
svm_train_pred <- getBestParas(res_svm)
#random forest
set.seed(123) #randomForest is stochastic; fix the seed for reproducibility
rfCV <- list()
rfCV[["ParamGrid"]] <- CJ(mtry = seq(15, 20)) #grid: mtry = 15..20
res_rf <- gridCalcAccMeta(type ~ ., spam_train, numberOfFolds, "rf", rfCV)
rf_train_pred <- getBestParas(res_rf)
#Attach the meta features (best out-of-fold predictions) as new columns
spam_train$meta_knn <- knn_train_pred[["BestMeta"]][[1]]
spam_train$meta_svm <- svm_train_pred[["BestMeta"]][[1]]
spam_train$meta_rf <- rf_train_pred[["BestMeta"]][[1]]
spam_train$FoldID <- NULL #fold ids are bookkeeping only; drop before final training
#Meta features for the validation data----------------------------------------------
#Each base model is refit on the full training data with its best CV
#parameters and predicts spam_test once.  The "." formula resolves against
#the columns of spam_test, so the meta_* columns present only in
#spam_train are not used as predictors here.
# knn
knn_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "knn", knn_train_pred[["BestParams"]][[1]])
# svm
svm_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "svm", svm_train_pred[["BestParams"]][[1]])
# rf
set.seed(123) #same seed as training for a reproducible forest
rf_test_pred <- calcAccCreMetaFeatureForTestData(type ~ ., spam_test, spam_train, "rf", rf_train_pred[["BestParams"]][[1]])
#Attach the meta features to the validation data ([[2]] = predictions)
spam_test$meta_knn <- knn_test_pred[[2]]
spam_test$meta_svm <- svm_test_pred[[2]]
spam_test$meta_rf <- rf_test_pred[[2]]
#Predict the validation data---------------------------------------------
set.seed(123)
#Level-2 (stacking) model: a random forest trained on the original
#features plus the three meta feature columns of spam_train.
stacking_model <- randomForest(type ~ ., data = spam_train)
pred_stacking <- predict(stacking_model, spam_test)
#Accuracy: each base model vs. the stacked model
mean(spam_test$type == knn_test_pred[[2]])
mean(spam_test$type == svm_test_pred[[2]])
mean(spam_test$type == rf_test_pred[[2]])
mean(spam_test$type == pred_stacking)
#Confusion matrices (rows = truth, columns = prediction)
table(spam_test$type, knn_test_pred[[2]])
table(spam_test$type, svm_test_pred[[2]])
table(spam_test$type, rf_test_pred[[2]])
table(spam_test$type, pred_stacking)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment