puyokw/randomForest.r

## randomForest.r
# Bonus: the random forest baseline used for comparison.
# Hold-out split by row parity: rows 1, 3, ..., 149 form the training set and
# every other row of iris forms the test set.
odd.n <- seq(from = 1, by = 2, length.out = 75)
iris.train <- iris[odd.n, ]
iris.test <- iris[-odd.n, ]
# randomForest
library(randomForest)
# Fix the RNG seed so the forest is repeatable from run to run.
set.seed(131)
train.x <- iris.train[, 1:4]           # the four predictor columns
train.y <- as.factor(iris.train[, 5])  # column 5 is the class label
# doBest = TRUE makes tuneRF() return the fitted forest for the best mtry it
# found, so the result can be handed straight to predict().
model.rf <- tuneRF(train.x, train.y, doBest = TRUE)
pred <- predict(model.rf, iris.test[, -5])
# Confusion matrix: rows are the true labels, columns the predictions.
table(iris.test[, 5], pred)
# Variable importance as a share of the total, for comparison with xgboost.
# The division below makes the values sum to 1 (not 100).
print(model.rf$importance / sum(model.rf$importance))
varImpPlot(model.rf)

## xgboost.r
dim(iris) # 150 rows, 5 columns
# Hold-out split by row parity: rows 1, 3, ..., 149 form the training set and
# every other row of iris forms the test set.
odd.n <- seq(from = 1, by = 2, length.out = 75)
iris.train <- iris[odd.n, ]
iris.test <- iris[-odd.n, ]

library(xgboost)
y <- iris.train[, 5]    # response variable
y <- as.integer(y) - 1  # shift the factor codes so the classes start at 0

train.x <- iris.train[, 1:4]
# Stack training and test predictors into a single numeric matrix.
x <- as.matrix(rbind(train.x, iris.test[, -5]))

# Row positions inside x: training rows come first, test rows after them.
# seq_along()/setdiff() stay correct (empty) when either part has no rows,
# where 1:length(y) and (n + 1):nrow(x) would count backwards.
trind <- seq_along(y)
teind <- setdiff(seq_len(nrow(x)), trind)
# Only the bare minimum of parameters is set for this example.
set.seed(131) # fixed seed for repeatability
param <- list(
  "objective" = "multi:softprob", # multiclass, per-class probabilities
  "eval_metric" = "mlogloss",     # evaluation metric
  "num_class" = 3                 # number of classes
)
# Cross-validate to look for a suitable number of trees.
# Fold count is 1 + log2(n) rounded, i.e. 7 for the 75 training rows.
k <- round(1 + log2(nrow(train.x)))
cv.nround <- 100 # upper bound on the rounds searched
# The argument is spelled out as `params`; `param =` only worked through
# partial argument matching. drop = FALSE keeps the data a matrix.
bst.cv <- xgb.cv(
  params = param, data = x[trind, , drop = FALSE], label = y,
  nfold = k, nrounds = cv.nround
)

set.seed(131)
# Number of boosting rounds. Hard-coded here; it is not read from bst.cv.
nround <- 27
# Fit the model. `params` is spelled out in full; `param =` only worked
# through partial argument matching.
bst <- xgboost(
  params = param, data = x[trind, , drop = FALSE], label = y,
  nrounds = nround
)
pred <- predict(bst, x[teind, , drop = FALSE]) # predictions for the test rows
# Reshape the flat prediction vector to one row per test observation and one
# column per class (3 classes); same result as matrix(pred, 3, n) transposed.
pred <- matrix(pred, ncol = 3, byrow = TRUE)
colnames(pred) <- c("setosa", "versicolor", "virginica")
head(pred, 3) # check the first three rows
# Which class each row falls in: true class code against the column holding
# the largest value (short version).
table(as.integer(iris.test[, 5]), max.col(pred))

# Predict the class directly instead of per-class probabilities.
param <- list(
  "objective" = "multi:softmax", # switched to multi:softmax
  "eval_metric" = "mlogloss",
  "num_class" = 3
)
set.seed(131)
nround <- 27
# `params` is spelled out in full; `param =` only worked through partial
# argument matching.
bst <- xgboost(
  params = param, data = x[trind, , drop = FALSE], label = y,
  nrounds = nround
)
pred <- predict(bst, x[teind, , drop = FALSE])
# Make the output readable: map class codes 0/1/2 back to the species names.
# The labels were encoded as as.integer(factor) - 1, so code k corresponds to
# factor level k + 1. This replaces an element-wise loop that relied on the
# numeric vector being silently coerced to character after the first
# assignment, and that failed on an empty prediction vector.
pred <- factor(pred, levels = 0:2, labels = levels(iris.test[, 5]))
table(iris.test[, 5], pred)
# Variable importance of the boosted model: print the table, then plot it.
imp <- xgb.importance(feature_names = names(iris)[-5], model = bst)
print(imp)
xgb.plot.importance(imp)
# Show the decision trees.
# NOTE(review): n_first_tree is passed through unchanged; whether it is
# honoured depends on the installed xgboost version - confirm.
xgb.plot.tree(model = bst, feature_names = names(iris)[-5], n_first_tree = 2)
	# Bonus: the random forest baseline used for comparison.
	# NOTE(review): from here on the file repeats the script above, indented
	# by one tab; consider keeping only one copy.
	# Hold-out split by row parity: rows 1, 3, ..., 149 form the training set
	# and every other row of iris forms the test set.
	odd.n <- seq(from = 1, by = 2, length.out = 75)
	iris.train <- iris[odd.n, ]
	iris.test <- iris[-odd.n, ]
	# randomForest
	library(randomForest)
	# Fix the RNG seed so the forest is repeatable from run to run.
	set.seed(131)
	train.x <- iris.train[, 1:4]           # the four predictor columns
	train.y <- as.factor(iris.train[, 5])  # column 5 is the class label
	# doBest = TRUE makes tuneRF() return the fitted forest for the best mtry
	# it found, so the result can be handed straight to predict().
	model.rf <- tuneRF(train.x, train.y, doBest = TRUE)
	pred <- predict(model.rf, iris.test[, -5])
	# Confusion matrix: rows are the true labels, columns the predictions.
	table(iris.test[, 5], pred)
	# Variable importance as a share of the total, for comparison with
	# xgboost. The division below makes the values sum to 1 (not 100).
	print(model.rf$importance / sum(model.rf$importance))
	varImpPlot(model.rf)
	dim(iris) # 150 rows, 5 columns
	# Hold-out split by row parity: rows 1, 3, ..., 149 form the training set
	# and every other row of iris forms the test set.
	odd.n <- seq(from = 1, by = 2, length.out = 75)
	iris.train <- iris[odd.n, ]
	iris.test <- iris[-odd.n, ]

	library(xgboost)
	y <- iris.train[, 5]    # response variable
	y <- as.integer(y) - 1  # shift the factor codes so the classes start at 0

	train.x <- iris.train[, 1:4]
	# Stack training and test predictors into a single numeric matrix.
	x <- as.matrix(rbind(train.x, iris.test[, -5]))

	# Row positions inside x: training rows come first, test rows after them.
	# seq_along()/setdiff() stay correct (empty) when either part has no rows,
	# where 1:length(y) and (n + 1):nrow(x) would count backwards.
	trind <- seq_along(y)
	teind <- setdiff(seq_len(nrow(x)), trind)
	# Only the bare minimum of parameters is set for this example.
	set.seed(131) # fixed seed for repeatability
	param <- list(
	  "objective" = "multi:softprob", # multiclass, per-class probabilities
	  "eval_metric" = "mlogloss",     # evaluation metric
	  "num_class" = 3                 # number of classes
	)
	# Cross-validate to look for a suitable number of trees.
	# Fold count is 1 + log2(n) rounded, i.e. 7 for the 75 training rows.
	k <- round(1 + log2(nrow(train.x)))
	cv.nround <- 100 # upper bound on the rounds searched
	# The argument is spelled out as `params`; `param =` only worked through
	# partial argument matching. drop = FALSE keeps the data a matrix.
	bst.cv <- xgb.cv(
	  params = param, data = x[trind, , drop = FALSE], label = y,
	  nfold = k, nrounds = cv.nround
	)

	set.seed(131)
	# Number of boosting rounds. Hard-coded here; it is not read from bst.cv.
	nround <- 27
	# Fit the model. `params` is spelled out in full; `param =` only worked
	# through partial argument matching.
	bst <- xgboost(
	  params = param, data = x[trind, , drop = FALSE], label = y,
	  nrounds = nround
	)
	pred <- predict(bst, x[teind, , drop = FALSE]) # predictions for test rows
	# Reshape the flat prediction vector to one row per test observation and
	# one column per class (3 classes); same result as matrix(pred, 3, n)
	# transposed.
	pred <- matrix(pred, ncol = 3, byrow = TRUE)
	colnames(pred) <- c("setosa", "versicolor", "virginica")
	head(pred, 3) # check the first three rows
	# Which class each row falls in: true class code against the column
	# holding the largest value (short version).
	table(as.integer(iris.test[, 5]), max.col(pred))

	# Predict the class directly instead of per-class probabilities.
	param <- list(
	  "objective" = "multi:softmax", # switched to multi:softmax
	  "eval_metric" = "mlogloss",
	  "num_class" = 3
	)
	set.seed(131)
	nround <- 27
	# `params` is spelled out in full; `param =` only worked through partial
	# argument matching.
	bst <- xgboost(
	  params = param, data = x[trind, , drop = FALSE], label = y,
	  nrounds = nround
	)
	pred <- predict(bst, x[teind, , drop = FALSE])
	# Make the output readable: map class codes 0/1/2 back to the species
	# names. The labels were encoded as as.integer(factor) - 1, so code k
	# corresponds to factor level k + 1. This replaces an element-wise loop
	# that relied on the numeric vector being silently coerced to character
	# after the first assignment, and that failed on an empty prediction
	# vector.
	pred <- factor(pred, levels = 0:2, labels = levels(iris.test[, 5]))
	table(iris.test[, 5], pred)
	# Variable importance of the boosted model: print the table, then plot it.
	imp <- xgb.importance(feature_names = names(iris)[-5], model = bst)
	print(imp)
	xgb.plot.importance(imp)
	# Show the decision trees.
	# NOTE(review): n_first_tree is passed through unchanged; whether it is
	# honoured depends on the installed xgboost version - confirm.
	xgb.plot.tree(model = bst, feature_names = names(iris)[-5], n_first_tree = 2)