## brunch_r_xgboost
## apply xgboost on otto data
## url : https://www.kaggle.com/c/otto-group-product-classification-challenge/data
## reference : https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/
install.packages("Matrix")
library(Matrix)
install.packages(c("caret", "car", "dplyr"))
library(xgboost)
library(readr)
library(stringr)
library(caret)
library(car)
library(dplyr)
setwd("C:\\Users\\ts93856\\Desktop\\datasource")
# load data
df_train <- read.csv("train.csv")
df_test <- read.csv("test.csv")
df_test <- lapply(df_test, as.numeric)   # coerce every column to numeric
df_test <- as.data.frame(df_test)
x_test <- df_test[, -1]                  # drop the id column
## very simple way to convert categorical data into numeric data
## xgboost only handles numeric input, so the nominal target must be recoded as a numeric variable
x <- lapply(df_train, as.numeric)
train <- as.data.frame(x)
train <- train[, -ncol(train)]   # drop the target column
train <- train[, -1]             # drop the id column
## build the target vector: xgboost expects 0-indexed class labels (0..8)
y <- x$target
y <- y - 1
unique(y)
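## Optional sanity check (assumes the 9 Otto classes): the labels should be
## 0..8, and table(y) shows how balanced the classes are
table(y)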
?xgboost
View(data.matrix(train))
## modeling
set.seed(1)   # fix R's RNG for reproducible sampling (subsample / colsample_bytree)
xgb <- xgboost(data = data.matrix(train),
               label = y,
               eta = 0.3,               ## eta: learning rate / step size (default = 0.3)
               max_depth = 15,          ## max_depth: maximum depth of each decision tree
               nrounds = 25,            ## nrounds: maximum number of boosting iterations
               subsample = 1,
               colsample_bytree = 0.5,
               eval_metric = "merror",  ## evaluation metric: multiclass classification error rate
               objective = "multi:softprob",
               num_class = 9,
               nthread = 3
)
xgb
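## Optional sketch: cross-validation to sanity-check nrounds, reusing the same
## data and parameters (5 folds is an assumed choice, not from the source)
cv <- xgb.cv(data = data.matrix(train), label = y, nrounds = 25, nfold = 5,
             eta = 0.3, max_depth = 15, colsample_bytree = 0.5,
             eval_metric = "merror", objective = "multi:softprob",
             num_class = 9, verbose = 0)
cv$evaluation_log   # mean train/test merror per boosting round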
## scoring
y_pred <- predict(xgb, data.matrix(x_test))   # multi:softprob returns 9 class probabilities per row, flattened
sum(y_pred)   # each row's probabilities sum to 1, so this should be ~nrow(x_test)
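## Optional shape check: one probability per class per test row
stopifnot(length(y_pred) == 9 * nrow(x_test))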
## prediction: reshape the flat vector into an n x 9 probability matrix,
## then pick the most probable class (1..9) for each row
test_prediction <- matrix(y_pred, nrow = 9,
                          ncol = length(y_pred) / 9) %>%
  t() %>%
  data.frame() %>%
  mutate(label = 1,
         max_prob = max.col(., "last"))
head(test_prediction, 3)
result <- test_prediction$max_prob
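## Optional cross-check: an equivalent reshape that fills by row, so each test
## row's 9 probabilities sit on one matrix row
prob_mat <- matrix(y_pred, ncol = 9, byrow = TRUE)
all(max.col(prob_mat, "last") == result)   # should be TRUE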
## build the submission file: one-hot encode the predicted class per test row
x <- data.frame(result)   # one predicted class label (1..9) per test row
sub_csv <- matrix(0, nrow = nrow(x), ncol = 9)
for (i in 1:nrow(x)) {
  sub_csv[i, x$result[i]] <- 1   # set a 1 in the column of the predicted class
}
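## Optional: the same one-hot encoding as a single vectorized assignment,
## indexing the matrix by (row, predicted class) pairs
sub_csv[cbind(seq_along(result), result)] <- 1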
id <- seq_len(nrow(x))
result_submission <- data.frame(id, sub_csv)
colnames(result_submission) <- c("id", "Class_1", "Class_2", "Class_3",
                                 "Class_4", "Class_5", "Class_6",
                                 "Class_7", "Class_8", "Class_9")
write.csv(result_submission, "sampleSubmission.csv", row.names = FALSE)
nrow(result_submission)
colnames(result_submission)
## visualize which features matter most
# Let's start by looking at what the actual trees look like
model <- xgb.dump(xgb, with.stats = TRUE)
model[1:10]   # print the first 10 lines (nodes) of the dumped model
# Get the real feature names
feature_names <- dimnames(data.matrix(train))[[2]]
# Compute the feature importance matrix
importance_matrix <- xgb.importance(feature_names, model = xgb)
# Plot the 10 most important features
xgb.plot.importance(importance_matrix, top_n = 10)
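## Optional sketch: render a single tree from the ensemble (requires the
## DiagrammeR package; `trees = 0` picks the first tree, an arbitrary choice)
xgb.plot.tree(model = xgb, trees = 0)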
## significance check
# Pearson's chi-squared test between one feature and the target labels
test <- chisq.test(train$feat_11, y)
print(test)
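## Optional extension, assuming columns named feat_1..feat_93 as in the Otto
## data: repeat the test over the first 10 features and collect p-values
pvals <- sapply(paste0("feat_", 1:10), function(f) chisq.test(train[[f]], y)$p.value)
pvals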