Created
March 16, 2017 13:32
-
-
Save ratsgo/e426e92e4a88dca8f8a3f59d41f891be to your computer and use it in GitHub Desktop.
Rotation Forest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rpart) | |
library(Matrix) | |
library(stringr) | |
#################################### | |
# function # | |
#################################### | |
regression.perf_eval <- function(real,hat) { | |
MSE <- mean((real - hat)^2) | |
RMSE <- sqrt(MSE) | |
MAE <- mean(abs(real - hat)) | |
MAPE <- mean(abs((real - hat)/real)) * 100 | |
result <- c(MSE,RMSE,MAE,MAPE) | |
return(result) | |
} | |
classification.perf_eval <- function(real,hat) { | |
tmp <- union(as.numeric(unique(real)), as.numeric(unique(hat))) | |
tmp <- factor(tmp[order(tmp,decreasing = F)]) | |
cm <- table(factor(as.integer(factor(real)),tmp),factor(hat,tmp)) | |
ACC <- sum(diag(cm)) / sum(cm) | |
} | |
rotationForest <- function(data,k=3,numTrees=30,bootstrapRate=0.75,type="class"){ | |
#################################### | |
# function 1 # | |
# split feature set into k subsets # | |
#################################### | |
getRandomSubset <- function(x, y, k){ | |
#check that k is proper | |
if(k > length(x)) { | |
k <- length(x) | |
} | |
numSubset <- round(length(x)/k) | |
subsetList <- list() | |
#check that number of subsets | |
if(numSubset > length(x)){ | |
numSubset <- length(x) | |
} | |
#give features names for indentifing the original sequence of features | |
colnames(x) <- paste0("X", 1:length(x)) | |
#shuffling features randomly | |
x <- subset(x, select = sample(1:length(x))) | |
#pick out k-1 subsets | |
i <- 1 | |
while(i < k){ | |
if (is.null(dim(x)) == TRUE) { | |
subsetList[[i]] <- as.matrix(data.frame(x, y)) | |
} else { | |
subsetList[[i]] <- cbind(x[ , 1:numSubset], y) | |
x <- x[ , -(1:numSubset)] | |
} | |
i <- i+1 | |
} | |
subsetList[[k]] <- cbind(x, y) | |
#pick out the last subset (kth subset) | |
# The last kth subset may include more or less records than the other subsets | |
# when the number of subets is not an integer | |
return(subsetList) | |
} | |
#################################### | |
# function 2 # | |
# bootstrapping # | |
#################################### | |
bootstrap <- function(subsetList, bootstrapRate){ | |
bootIdxList <- list() | |
bootSubsetList <- list() | |
for(i in 1:length(subsetList)){#for-loop begins | |
#bootstrap sampling | |
numRecord <- nrow(data.frame(subsetList[[i]])) | |
bootIdxList[[i]] <- sample(1:numRecord, round(bootstrapRate*numRecord), replace = TRUE) | |
#save a bootstrapped subset in the 'bootSubsetList' | |
bootSubsetList[[i]] <- data.frame(subsetList[[i]][bootIdxList[[i]], ]) | |
names(bootSubsetList)[i] <- paste0("bootstraped", i) | |
}#for-loop ends | |
return(bootSubsetList) | |
} | |
#################################### | |
# function 3 # | |
# PCA # | |
#################################### | |
PCA <- function(bootSubsetList){ | |
bootSubsetListRd <- lapply(bootSubsetList, function(x){subset(x, select = -y)}) | |
PCAcomp <- lapply(bootSubsetListRd, function(x){prcomp(x, center = TRUE)$rotation}) | |
PCAcompT <- lapply(PCAcomp, t) | |
PCArrangement <- as.vector(unlist(lapply(PCAcompT, colnames))) | |
#'bdiag' is a function included in the R package 'Matrix' | |
#It returns block-shaped diagonal matrix which is sparse | |
#rotationMatrix <- as.matrix(do.call(bdiag, lapply(PCAcompT, as.matrix))) | |
rotationMatrix <- as.matrix(bdiag(lapply(PCAcompT, as.matrix))) | |
colnames(rotationMatrix) <- PCArrangement | |
rotationMatrix <- subset(rotationMatrix, select = sort(PCArrangement)) | |
return(as.matrix(rotationMatrix)) | |
} | |
#################################### | |
# function 4 # | |
# initialize Variables # | |
#################################### | |
init.var <- function (data) { | |
x <- data.frame(data[,-dim(data)[2]],stringsAsFactors = T) | |
y <- data[,dim(data)[2]] | |
y.level <- levels(as.factor(y)) | |
y <- as.integer(as.factor(y)) | |
dummy.location <- which(sapply(x,function(x){class(x)}) == "factor") | |
if (length(dummy.location) > 0) { | |
dummy.matrix <- matrix(nrow=dim(x)[1]) | |
for (j in 1:length(dummy.location)) { | |
dummy.tmp <- as.matrix(model.matrix(~factor(x[,dummy.location[j]]))[,-1]) | |
if (dim(dummy.tmp)[2] > 1 & is.null(colnames(dummy.tmp)) == FALSE & nlevels(x[,dummy.location[j]]) == (ncol(dummy.tmp)+1)) { | |
colnames(dummy.tmp) <- paste0(colnames(x)[dummy.location[j]],"_",levels(x[,dummy.location[j]]))[2:nlevels(x[,dummy.location[j]])] | |
} | |
dummy.matrix <- cbind(dummy.matrix,dummy.tmp) | |
} | |
x <- data.frame(dummy.matrix[,-1],x[,-dummy.location]) | |
} | |
return(list(x,y,data.frame(id=1:length(y.level),Y=y.level))) | |
} | |
#################################### | |
# main function # | |
#################################### | |
input.tmp <- init.var(data) | |
x <- input.tmp[[1]] | |
y <- input.tmp[[2]] | |
Yclass <- input.tmp[[3]] | |
PCArfList <- list() | |
RmxList <- list() | |
for(i in 1:numTrees){ | |
#generate a rotation matrix | |
subsetList <- getRandomSubset(x = x, y = y, k = k) | |
bootSubsetList <- bootstrap(subsetList = subsetList, bootstrapRate = bootstrapRate) | |
RmxPCA <- PCA(bootSubsetList) | |
#build a tree model and then put it in the list; PCArfList | |
xRy <- cbind(data.frame(as.matrix(x) %*% RmxPCA), y) | |
PCArfList[[i]] <- rpart(y ~., method = type, data = xRy) | |
RmxList[[i]] <- RmxPCA | |
} | |
return(list(x=xRy,y=Yclass,model=PCArfList,PC=RmxList)) | |
} | |
rf.predict <- function (model,newdata,method="max.prop",type="class") { | |
#################################### | |
# function 1 # | |
# initialize Variables # | |
#################################### | |
init.var <- function (data) { | |
x <- data.frame(data[,-dim(data)[2]],stringsAsFactors = T) | |
y <- data[,dim(data)[2]] | |
y.level <- levels(as.factor(y)) | |
y <- as.integer(as.factor(y)) | |
dummy.location <- which(sapply(x,function(x){class(x)}) == "factor") | |
if (length(dummy.location) > 0) { | |
dummy.matrix <- matrix(nrow=dim(x)[1]) | |
for (j in 1:length(dummy.location)) { | |
dummy.tmp <- as.matrix(model.matrix(~factor(x[,dummy.location[j]]))[,-1]) | |
if (dim(dummy.tmp)[2] > 1 & is.null(colnames(dummy.tmp)) == FALSE & nlevels(x[,dummy.location[j]]) == (ncol(dummy.tmp)+1)) { | |
colnames(dummy.tmp) <- paste0(colnames(x)[dummy.location[j]],"_",levels(x[,dummy.location[j]]))[2:nlevels(x[,dummy.location[j]])] | |
} | |
dummy.matrix <- cbind(dummy.matrix,dummy.tmp) | |
} | |
x <- data.frame(dummy.matrix[,-1],x[,-dummy.location]) | |
} | |
return(list(x,y,data.frame(id=1:length(y.level),Y=y.level))) | |
} | |
#################################### | |
# function 2 # | |
# max probability evaluation # | |
#################################### | |
max.prob <- function(model,newdata) { | |
pca.data <- matrix(nrow=dim(newdata)[1],ncol=dim(newdata)[2]) | |
result <- matrix(0,nrow=dim(newdata)[1],ncol=length(unique(model$model[[1]]$y))) | |
for (i in 1:length(model[[3]])) { | |
pca.data <- data.frame(as.matrix(newdata) %*% as.matrix(model$PC[[i]])) | |
result <- result + predict(model$model[[i]], pca.data, type="prob") | |
} | |
yhat <- apply(result,1,function(x){which.max(x)}) | |
prob <- data.frame(result/length(model$model)) | |
return(list(class=yhat,probability=result)) | |
} | |
#################################### | |
# function 3 # | |
# majority vote evaluation # | |
#################################### | |
max.vote <- function(model,newdata) { | |
pca.data <- matrix(nrow=dim(newdata)[1],ncol=dim(newdata)[2]) | |
result <- matrix(0,nrow=dim(newdata)[1],ncol=length(model$model)) | |
for (i in 1:length(model$model)) { | |
pca.data <- data.frame(as.matrix(newdata) %*% as.matrix(model$PC[[i]])) | |
result[,i] <- predict(model$model[[i]], pca.data, type="class") | |
} | |
yhat <- apply(result,1,function(x){names(table(x))[1]}) | |
return(yhat) | |
} | |
#################################### | |
# function 4 # | |
# regression evaluation # | |
#################################### | |
regression <- function(model,newdata) { | |
pca.data <- matrix(nrow=dim(newdata)[1],ncol=dim(newdata)[2]) | |
result <- matrix(0,nrow=dim(newdata)[1],ncol=length(model$model)) | |
for (i in 1:length(model[[3]])) { | |
pca.data <- data.frame(as.matrix(newdata) %*% as.matrix(model$PC[[i]])) | |
result[,i] <- predict(model$model[[i]], pca.data, type="vector") | |
} | |
result <- apply(result,1,function(x){mean(x)}) | |
return(result) | |
} | |
#################################### | |
# main function # | |
#################################### | |
if (type == "class" & method == "max.vote") { | |
input.tmp <- init.var(newdata) | |
data <- input.tmp[[1]] | |
result <- max.vote(model,data) | |
return(result) | |
} | |
if (type == "class" & method == "max.prob") { | |
input.tmp <- init.var(newdata) | |
data <- input.tmp[[1]] | |
result <- max.prob(model,data) | |
return(result) | |
} | |
if (type == "regression") { | |
input.tmp <- init.var(newdata) | |
data <- input.tmp[[1]] | |
result <- regression(model,data) | |
return(result) | |
} | |
} | |
#################################### | |
# regression # | |
#################################### | |
data <- read.csv('filename.csv') | |
trn_idx <- sample(1:dim(data)[1], round(0.7*dim(data)[1])) | |
trn.data <- data[trn_idx,] | |
test.data <- data[-trn_idx,] | |
model <- rotationForest(trn.data,k=round(dim(data)[2]/3),type="anova") | |
prediction <- rf.predict(model,test.data,type="regression") | |
regression.perf_eval(test.data[,dim(test.data)[2]],prediction) | |
#################################### | |
# classification # | |
#################################### | |
data <- read.csv('filename.csv') | |
trn_idx <- sample(1:dim(data)[1], round(0.7*dim(data)[1])) | |
trn.data <- data[trn_idx,] | |
test.data <- data[-trn_idx,] | |
model <- rotationForest(trn.data,k=round(dim(data)[2]/3),type="class") | |
prediction <- rf.predict(model,test.data,type="class",method == "max.prob") | |
classification.perf_eval(test.data[,dim(test.data)[2]],prediction) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment