Skip to content

Instantly share code, notes, and snippets.

@smrmkt
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save smrmkt/1dd4a51d6d40dcff003a to your computer and use it in GitHub Desktop.
Save smrmkt/1dd4a51d6d40dcff003a to your computer and use it in GitHub Desktop.
library(randomForest)
# Load the tab-separated sample data set into the global `data` frame.
# The modelling code further down uses its `y` column as the response.
data <- read.delim(file = "data/sample.tsv", sep = "\t")
# Create data for k-fold cross validation.
#
# Shuffles the rows of `d` and deals them round-robin into `k` folds.
#
# Args:
#   d: data frame to split.
#   k: number of folds.
# Returns:
#   A list of `k` data frames named "1".."k". Rows containing NA are dropped
#   from every fold (na.omit). A fold is an empty data frame when there are
#   fewer rows than folds.
cv <- function(d, k) {
  n.rows <- nrow(d)
  # Shuffle the argument itself (not the global `data`); drop = FALSE keeps a
  # one-column data frame from collapsing to a vector.
  d.randomized <- d[sample.int(n.rows), , drop = FALSE]
  # Round-robin fold ids; fixing the levels guarantees exactly k folds even
  # when some of them receive no rows.
  fold.id <- factor(rep_len(seq_len(k), n.rows), levels = seq_len(k))
  lapply(split(d.randomized, fold.id), na.omit)
}
# Assemble the training data for one cross-validation round.
#
# Args:
#   d: list of fold data frames (the result of cv()).
#   k: index of the fold to leave out.
# Returns:
#   A data frame holding the rows of every fold except the k-th, row-bound in
#   fold order onto an empty frame that carries the first fold's column names.
cv.training <- function(d, k) {
  template <- d[[1]]
  seed <- as.data.frame(matrix(0, nrow = 0, ncol = ncol(template)))
  names(seed) <- names(template)
  kept <- d[seq_along(d) != k]
  # Fold left to right so the rbind calls happen in the same order as a loop.
  Reduce(function(acc, fold) rbind(acc, fold), kept, seed)
}
# Pick the held-out data for one cross-validation round.
#
# Args:
#   d: list of folds (the result of cv()).
#   k: index of the fold to return.
# Returns:
#   The k-th element of `d`.
cv.test <- function(d, k) {
  held.out <- d[[k]]
  held.out
}
# Add a fitted model's prediction to a data frame as a stacking feature.
#
# Args:
#   d: data frame to predict on and to extend.
#   m: fitted model whose predict() method accepts `newdata` and
#      type = "response" (the script passes a glm fit).
# Returns:
#   `d` with the prediction column-bound on the right; the last column is
#   named "stacking".
stacking <- function(d, m) {
  augmented <- cbind(d, predict(m, newdata = d, type = "response"))
  last <- length(augmented)
  names(augmented)[last] <- "stacking"
  augmented
}
# Count the four prediction/observation combinations of a two-class result.
#
# Both inputs are reduced with as.integer(), which yields the level codes
# 1 and 2 when they are factors.
#
# Args:
#   p: predicted classes.
#   r: observed classes, matched to `p` by position.
# Returns:
#   Numeric vector of length 4 with the counts of
#     [1] p code 2, r code 2    [2] p code 2, r code 1
#     [3] p code 1, r code 2    [4] p code 1, r code 1
#   Pairs containing NA or a code other than 1/2 are not counted. Empty input
#   gives c(0, 0, 0, 0) instead of an out-of-bounds error.
score <- function(p, r) {
  n <- length(p)
  p.flag <- 2L - as.integer(p)
  # Index r by position over p, so a shorter r yields NA rather than recycling.
  r.flag <- 2L - as.integer(r)[seq_len(n)]
  cell <- p.flag * 2L + r.flag + 1L
  # tabulate() skips NA and out-of-range cells; as.numeric keeps the result
  # a double vector as before.
  as.numeric(tabulate(cell, nbins = 4L))
}
# stacking sample: k-fold cross validation of a random forest that receives
# the glm prediction as an additional ("stacking") feature
k <- 10 # cross validation split number
# Split once, outside the loop, so the folds are disjoint and every row is
# held out exactly once; round i then trains on all folds but the i-th.
data.splitted <- cv(data, k)
# one row of score() counts per fold, preallocated instead of grown by rbind
result <- matrix(0, nrow = k, ncol = 4)
for (i in seq_len(k)) {
  print(i)
  # construct predict model
  data.train <- cv.training(data.splitted, i)
  model.glm <- glm(y ~ ., data = data.train, family = binomial)
  data.train <- stacking(data.train, model.glm)
  model.rf <- randomForest(y ~ ., data = data.train)
  # predict with test data
  data.test <- cv.test(data.splitted, i)
  data.test <- stacking(data.test, model.glm)
  # NOTE(review): type = "class" and score() presumably require `y` to be a
  # two-level factor - confirm against data/sample.tsv.
  model.rf.predict <- predict(model.rf, newdata = data.test, type = "class")
  result[i, ] <- score(model.rf.predict, data.test$y)
}
# show results
# score() orders its counts prediction-major, so the matrix is filled by row:
# rows are predictions, columns are observed values, matching the dimnames.
m <- matrix(colSums(result), 2, 2, byrow = TRUE)
dimnames(m) <- list(c("pred$p", "pred$n"), c("res$p", "res$n"))
print(m)
# divide by the number of rows actually scored so the rates sum to 1
print(m / sum(m))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment