Skip to content

Instantly share code, notes, and snippets.

@hadley
Created August 9, 2018 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hadley/2f8f43a7e1aff3202eff78d8d7502eba to your computer and use it in GitHub Desktop.
Save hadley/2f8f43a7e1aff3202eff78d8d7502eba to your computer and use it in GitHub Desktop.
The first R script I can find on my computer
library(class)
library(mass)
library(mva)
tst <- read.table("e:/uni/stats766/puktest.txt", header = TRUE)
tst.v <- tst[,1:7]
tst.g <- tst[,8]
trn <- read.table("e:/uni/stats766/puktrain.txt", header = TRUE)
trn.v <- trn[,1:7]
trn.g <- trn[,8]
all.v <- rbind(tst, trn)
pca <- princomp(all.v[,1:7])
eqscplot(pca$scores[21:103,], pch=as.character(trn.g), cex=0.7)
points(pca$scores[1:20,], pch=as.character(tst.g), cex=0.7, col="red")
plot(pca$scores, pch=as.character(trn.g), cex=0.7)
plot(trn.v, pch=as.character(trn.g), cex=0.7)
m <- trn.v[grep("M",as.character(trn.g)),]
f <- trn.v[grep("F",as.character(trn.g)),]
mdev <- data.frame(V1 = m$V1 - median(m$V1), V2 = m$V2 - median(m$V2), V3 = m$V3 - median(m$V3), V4 = m$V4 - median(m$V4), V5 = m$V5 - median(m$V5) ,V6 = m$V6 - median(m$V6) ,V7 = m$V7 - median(m$V7))
mdev <- abs(mdev)
fdev <- data.frame(V1 = f$V1 - median(f$V1), V2 = f$V2 - median(f$V2), V3 = f$V3 - median(f$V3), V4 = f$V4 - median(f$V4), V5 = f$V5 - median(f$V5) ,V6 = f$V6 - median(f$V6) ,V7 = f$V7 - median(f$V7))
fdev <- abs(fdev)
g <- factor(rep(c("M","F"),c(43,40)))
mfdev <- rbind(mdev, fdev)
plot(trn.v$V3, trn.v$V7, pch=as.character(trn.g), cex=0.7)
abline(v=29.75)
r <- rpart(trn.g ~ trn.v$V1 + trn.v$V2 + trn.v$V3 + trn.v$V4 + trn.v$V5 + trn.v$V6 + trn.v$V7)
rpred <- predict(r, trn.v, type="class")
calcError <- function(pred, act = trn.g) {
group.count <- tapply(act, act, length)
f <- group.count["F"]
m <- group.count["M"]
if (is.na(f)) {
f <- 0
}
if (is.na(m)) {
m <- 0
}
false <- act[pred != act]
list(error = false, m = m, f = f)
}
dispError <- function(e) {
f <- e$f
m <- e$m
t <- f + m
false <- e$error
false.count <- tapply(false, false, length)
f.false <- false.count["F"]
m.false <- false.count["M"]
t.false <- f.false + m.false
cat("Incorrect classification: \n")
cat("M: ", m.false, "/", m, " (", format(m.false / m * 100, digits = 3), "%)\n")
cat("F: ", f.false, "/", f, " (", format(f.false / f * 100, digits = 3), "%)\n")
cat("T: ", t.false, "/", t, " (", format(t.false / t * 100, digits = 3), "%)\n")
}
samplevector <- function(x, n) {
size <- dim(x)[1]
if (missing(n)) {
n <- size
}
d <- dim(x)[2]
v <- x[1:7, ]
g <- x[8]
bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ])
#bootstrap <- matrix(0, n, d)
for(i in 1:(n-1)) {
i <- floor(runif(1,0,1) * size) + 1
bootstrap = rbind(bootstrap, x[i, ])
#bootstrap[i, ] = x[floor(runif(1,0,1) * size) + 1, ]
}
bootstrap
}
samplevectorrand <- function(x, n) {
size <- dim(x)[1]
if (missing(n)) {
n <- size
}
d <- dim(x)[2]
bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ])
for(i in 1:(n-1)) {
bootstrap = rbind(bootstrap, x[floor(runif(1,0,1) * size) + 1, ])
}
bootstrap
}
crossvalidate <- function(train, group, func, ...) {
e <- func(train, group, CV=true, ...)
dispError(e)
}
chv.lda <- function(train, group) {
result1 <- lda(x = train[1:41, ], grouping = group[1:41 ])
result2 <- lda(x = train[42:83, ], grouping = group[42:83 ])
pred1 <- predict(result1, train[42:83, ])$class
pred2 <- predict(result2, train[1:41 , ])$class
pred <- rbind(pred2, pred1)
dispError(calcError(pred))
}
chv.qda <- function(train, group) {
result1 <- qda(x = train[1:41, ], grouping = group[1:41 ])
result2 <- qda(x = train[42:83, ], grouping = group[42:83 ])
pred1 <- predict(result1, train[42:83, ])$class
pred2 <- predict(result2, train[1:41 , ])$class
pred <- c(as.character(pred2), as.character(pred1))
dispError(calcError(pred))
}
chv.qda <- function(train, group, k) {
pred1 <- knn(train = train[1:41, ], test = train[42:83, ], cl = group[1:41],k)
pred2 <- knn(train = train[42:83, ], test = train[1:41 , ], cl = group[42:83],k)
pred <- c(as.character(pred2), as.character(pred1))
dispError(calcError(pred))
}
crossvalidate.lda <- function(train, group) {
crossvalidate(train, group, cv.lda.err)
}
crossvalidate.qda <- function(train, group) {
n <- dim(train)[1]
classes <- vector("numeric", n)
for (i in 1:n) {
result <- qda(x = train[-i, ], grouping = group[-i])
classes[i] <- as.character(predict(result, train[i, ])$class)
}
dispError(calcError(classes))
}
crossvalidate.knn <- function(train, group ,k ) {
n <- dim(train)[1]
classes <- vector("numeric", n)
for (i in 1:n) {
classes[i] <- as.character(knn(train = train[-i, ], test = train[i, ], cl = group[-i], k))
}
dispError(calcError(classes))
}
resample <- function(train, group, func, ...) {
e <- func(train, group, ...)
dispError(e)
}
resample.lda <- function(train, group) {
resample(train, group, cv.lda.err)
}
resample.qda <- function(train, group) {
resample(train, group, cv.qda.err)
}
resample.knn <- function(train, group, k) {
resample(train, group, cv.knn.err, k = k)
}
clv.lda.err <- function(sample, sampleg, orig = sample, origg = sampleg) {
result <- lda(x = sample, grouping = sampleg, CV = true)
pred <- predict(result, orig)$class
calcError(pred, origg)
}
cv.qda.err <- function(sample, sampleg, orig = sample, origg = sampleg) {
result <- qda(x = sample, grouping = sampleg)
pred <- predict(result, orig)$class
calcError(pred, origg)
}
cv.knn.err <- function(sample, sampleg, orig = sample, origg = sampleg, k) {
pred <- knn(train = sample, test = orig, cl = sampleg, k)
calcError(pred, origg)
}
#want to estimate bias of resubstitution method
bootstrap <- function(train, group, func, n, ...) {
combine <- cbind(train, group)
dif = c(n)
for (i in 1:n) {
sv <- samplevector(combine)
sample <- sv[, 1:7]
sampleg <- sv[, 8]
boot <- func(sample, sampleg, train, group, ...)
orig <- func(sample, sampleg, ...)
boot.e <- length(boot$error)
orig.e <- length(orig$error)
# orig is biased
# boot is unbiased
# orig - boot is estimate of bias
dif[i] = orig.e - boot.e
}
hist(dif)
invisible(dif)
}
bootstrap.knn <- function(train, group, n, k ) {
bootstrap(train, group, cv.knn.err, n, k = k)
}
bootstrap.qda <- function(train, group, n) {
bootstrap(train, group, cv.qda.err, n)
}
bootstrap.qda <- function(train, group, n) {
bootstrap(train, group, cv.qda.err, n)
}
> boot <- bootstrap.lda(trn.v, trn.g, n=1000)
> var(boot)
[1] 3.643763
> quantile(c(0.025,0.975))
0% 25% 50% 75% 100%
0.0250 0.2625 0.5000 0.7375 0.9750
> ?quantile
> quantile(boot,c(0.025,0.975))
2.5% 97.5%
-2 5
> mean(boot)
[1] 1.391
>
@dan-reznik
Copy link

your indenting wasn't bad even way back when

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment