# klainfo/out-of-sample-bootstrap.R

## out-of-sample-bootstrap.R
library(DefectData)
library(pROC)
# Load the "eclipse-2.0" dataset. The script uses three fields of the
# returned object: the data frame (`data`), the name of the outcome column
# (`dep`) and the names of the predictor columns (`indep`).
Data <- loadData("eclipse-2.0")
data <- Data$data
dep <- Data$dep
indep <- Data$indep

# Normality Correction
# log1p(y) computes log(1 + y), so zero-valued predictors stay finite.
transformLog <- function(y) {
  log1p(y)
}

# Transform column by column. lapply() works on the data frame directly, so
# nothing is coerced to a matrix, and it still works when `indep` names a
# single column (where `data[, indep]` would collapse to a plain vector).
data[indep] <- lapply(data[indep], transformLog)

# Recode the outcome as a two-level factor. The levels are spelled out so
# that "F" is always the first (reference) level, which is the level
# glm(family = "binomial") treats as the failure class.
data[[dep]] <- factor(
  ifelse(data[[dep]] == "TRUE", "T", "F"),
  levels = c("F", "T")
)

# Out-of-sample bootstrap: each repetition trains on a bootstrap sample and
# tests on the rows that were not drawn into that sample.
# Results depend on the RNG state; call set.seed() beforehand if the run
# needs to be reproducible.
n_boot <- 100L

# Generate model formula (identical for every repetition, so built once)
f <- as.formula(paste0(dep, " ~ ", paste0(indep, collapse = "+")))

# Preallocate the result vector instead of growing it inside the loop
performance <- numeric(n_boot)
for (i in seq_len(n_boot)) {
  # Generate a bootstrap sample with replacement
  indices <- sample(nrow(data), replace = TRUE)

  # Generate training dataset using a bootstrap sample
  training <- data[indices, ]

  # Generate testing dataset (i.e., instances that
  # are not included in the bootstrap sample)
  testing <- data[-unique(indices), ]

  # Fit a prediction model using a logistic regression model
  m <- glm(f, data = training, family = "binomial")

  # Extract probabilities using the testing dataset
  prob <- predict(m, testing, type = "response")

  # Compute AUC performance.
  # levels/direction are fixed explicitly: "F" rows are controls, "T" rows
  # are cases, and a higher predicted probability means "T". Letting pROC
  # auto-detect the direction on every resample would flip any below-chance
  # repetition and bias the averaged AUC upwards.
  performance[i] <- as.numeric(
    auc(testing[[dep]], prob, levels = c("F", "T"), direction = "<")
  )
}

# Report the average AUC performance
mean(performance)
	library(DefectData)
	library(pROC)
	# Load the "eclipse-2.0" dataset. The script uses three fields of the
	# returned object: the data frame (`data`), the name of the outcome
	# column (`dep`) and the names of the predictor columns (`indep`).
	Data <- loadData("eclipse-2.0")
	data <- Data$data
	dep <- Data$dep
	indep <- Data$indep

	# Normality Correction
	# log1p(y) computes log(1 + y), so zero-valued predictors stay finite.
	transformLog <- function(y) {
		log1p(y)
	}

	# Transform column by column. lapply() works on the data frame directly,
	# so nothing is coerced to a matrix, and it still works when `indep`
	# names a single column (where `data[, indep]` would collapse to a
	# plain vector).
	data[indep] <- lapply(data[indep], transformLog)

	# Recode the outcome as a two-level factor. The levels are spelled out
	# so that "F" is always the first (reference) level, which is the level
	# glm(family = "binomial") treats as the failure class.
	data[[dep]] <- factor(
		ifelse(data[[dep]] == "TRUE", "T", "F"),
		levels = c("F", "T")
	)

	# Out-of-sample bootstrap: each repetition trains on a bootstrap sample
	# and tests on the rows that were not drawn into that sample.
	# Results depend on the RNG state; call set.seed() beforehand if the run
	# needs to be reproducible.
	n_boot <- 100L

	# Generate model formula (identical for every repetition, so built once)
	f <- as.formula(paste0(dep, " ~ ", paste0(indep, collapse = "+")))

	# Preallocate the result vector instead of growing it inside the loop
	performance <- numeric(n_boot)
	for (i in seq_len(n_boot)) {
		# Generate a bootstrap sample with replacement
		indices <- sample(nrow(data), replace = TRUE)

		# Generate training dataset using a bootstrap sample
		training <- data[indices, ]

		# Generate testing dataset (i.e., instances that
		# are not included in the bootstrap sample)
		testing <- data[-unique(indices), ]

		# Fit a prediction model using a logistic regression model
		m <- glm(f, data = training, family = "binomial")

		# Extract probabilities using the testing dataset
		prob <- predict(m, testing, type = "response")

		# Compute AUC performance.
		# levels/direction are fixed explicitly: "F" rows are controls, "T"
		# rows are cases, and a higher predicted probability means "T".
		# Letting pROC auto-detect the direction on every resample would
		# flip any below-chance repetition and bias the averaged AUC upwards.
		performance[i] <- as.numeric(
			auc(testing[[dep]], prob, levels = c("F", "T"), direction = "<")
		)
	}

	# Report the average AUC performance
	mean(performance)