# Example of training a glm model on the spam dataset, using the caret library.
library(caret)
library(kernlab)
# Load spam dataset.
data(spam)
# Set the random seed first so both the data partition and the training results are reproducible.
set.seed(12345)
# Split the data into a training/test set by 60% training/40% test.
inTrain <- createDataPartition(y = spam$type, p = 0.6, list = FALSE)
training <- spam[inTrain,]
testing <- spam[-inTrain,]
# Train the model.
fit <- train(type ~ ., data=training, method='glm')
# Show statistical significance of coefficients (terms).
summary(fit)
# Show accuracy on training set.
fit
# Run model on test set.
results <- predict(fit, newdata = testing)
# Show accuracy on test set.
confusionMatrix(results, testing$type)
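# If only the accuracy figure itself is needed, the object returned by
# confusionMatrix() exposes it through its `overall` component (a small
# convenience sketch using the same fit and testing objects as above):
cm <- confusionMatrix(results, testing$type)
cm$overall['Accuracy']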
##########
# Quiz 2 #
##########
# 1. Which of the following commands will create training and test sets with about 50% of the observations assigned to each?
library(AppliedPredictiveModeling)
library(caret)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
trainIndex = createDataPartition(diagnosis, p = 0.50,list=FALSE)
training = adData[trainIndex,]
testing = adData[-trainIndex,]
# Show result is 50%/50%.
nrow(training) / nrow(adData)
nrow(testing) / nrow(adData)
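# createDataPartition() samples within each level of the outcome, so the class
# proportions should also be roughly preserved in both halves. A quick check:
prop.table(table(training$diagnosis))
prop.table(table(testing$diagnosis))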
# 2. Make a histogram and confirm the SuperPlasticizer variable is skewed. Normally you might use the log transform to try to make the data more symmetric. Why would that be a poor choice for this variable?
library(AppliedPredictiveModeling)
data(concrete)
library(caret)
set.seed(1000)
inTrain = createDataPartition(mixtures$CompressiveStrength, p = 3/4)[[1]]
training = mixtures[ inTrain,]
testing = mixtures[-inTrain,]
# Display histogram, notice skew.
hist(training$Superplasticizer)
# Display histogram of the log transform; the shape looks better, but ...
hist(log(training$Superplasticizer))
# ... notice the -Inf values: the variable contains zeros, and log(0) is -Inf. No good.
head(log(training$Superplasticizer))
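# A quick count of the zeros confirms the problem (an illustrative check,
# not part of the quiz answer):
sum(training$Superplasticizer == 0)
# If a transform were still wanted, log1p() (i.e. log(x + 1)) sidesteps the
# -Inf values, though the quiz only asks why the plain log is a poor choice.
hist(log1p(training$Superplasticizer))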
# 3. Find all the predictor variables in the training set that begin with IL. Perform principal components on these variables with the preProcess() function from the caret package. Calculate the number of principal components needed to capture 90% of the variance. How many are there?
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
# Get the columns whose names begin with IL (avoid naming the result `data`,
# which shadows the base data() function used above).
ilData <- training[, grep('^IL', names(training))]
# Run PCA and print the number of components needed to capture 90% of the variance (9).
preProcess(ilData, method = 'pca', thresh = 0.9)
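# To cross-check the preProcess() answer with base R, prcomp() on the same
# (centered and scaled) predictors gives the cumulative proportion of variance;
# the first component at which it reaches 0.9 is the answer (a sketch):
pr <- prcomp(ilData, center = TRUE, scale. = TRUE)
cumVar <- cumsum(pr$sdev^2) / sum(pr$sdev^2)
which(cumVar >= 0.9)[1]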
# 4. Create a training data set consisting of only the predictors with variable names beginning with IL and the diagnosis. Build two predictive models, one using the predictors as they are and one using PCA with principal components explaining 80% of the variance in the predictors. Use method="glm" in the train function. What is the accuracy of each method in the test set? Which is more accurate?
library(caret)
library(AppliedPredictiveModeling)
set.seed(3433)
data(AlzheimerDisease)
adData = data.frame(diagnosis,predictors)
inTrain = createDataPartition(adData$diagnosis, p = 3/4)[[1]]
training = adData[ inTrain,]
testing = adData[-inTrain,]
set.seed(3433)
# Keep the diagnosis column plus all columns beginning with IL.
training2 <- adData[, grep('^IL|diagnosis', names(adData))]
# Create training and testing set.
inTrain = createDataPartition(training2$diagnosis, p = 3/4)[[1]]
training = training2[inTrain, ]
testing = training2[-inTrain, ]
set.seed(3433)
# Train the model with a generalized linear model.
fit1 <- train(diagnosis ~ ., data=training, method='glm')
# Get result on testing set and display accuracy.
predictions <- predict(fit1, newdata = testing)
confusionMatrix(predictions, testing$diagnosis)
# Do it again, this time pre-processing the predictors with PCA (retaining 80% of the variance).
# Note: the formula must be diagnosis ~ . (not training$diagnosis ~ .), otherwise
# the diagnosis column is also picked up as a predictor of itself.
fit2 <- train(diagnosis ~ ., method = "glm", preProcess = "pca", data = training,
              trControl = trainControl(preProcOptions = list(thresh = 0.8)))
# Get result on testing set and display accuracy.
predictions2 <- predict(fit2, newdata = testing)
confusionMatrix(predictions2, testing$diagnosis)
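# To answer "which is more accurate" directly, the two test-set accuracies can
# be pulled from the confusionMatrix objects and compared side by side (a small
# convenience sketch):
c(nonPCA = confusionMatrix(predictions, testing$diagnosis)$overall['Accuracy'],
  PCA = confusionMatrix(predictions2, testing$diagnosis)$overall['Accuracy'])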