Last active
August 16, 2017 01:56
-
-
Save idrissrasheed/55557c1c633b474a572a9f7e7bf83ec4 to your computer and use it in GitHub Desktop.
Pima Diabetes Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Pima Indian Women Code" | |
author: "Idris Rasheed" | |
date: "August 12, 2017" | |
output: pdf_document | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
``` | |
```{r, echo=FALSE} | |
library(corrplot) | |
library(e1071) | |
library(RSNNS) | |
library(caret) | |
library(AppliedPredictiveModeling) | |
library(mice) | |
library(randomForest) | |
library(VIM) | |
library(kernlab) | |
``` | |
Reading Data | |
```{r reading} | |
# Column names for the headerless data file: eight predictors followed by the
# outcome column.
headers <- c(
  "TotalPregnancies", "PlasmaGlucose", "DiastolicPressure", "FoldThickness",
  "Insulin", "BMI", "PedigreeFunction", "Age", "Diagnosis"
)

# Location of the raw data. Kept in `data_url` (not `url`) so the base
# function url() is not masked by a character variable of the same name.
data_url <- paste0(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/",
  "pima-indians-diabetes/pima-indians-diabetes.data"
)

# read.csv() accepts a complete URL as its `file` argument, so no explicit
# url() connection is needed.
diabetes <- read.csv(data_url, header = FALSE, col.names = headers)
``` | |
Exploratory Analysis | |
```{r str} | |
# Compact structural overview of the imported data frame.
str(object = diabetes)
``` | |
```{r pair} | |
# Scatterplot matrix across all columns of the data frame.
pairs(x = diabetes)
``` | |
Loop | |
```{r change 0} | |
# Recode zeros in columns 2-6 (PlasmaGlucose through BMI) as NA so they are
# treated as missing values by the imputation step later on.
# NOTE(review): presumably a zero is not a plausible measurement for these
# variables -- confirm against the data set description.
#
# Vectorised per column instead of a cell-by-cell loop: cells that are already
# NA are skipped rather than aborting an `if` with "missing value where
# TRUE/FALSE needed", and a data frame with zero rows is simply left unchanged.
diabetes[2:6] <- lapply(diabetes[2:6], function(column) {
  column[!is.na(column) & column == 0] <- NA
  column
})
``` | |
Factor | |
```{r factor} | |
# Replace the numeric outcome with a labelled factor: 0 becomes "NotDiabetic",
# every other value becomes "Diabetic".
diabetes$Diagnosis <- as.factor(
  ifelse(diabetes$Diagnosis != 0, "Diabetic", "NotDiabetic")
)
``` | |
Density | |
```{r densities} | |
# Density plot of each of the eight predictors, split by diagnosis class.
transparentTheme(trans = 0.9)
featurePlot(
  x = diabetes[, 1:8],
  y = diabetes$Diagnosis,
  plot = "density",
  # The remaining arguments are passed on to the underlying lattice call:
  # free x/y scales give every predictor its own axis range.
  scales = list(
    x = list(relation = "free"),
    y = list(relation = "free")
  ),
  adjust = 1.5,
  pch = "|",
  layout = c(4, 1),
  auto.key = list(columns = 2)
)
``` | |
Missing Values | |
```{r} | |
# Count how many cells of the data frame are missing (TRUE) versus present
# (FALSE).
table(is.na(diabetes))
``` | |
```{r agg} | |
# Missingness summary for the five columns whose zeros were recoded as NA.
# `numbers = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved constant.
aggr(diabetes[, 2:6], cex.lab = 1, cex.axis = 0.4, numbers = TRUE, gap = 0)
``` | |
```{r scatter missing} | |
# Scatterplot matrix of the data with missing-value information (VIM).
scattmatrixMiss(diabetes)
``` | |
Predictive Mean Matching
```{r temp data} | |
# Impute the missing cells with predictive mean matching, producing three
# imputed data sets; the seed makes the imputation reproducible.
com.diabetes <- mice(
  diabetes,
  method = "pmm",
  m = 3,
  seed = 100
)
``` | |
```{r} | |
# Density plot of the imputation object, used to eyeball the imputed values.
densityplot(com.diabetes)
``` | |
```{r form} | |
# Replace the working data with a completed data set. `action = 1` (the
# default, made explicit here) selects the first of the three imputations;
# the other two are not used downstream.
diabetes <- complete(com.diabetes, action = 1)
``` | |
```{r corr} | |
# Pairwise correlations between the predictors (column 9, the outcome factor,
# is excluded), shown as numbers in the lower triangle.
corrplot(
  cor(diabetes[, -9]),
  method = "number",
  type = "lower"
)
``` | |
```{r scale} | |
# Standardise the eight predictors; scale() centres and scales by default.
# NOTE(review): the centring/scaling statistics are computed on the full data
# set, before the train/test split further down -- confirm this is intended.
diabetes[, 1:8] <- scale(diabetes[, 1:8])
``` | |
Cross-Validation | |
```{r folds} | |
# Resampling scheme shared by all caret models below: 10-fold cross-validation
# repeated 10 times. Class probabilities and the two-class summary are enabled
# so that models can be compared on the "ROC" metric.
Folds <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
``` | |
Training and Testing | |
```{r training} | |
# Random 70/30 split of the rows into training and test sets.
sampleSize <- floor(0.7 * nrow(diabetes))

# Fixed seed so the same rows are drawn on every run.
set.seed(100)
Ind <- sample.int(nrow(diabetes), size = sampleSize)

# Columns 1-8 are the predictors; column 9 is the Diagnosis factor.
XTrain <- diabetes[Ind, 1:8]
YTrain <- diabetes[Ind, 9]
XTest <- diabetes[-Ind, 1:8]
YTest <- diabetes[-Ind, 9]
``` | |
Linear Classification | |
```{r linear svm} | |
# Tune a linear SVM over three cost values using the resampling scheme in
# `Folds`, with candidates ranked by the "ROC" metric.
linear.tune <- expand.grid(C = c(0.1, 1, 10))
set.seed(100)
# `tuneLength` has been removed: caret only uses it to generate a default grid
# when no `tuneGrid` is given, so next to an explicit grid it had no effect and
# only suggested that ten cost values were being searched.
linear.svm <- train(
  x = XTrain,
  y = YTrain,
  method = "svmLinear",
  metric = "ROC",
  trControl = Folds,
  tuneGrid = linear.tune
)
linear.svm
``` | |
```{r} | |
# Linear-kernel SVM fitted with e1071 at cost = 0.1; this object is what the
# linear decision-region plots further down are drawn from.
# NOTE(review): the fit uses every row of `diabetes`, not only the training
# subset -- confirm this is intended.
linear.fit <- svm(
  Diagnosis ~ .,
  data = diabetes,
  kernel = "linear",
  cost = 0.1
)
summary(linear.fit)
``` | |
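The summary reports gamma = 0.125, which is simply e1071's default (1 / number of predictors = 1/8). A linear kernel does not use gamma, so this value has no effect on the fit and does not by itself indicate anything about the model's bias or variance.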
## Radial Classification
```{r radial svm} | |
# Tune a radial-kernel SVM over every combination of four kernel widths
# (sigma) and five cost values, ranked by the "ROC" metric under `Folds`.
radial.svm.expand <- expand.grid(
  sigma = c(0.2, 0.4, 0.6, 0.8),
  C = c(0.1, 1, 5, 10, 100)
)
set.seed(100)
radial.svm <- train(
  x = XTrain,
  y = YTrain,
  method = "svmRadial",
  metric = "ROC",
  trControl = Folds,
  tuneGrid = radial.svm.expand
)
radial.svm
``` | |
```{r radial fit} | |
# Radial-kernel SVM fitted with e1071; this object is what the radial
# decision-region plots further down are drawn from.
#
# The kernel width is passed as `gamma`. `sigma` is kernlab's name for this
# parameter and is not an argument of e1071::svm(), so the intended value of
# 0.2 never reached the fit and the default gamma was used instead. Both
# parameters multiply the squared distance in the RBF kernel, so sigma = 0.2
# corresponds to gamma = 0.2.
# NOTE(review): the fit uses every row of `diabetes`, not only the training
# subset -- confirm this is intended.
radial.fit <- svm(
  Diagnosis ~ .,
  data = diabetes,
  kernel = "radial",
  cost = 1,
  gamma = 0.2
)
summary(radial.fit)
``` | |
## Random Forest
```{r randomforest} | |
# Tune a random forest over mtry = 2..8 (number of predictors tried at each
# split), ranked by the "ROC" metric under `Folds`.
rf.expand <- expand.grid(mtry = 2:8)
set.seed(100)
randomforest <- train(
  x = XTrain,
  y = YTrain,
  method = "rf",
  metric = "ROC",
  trControl = Folds,
  tuneGrid = rf.expand
)
randomforest
``` | |
Plots | |
```{r rf varimpplot} | |
# Variable-importance plot for the tuned forest; type = 2 selects the
# node-impurity based importance measure.
varImpPlot(
  randomforest$finalModel,
  type = 2,
  main = "Random Forest"
)
``` | |
```{r linear plot BMI} | |
# Linear SVM classification regions in the PlasmaGlucose-vs-BMI plane.
plot(linear.fit, diabetes, PlasmaGlucose ~ BMI)
``` | |
```{r linear TP} | |
# Linear SVM classification regions in the PlasmaGlucose-vs-TotalPregnancies
# plane.
plot(linear.fit, diabetes, PlasmaGlucose ~ TotalPregnancies)
``` | |
```{r radial plot bmi} | |
# Radial SVM classification regions in the PlasmaGlucose-vs-BMI plane.
plot(radial.fit, diabetes, PlasmaGlucose ~ BMI)
``` | |
```{r radial plot TP} | |
# Radial SVM classification regions in the PlasmaGlucose-vs-TotalPregnancies
# plane.
plot(radial.fit, diabetes, PlasmaGlucose ~ TotalPregnancies)
``` | |
## Accuracy
```{r rf acc} | |
# Random forest: share of training rows whose predicted class matches the
# true label.
rf.predictTR <- predict(randomforest$finalModel, newdata = XTrain)
rf.train.accuracy <- mean(YTrain == rf.predictTR)
rf.train.accuracy
``` | |
```{r rf acc test} | |
# Random forest: share of held-out test rows whose predicted class matches the
# true label.
rf.predictTT <- predict(randomforest$finalModel, newdata = XTest)
rf.test.accuracy <- mean(YTest == rf.predictTT)
rf.test.accuracy
``` | |
```{r lin acc} | |
# Linear SVM: share of training rows whose predicted class matches the true
# label.
linear.svm.predictTR <- predict(linear.svm$finalModel, newdata = XTrain)
linear.svm.train.accuracy <- mean(YTrain == linear.svm.predictTR)
linear.svm.train.accuracy
``` | |
```{r lin acc test} | |
# Linear SVM: share of held-out test rows whose predicted class matches the
# true label (test accuracy).
linear.svm.predictTT <- predict(linear.svm$finalModel, newdata = XTest)
linear.svm.test.accuracy <- mean(YTest == linear.svm.predictTT)
linear.svm.test.accuracy
``` | |
```{r radial acc} | |
# Radial SVM: share of training rows whose predicted class matches the true
# label.
radial.svm.predictTR <- predict(radial.svm$finalModel, newdata = XTrain)
radial.svm.train.accuracy <- mean(YTrain == radial.svm.predictTR)
radial.svm.train.accuracy
``` | |
```{r rad test} | |
# Radial SVM: share of held-out test rows whose predicted class matches the
# true label.
radial.svm.predictTT <- predict(radial.svm$finalModel, newdata = XTest)
radial.svm.test.accuracy <- mean(YTest == radial.svm.predictTT)
radial.svm.test.accuracy
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment