Last active
October 4, 2020 16:10
-
-
Save BroVic/01c9b192e672655da496b4c244dde321 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dslabs) | |
library(dplyr) | |
library(lubridate) | |
library(caret) | |
data("reported_heights") | |
dat <- | |
mutate(reported_heights, date_time = ymd_hms(time_stamp)) %>% | |
filter(date_time >= make_date(2016, 01, 25) & | |
date_time < make_date(2016, 02, 1)) %>% | |
mutate(type = ifelse( | |
day(date_time) == 25 & | |
hour(date_time) == 8 & | |
between(minute(date_time), 15, 30), | |
'inclass', | |
'online' | |
)) %>% | |
select(sex, type) | |
y <- factor(dat$sex, c("Female", "Male")) | |
x <- factor(dat$type) | |
## Solution code | |
dat %>% group_by(type) %>% summarise(prop_female = mean(sex == 'Female')) | |
## Explore the data a little | |
dat %>% | |
group_by(type) %>% | |
summarise(Av.Male = mean(sex == "Male"), Av.Female = mean(sex == "Female")) | |
## Guessing | |
y_hat <- sample(c("Male", "Female"), length(index), replace = TRUE) %>% | |
factor(levels = levels(y)) | |
mean(y_hat == y) | |
## Overall Accuracy | |
y_hat <- ifelse(train$type == 'online', 'Male', 'Female') %>% | |
factor(levels = levels(y)) | |
mean(y_hat == y) | |
## use 'table()' to creae a confusion matrix | |
table(predicted = y_hat, actual = y) | |
## Compute the sensitivity | |
sensitivity(y_hat, y) | |
## Prevalence of females | |
## My solution | |
sum("Female" %in% y) / length(y) | |
## Given solution | |
mean(y == "Female") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(caret) | |
library(tidyverse) | |
data("iris") | |
iris <- iris[-which(iris$Species == 'setosa'), ] | |
y <- iris$Species | |
## Create evenly partitioned data | |
set.seed(2) | |
test_index <- createDataPartition(y, times = 1, p = .5, list = FALSE) | |
test <- iris[test_index, ] | |
train <- iris[-test_index, ] | |
## Assess overall accuracy for the features and find | |
## the one feature that has the highest overall accuracy | |
## My solution: | |
features <- colnames(train)[-5] | |
structure(map(features, function(x) { | |
dat <- train %>% | |
group_by(Species) %>% | |
summarise(average = mean(Sepal.Length), sd = sd(Sepal.Length)) | |
ind_chosen <- which.max(dat$average) | |
av <- dat$average[ind_chosen] | |
sd <- dat$sd[ind_chosen] | |
y_hat <- ifelse(train[[x]] > (av - (2 * sd)), "virginica", "versicolor") | |
mean(y_hat == train$Species) | |
}), names = features) | |
## Given solution (with modifications to enable plotting) | |
foo <- function(x) { | |
rangedValues <- seq(range(x)[1], range(x)[2], by = 0.1) | |
acc <- sapply(rangedValues, function(i) { | |
y_hat <- ifelse(x > i, 'virginica', 'versicolor') | |
mean(y_hat == train$Species) | |
}) | |
print(qplot(rangedValues, acc, geom = c('line', 'point'))) | |
acc | |
} | |
predictions <- apply(train[ , -5], 2, foo) | |
sapply(predictions, max) | |
## Overall accuracy for the test data | |
values <- seq(range(train$Petal.Length)[1], range(train$Petal.Length)[2], by = 0.1) | |
smart_cutoff <- values[which.max(predictions$Petal.Length)] | |
smart_cutoff | |
y_hat <- ifelse(test$Petal.Length > smart_cutoff, "virginica", "versicolor") | |
mean(y_hat == test$Species) | |
## Check to see which feature will optimize our predictions, using the test data | |
predictions_w <- apply(test[ , -5], 2, foo) | |
sapply(predictions, max) | |
## Check overall accuracy using a combination of petal length and petal width | |
values_w <- seq(range(train$Petal.Width)[1], range(train$Petal.Width)[2], by = 0.1) | |
smart_cutoff_w <- values_w[which.max(predictions_w$Petal.Width)] | |
smart_cutoff_w | |
y_hat <- ifelse( | |
(test$Petal.Length > smart_cutoff | test$Petal.Width > smart_cutoff_w), | |
'virginica', | |
'versicolor' | |
) | |
mean(y_hat == test$Species) | |
## Correct result obtained but here is the given solution for the exercise | |
## Note that it starts afresh, but this is totally unnecessary | |
data('iris') | |
iris <- iris[-which(iris$Species == 'setosa'), ] | |
y <- iris$Species | |
plot(iris, pch = 21, bg = iris$Species) | |
set.seed(2) | |
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE) | |
test <- iris[test_index, ] | |
train <- iris[-test_index, ] | |
petalLengthRange <- | |
seq(range(train[, 3])[1], range(train[, 3])[2], by = 0.1) | |
petalWidthRange <- | |
seq(range(train[, 4])[1], range(train[, 4])[2], by = 0.1) | |
cutoffs <- expand.grid(petalLengthRange, petalWidthRange) | |
id <- sapply(seq(nrow(cutoffs)), function(i) { | |
y_hat <- | |
ifelse(train[, 3] > cutoffs[i, 1] | | |
train[, 4] > cutoffs[i, 2], 'virginica', 'versicolor') | |
mean(y_hat == train$Species) | |
}) %>% which.max | |
optimalCutoff <- cutoffs[id, ] %>% as.numeric | |
y_hat <- | |
ifelse(test[, 3] > optimalCutoff[1] & | |
test[, 4] > optimalCutoff[2], | |
'virginica', | |
'versicolor') | |
mean(y_hat == test$Species) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# comp-check4.R | |
# Conditional probabilities | |
set.seed(1) | |
disease <- | |
sample(c(0, 1), | |
size = 1e6, | |
replace = TRUE, | |
prob = c(0.98, 0.02)) | |
test <- rep(NA, 1e6) | |
test[disease == 0] <- | |
sample( | |
c(0, 1), | |
size = sum(disease == 0), | |
replace = TRUE, | |
prob = c(0.90, 0.10) | |
) | |
test[disease == 1] <- | |
sample( | |
c(0, 1), | |
size = sum(disease == 1), | |
replace = TRUE, | |
prob = c(0.15, 0.85) | |
) | |
## Probability of a positive test | |
mean(test) | |
## Probability of disease when test is negative | |
mean(test == 0 & disease == 1) # mean(disease[test == 0]) | |
## Probability of disease when the test is positive | |
diseased_testedPos <- mean(disease[test == 1]) | |
diseased_testedPos | |
## Relative risk of disease if test is positive | |
diseased_testedPos / mean(disease == 1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# comp-check5.R | |
# Conditional probabilities | |
library(dslabs) | |
library(tidyverse) | |
data("heights") | |
## Round heights to nearest whole number and find | |
## proportion of males for given heights. Plot result. | |
heights %>% | |
mutate(height = round(height)) %>% | |
group_by(height) %>% | |
summarize(p = mean(sex == "Male")) %>% | |
qplot(height, p, data = .) | |
## Noting high variability of lower height values, | |
## create groups with same number of points using | |
## percentiles. Plot the result | |
ps <- seq(0, 1, 0.1) # each group of 10% | |
heights %>% | |
mutate(g = cut(height, quantile(height, ps), include.lowest = TRUE)) %>% | |
group_by(g) %>% | |
summarise(p = mean(sex == "Male"), height = mean(height)) %>% | |
qplot(height, p, data = .) | |
## Generate data from a bivariate normal distribution | |
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2) | |
dat <- MASS::mvrnorm(n = 10000, c(69, 69), Sigma = Sigma) %>% | |
data.frame() %>% | |
setNames(c('x', 'y')) | |
dat %>% | |
mutate(g = cut(x, quantile(x, ps), include.lowest = TRUE)) %>% | |
group_by(g) %>% | |
summarise(y = mean(y), x = mean(x)) %>% | |
qplot(x, y, data = .) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# overall-accuracy.R | |
## Lecture notes | |
library(dslabs) | |
library(dplyr) | |
data(heights) | |
## Define predictor 'x' and outcome 'y' | |
y <- heights$sex | |
x <- heights$height | |
## Create training and test sets | |
set.seed(2) | |
test_index <- | |
caret::createDataPartition(y, times = 1, p = 0.5, list = FALSE) | |
train_set <- heights[-test_index, ] | |
test_set <- heights[test_index, ] | |
## Guessing the outcome | |
y_hat <- sample(c("Male", "Female"), length(test_index), replace = TRUE) %>% | |
factor(levels = levels(test_set$sex)) | |
## Overall accuracy is defined as the overall proportion that is | |
## predicted correctly. | |
mean(y_hat == test_set$sex) | |
## Now, to go beyond mere guessing | |
heights %>% | |
group_by(sex) %>% | |
summarise(mean(height), sd(height)) | |
## Predict Male when height is within 2 standard deviations for the Males | |
y_hat <- ifelse(x > 62, "Male", "Female") %>% | |
factor(levels = levels(test_set$sex)) | |
mean(y_hat == test_set$sex) | |
## Now, we get the best value by examining accuracy for different cutoffs | |
cutoff <- seq(61, 70) | |
accuracy <- purrr::map_dbl(cutoff, function(x) { | |
y_hat <- ifelse(train_set$height > x, "Male", "Female") %>% | |
factor(levels = levels(test_set$sex)) | |
mean(y_hat == train_set$sex) | |
}) | |
qplot(cutoff, accuracy, geom = c("point", "line")) | |
max(accuracy) | |
(best_cutoff <- cutoff[which.max(accuracy)]) | |
## Test cutoff on test set | |
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>% | |
factor(levels = levels(test_set$sex)) | |
y_hat <- factor(y_hat) | |
mean(y_hat == test_set$sex) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment