## Gist by @BroVic (last active October 4, 2020)
library(dslabs)
library(dplyr)
library(lubridate)
library(caret)
data("reported_heights")
dat <- mutate(reported_heights, date_time = ymd_hms(time_stamp)) %>%
  filter(date_time >= make_date(2016, 01, 25) &
           date_time < make_date(2016, 02, 1)) %>%
  mutate(type = ifelse(
    day(date_time) == 25 & hour(date_time) == 8 &
      between(minute(date_time), 15, 30),
    'inclass',
    'online'
  )) %>%
  select(sex, type)
y <- factor(dat$sex, c("Female", "Male"))
x <- factor(dat$type)
## Solution code
dat %>% group_by(type) %>% summarise(prop_female = mean(sex == 'Female'))
## Explore the data a little
dat %>%
  group_by(type) %>%
  summarise(Av.Male = mean(sex == "Male"), Av.Female = mean(sex == "Female"))
## Guessing at random; expected accuracy is about 0.5
y_hat <- sample(c("Male", "Female"), length(y), replace = TRUE) %>%
  factor(levels = levels(y))
mean(y_hat == y)
## Overall accuracy when predicting 'Male' for the online students
y_hat <- ifelse(dat$type == 'online', 'Male', 'Female') %>%
  factor(levels = levels(y))
mean(y_hat == y)
## Use 'table()' to create a confusion matrix
table(predicted = y_hat, actual = y)
## Compute the sensitivity
sensitivity(y_hat, y)
## Prevalence of females
## My solution
sum(y == "Female") / length(y)
## Given solution
mean(y == "Female")
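## Added for reference (not part of the original check): caret's
## confusionMatrix() reports accuracy, sensitivity, specificity and
## prevalence in a single call. With these factor levels, 'Female' is the
## first level and so is treated as the positive class by default.
confusionMatrix(data = y_hat, reference = y)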
library(caret)
library(tidyverse)
data("iris")
iris <- iris[-which(iris$Species == 'setosa'), ]
y <- iris$Species
## Create evenly partitioned data
set.seed(2)
test_index <- createDataPartition(y, times = 1, p = .5, list = FALSE)
test <- iris[test_index, ]
train <- iris[-test_index, ]
## Assess overall accuracy for the features and find
## the one feature that has the highest overall accuracy
## My solution:
features <- colnames(train)[-5]
structure(map(features, function(x) {
  dat <- train %>%
    group_by(Species) %>%
    summarise(average = mean(.data[[x]]), sd = sd(.data[[x]]))
  ind_chosen <- which.max(dat$average)
  av <- dat$average[ind_chosen]
  sd <- dat$sd[ind_chosen]
  y_hat <- ifelse(train[[x]] > (av - (2 * sd)), "virginica", "versicolor")
  mean(y_hat == train$Species)
}), names = features)
## Given solution (with modifications to enable plotting)
foo <- function(x) {
  rangedValues <- seq(range(x)[1], range(x)[2], by = 0.1)
  acc <- sapply(rangedValues, function(i) {
    y_hat <- ifelse(x > i, 'virginica', 'versicolor')
    mean(y_hat == train$Species)
  })
  print(qplot(rangedValues, acc, geom = c('line', 'point')))
  acc
}
predictions <- apply(train[ , -5], 2, foo)
sapply(predictions, max)
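## Added: extract the name of the single best feature on the training set;
## this is the feature whose cutoff is searched for below
names(which.max(sapply(predictions, max)))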
## Overall accuracy for the test data
values <- seq(range(train$Petal.Length)[1], range(train$Petal.Length)[2], by = 0.1)
smart_cutoff <- values[which.max(predictions$Petal.Length)]
smart_cutoff
y_hat <- ifelse(test$Petal.Length > smart_cutoff, "virginica", "versicolor")
mean(y_hat == test$Species)
## Check to see which feature will optimize our predictions, using the test data
predictions_w <- apply(test[ , -5], 2, foo)
sapply(predictions_w, max)
## Check overall accuracy using a combination of petal length and petal width
values_w <- seq(range(train$Petal.Width)[1], range(train$Petal.Width)[2], by = 0.1)
smart_cutoff_w <- values_w[which.max(predictions_w$Petal.Width)]
smart_cutoff_w
y_hat <- ifelse(
  test$Petal.Length > smart_cutoff | test$Petal.Width > smart_cutoff_w,
  'virginica',
  'versicolor'
)
mean(y_hat == test$Species)
## Correct result obtained, but here is the given solution for the exercise.
## Note that it starts afresh, which is unnecessary, and that it combines
## the two cutoffs with '&' rather than '|' when predicting on the test set.
data('iris')
iris <- iris[-which(iris$Species == 'setosa'), ]
y <- iris$Species
plot(iris, pch = 21, bg = iris$Species)
set.seed(2)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test <- iris[test_index, ]
train <- iris[-test_index, ]
petalLengthRange <- seq(range(train[, 3])[1], range(train[, 3])[2], by = 0.1)
petalWidthRange <- seq(range(train[, 4])[1], range(train[, 4])[2], by = 0.1)
cutoffs <- expand.grid(petalLengthRange, petalWidthRange)
id <- sapply(seq(nrow(cutoffs)), function(i) {
  y_hat <- ifelse(train[, 3] > cutoffs[i, 1] | train[, 4] > cutoffs[i, 2],
                  'virginica', 'versicolor')
  mean(y_hat == train$Species)
}) %>% which.max
optimalCutoff <- cutoffs[id, ] %>% as.numeric
y_hat <- ifelse(test[, 3] > optimalCutoff[1] & test[, 4] > optimalCutoff[2],
                'virginica', 'versicolor')
mean(y_hat == test$Species)
# comp-check4.R
# Conditional probabilities
set.seed(1)
disease <- sample(c(0, 1), size = 1e6, replace = TRUE, prob = c(0.98, 0.02))
test <- rep(NA, 1e6)
test[disease == 0] <- sample(c(0, 1), size = sum(disease == 0),
                             replace = TRUE, prob = c(0.90, 0.10))
test[disease == 1] <- sample(c(0, 1), size = sum(disease == 1),
                             replace = TRUE, prob = c(0.15, 0.85))
## Probability of a positive test
mean(test)
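## Cross-check (added): by the law of total probability, the simulated rate
## should be close to P(+) = 0.85 * 0.02 + 0.10 * 0.98 = 0.115
0.85 * 0.02 + 0.10 * 0.98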
## Probability of disease when the test is negative
mean(disease[test == 0]) # the joint P(neg & diseased) is mean(test == 0 & disease == 1)
## Probability of disease when the test is positive
diseased_testedPos <- mean(disease[test == 1])
diseased_testedPos
## Relative risk of disease if test is positive
diseased_testedPos / mean(disease == 1)
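## Cross-check (added): Bayes' theorem gives
## P(D | +) = P(+ | D) * P(D) / P(+) = 0.85 * 0.02 / 0.115 ~ 0.148,
## so the relative risk should be near 0.148 / 0.02 ~ 7.4
(0.85 * 0.02 / 0.115) / 0.02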
# comp-check5.R
# Conditional probabilities
library(dslabs)
library(tidyverse)
data("heights")
## Round heights to nearest whole number and find
## proportion of males for given heights. Plot result.
heights %>%
  mutate(height = round(height)) %>%
  group_by(height) %>%
  summarize(p = mean(sex == "Male")) %>%
  qplot(height, p, data = .)
## Noting the high variability at the lower height values, create groups
## with equal numbers of points using percentiles, then plot the result
ps <- seq(0, 1, 0.1) # deciles: each group holds ~10% of the data
heights %>%
  mutate(g = cut(height, quantile(height, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(p = mean(sex == "Male"), height = mean(height)) %>%
  qplot(height, p, data = .)
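## Sanity check (added): the quantile-based bins should contain roughly
## equal numbers of observations
heights %>%
  mutate(g = cut(height, quantile(height, ps), include.lowest = TRUE)) %>%
  count(g)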
## Generate data from a bivariate normal distribution
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2)
dat <- MASS::mvrnorm(n = 10000, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
dat %>%
  mutate(g = cut(x, quantile(x, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(y = mean(y), x = mean(x)) %>%
  qplot(x, y, data = .)
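## Added sketch: for this bivariate normal, E[y | x] = 69 + 0.5 * (x - 69)
## because rho = 0.5 and the two SDs are equal (both 3). Overlaying that
## line shows the binned means track the regression line.
dat %>%
  mutate(g = cut(x, quantile(x, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(y = mean(y), x = mean(x)) %>%
  ggplot(aes(x, y)) +
  geom_point() +
  geom_abline(intercept = 69 * (1 - 0.5), slope = 0.5)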
# overall-accuracy.R
## Lecture notes
library(dslabs)
library(dplyr)
library(ggplot2) # needed for qplot() below
data(heights)
## Define predictor 'x' and outcome 'y'
y <- heights$sex
x <- heights$height
## Create training and test sets
set.seed(2)
test_index <-
caret::createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- heights[-test_index, ]
test_set <- heights[test_index, ]
## Guessing the outcome
y_hat <- sample(c("Male", "Female"), length(test_index), replace = TRUE) %>%
  factor(levels = levels(test_set$sex))
## Overall accuracy is defined as the overall proportion that is
## predicted correctly.
mean(y_hat == test_set$sex)
## Now, to go beyond mere guessing
heights %>%
  group_by(sex) %>%
  summarise(mean(height), sd(height))
## Predict 'Male' when height is above 62 inches, i.e. within 2 standard
## deviations of the male average (roughly 69.3 - 2 * 3.6)
y_hat <- ifelse(x > 62, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))
mean(y_hat == test_set$sex)
## Now, we get the best value by examining accuracy for different cutoffs
cutoff <- seq(61, 70)
accuracy <- purrr::map_dbl(cutoff, function(x) {
  y_hat <- ifelse(train_set$height > x, "Male", "Female") %>%
    factor(levels = levels(test_set$sex))
  mean(y_hat == train_set$sex)
})
qplot(cutoff, accuracy, geom = c("point", "line"))
max(accuracy)
(best_cutoff <- cutoff[which.max(accuracy)])
## Test cutoff on test set
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))
mean(y_hat == test_set$sex)
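## Follow-up check (added, not in the lecture notes): the full confusion
## matrix reports sensitivity and specificity per class, which shows how
## maximizing overall accuracy can favour the more prevalent class
caret::confusionMatrix(data = y_hat, reference = test_set$sex)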