## Gist by @BroVic (last active October 4, 2020)
library(dslabs)
library(dplyr)
library(lubridate)
library(caret)
data("reported_heights")
dat <- mutate(reported_heights, date_time = ymd_hms(time_stamp)) %>%
  filter(date_time >= make_date(2016, 01, 25) &
           date_time < make_date(2016, 02, 1)) %>%
  mutate(type = ifelse(
    day(date_time) == 25 & hour(date_time) == 8 &
      between(minute(date_time), 15, 30),
    'inclass',
    'online'
  )) %>%
  select(sex, type)
y <- factor(dat$sex, c("Female", "Male"))
x <- factor(dat$type)
## Solution code
dat %>% group_by(type) %>% summarise(prop_female = mean(sex == 'Female'))
## Explore the data a little
dat %>%
  group_by(type) %>%
  summarise(Av.Male = mean(sex == "Male"), Av.Female = mean(sex == "Female"))
## Guessing at random; expected accuracy is about 0.5
y_hat <- sample(c("Male", "Female"), length(y), replace = TRUE) %>%
  factor(levels = levels(y))
mean(y_hat == y)
## Overall accuracy when predicting 'Male' for the online students
y_hat <- ifelse(dat$type == 'online', 'Male', 'Female') %>%
  factor(levels = levels(y))
mean(y_hat == y)
## Use 'table()' to create a confusion matrix
table(predicted = y_hat, actual = y)
## Compute the sensitivity
sensitivity(y_hat, y)
## Prevalence of females
## My solution
sum(y == "Female") / length(y)
## Given solution
mean(y == "Female")
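## Added for reference (not part of the original check): caret's
## confusionMatrix() reports accuracy, sensitivity, specificity and
## prevalence in a single call. With these factor levels, 'Female' is the
## first level and so is treated as the positive class by default.
confusionMatrix(data = y_hat, reference = y)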
library(caret)
library(tidyverse)
data("iris")
iris <- iris[-which(iris$Species == 'setosa'), ]
y <- iris$Species
## Create evenly partitioned data
set.seed(2)
test_index <- createDataPartition(y, times = 1, p = .5, list = FALSE)
test <- iris[test_index, ]
train <- iris[-test_index, ]
## Assess overall accuracy for the features and find
## the one feature that has the highest overall accuracy
## My solution:
features <- colnames(train)[-5]
structure(map(features, function(x) {
  dat <- train %>%
    group_by(Species) %>%
    summarise(average = mean(.data[[x]]), sd = sd(.data[[x]]))
  ind_chosen <- which.max(dat$average)
  av <- dat$average[ind_chosen]
  sd <- dat$sd[ind_chosen]
  y_hat <- ifelse(train[[x]] > (av - (2 * sd)), "virginica", "versicolor")
  mean(y_hat == train$Species)
}), names = features)
## Given solution (with modifications to enable plotting)
foo <- function(x) {
  rangedValues <- seq(range(x)[1], range(x)[2], by = 0.1)
  acc <- sapply(rangedValues, function(i) {
    y_hat <- ifelse(x > i, 'virginica', 'versicolor')
    mean(y_hat == train$Species)
  })
  print(qplot(rangedValues, acc, geom = c('line', 'point')))
  acc
}
predictions <- apply(train[ , -5], 2, foo)
sapply(predictions, max)
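## Added: extract the name of the single best feature on the training set;
## this is the feature whose cutoff is searched for below
names(which.max(sapply(predictions, max)))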
## Overall accuracy for the test data
values <- seq(range(train$Petal.Length)[1], range(train$Petal.Length)[2], by = 0.1)
smart_cutoff <- values[which.max(predictions$Petal.Length)]
smart_cutoff
y_hat <- ifelse(test$Petal.Length > smart_cutoff, "virginica", "versicolor")
mean(y_hat == test$Species)
## Check to see which feature will optimize our predictions, using the test data
predictions_w <- apply(test[ , -5], 2, foo)
sapply(predictions_w, max)
## Check overall accuracy using a combination of petal length and petal width
values_w <- seq(range(train$Petal.Width)[1], range(train$Petal.Width)[2], by = 0.1)
smart_cutoff_w <- values_w[which.max(predictions_w$Petal.Width)]
smart_cutoff_w
y_hat <- ifelse(
  test$Petal.Length > smart_cutoff | test$Petal.Width > smart_cutoff_w,
  'virginica',
  'versicolor'
)
mean(y_hat == test$Species)
## Correct result obtained, but here is the given solution for the exercise.
## Note that it starts afresh, which is unnecessary, and that it combines
## the two cutoffs with '&' rather than '|' when predicting on the test set.
data('iris')
iris <- iris[-which(iris$Species == 'setosa'), ]
y <- iris$Species
plot(iris, pch = 21, bg = iris$Species)
set.seed(2)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
test <- iris[test_index, ]
train <- iris[-test_index, ]
petalLengthRange <- seq(range(train[, 3])[1], range(train[, 3])[2], by = 0.1)
petalWidthRange <- seq(range(train[, 4])[1], range(train[, 4])[2], by = 0.1)
cutoffs <- expand.grid(petalLengthRange, petalWidthRange)
id <- sapply(seq(nrow(cutoffs)), function(i) {
  y_hat <- ifelse(train[, 3] > cutoffs[i, 1] | train[, 4] > cutoffs[i, 2],
                  'virginica', 'versicolor')
  mean(y_hat == train$Species)
}) %>% which.max
optimalCutoff <- cutoffs[id, ] %>% as.numeric
y_hat <- ifelse(test[, 3] > optimalCutoff[1] & test[, 4] > optimalCutoff[2],
                'virginica', 'versicolor')
mean(y_hat == test$Species)
# comp-check4.R
# Conditional probabilities
set.seed(1)
disease <- sample(c(0, 1), size = 1e6, replace = TRUE, prob = c(0.98, 0.02))
test <- rep(NA, 1e6)
test[disease == 0] <- sample(c(0, 1), size = sum(disease == 0),
                             replace = TRUE, prob = c(0.90, 0.10))
test[disease == 1] <- sample(c(0, 1), size = sum(disease == 1),
                             replace = TRUE, prob = c(0.15, 0.85))
## Probability of a positive test
mean(test)
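## Cross-check (added): by the law of total probability, the simulated rate
## should be close to P(+) = 0.85 * 0.02 + 0.10 * 0.98 = 0.115
0.85 * 0.02 + 0.10 * 0.98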
## Probability of disease when the test is negative
mean(disease[test == 0]) # the joint P(neg & diseased) is mean(test == 0 & disease == 1)
## Probability of disease when the test is positive
diseased_testedPos <- mean(disease[test == 1])
diseased_testedPos
## Relative risk of disease if test is positive
diseased_testedPos / mean(disease == 1)
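## Cross-check (added): Bayes' theorem gives
## P(D | +) = P(+ | D) * P(D) / P(+) = 0.85 * 0.02 / 0.115 ~ 0.148,
## so the relative risk should be near 0.148 / 0.02 ~ 7.4
(0.85 * 0.02 / 0.115) / 0.02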
# comp-check5.R
# Conditional probabilities
library(dslabs)
library(tidyverse)
data("heights")
## Round heights to nearest whole number and find
## proportion of males for given heights. Plot result.
heights %>%
  mutate(height = round(height)) %>%
  group_by(height) %>%
  summarize(p = mean(sex == "Male")) %>%
  qplot(height, p, data = .)
## Noting the high variability at the lower height values, create groups
## with equal numbers of points using percentiles, then plot the result
ps <- seq(0, 1, 0.1) # deciles: each group holds ~10% of the data
heights %>%
  mutate(g = cut(height, quantile(height, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(p = mean(sex == "Male"), height = mean(height)) %>%
  qplot(height, p, data = .)
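## Sanity check (added): the quantile-based bins should contain roughly
## equal numbers of observations
heights %>%
  mutate(g = cut(height, quantile(height, ps), include.lowest = TRUE)) %>%
  count(g)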
## Generate data from a bivariate normal distribution
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2)
dat <- MASS::mvrnorm(n = 10000, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
dat %>%
  mutate(g = cut(x, quantile(x, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(y = mean(y), x = mean(x)) %>%
  qplot(x, y, data = .)
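## Added sketch: for this bivariate normal, E[y | x] = 69 + 0.5 * (x - 69)
## because rho = 0.5 and the two SDs are equal (both 3). Overlaying that
## line shows the binned means track the regression line.
dat %>%
  mutate(g = cut(x, quantile(x, ps), include.lowest = TRUE)) %>%
  group_by(g) %>%
  summarise(y = mean(y), x = mean(x)) %>%
  ggplot(aes(x, y)) +
  geom_point() +
  geom_abline(intercept = 69 * (1 - 0.5), slope = 0.5)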
# overall-accuracy.R
## Lecture notes
library(dslabs)
library(dplyr)
library(ggplot2) # needed for qplot() below
data(heights)
## Define predictor 'x' and outcome 'y'
y <- heights$sex
x <- heights$height
## Create training and test sets
set.seed(2)
test_index <-
caret::createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- heights[-test_index, ]
test_set <- heights[test_index, ]
## Guessing the outcome
y_hat <- sample(c("Male", "Female"), length(test_index), replace = TRUE) %>%
  factor(levels = levels(test_set$sex))
## Overall accuracy is defined as the overall proportion that is
## predicted correctly.
mean(y_hat == test_set$sex)
## Now, to go beyond mere guessing
heights %>%
  group_by(sex) %>%
  summarise(mean(height), sd(height))
## Predict 'Male' when height is above 62 inches, i.e. within 2 standard
## deviations of the male average (roughly 69.3 - 2 * 3.6)
y_hat <- ifelse(x > 62, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))
mean(y_hat == test_set$sex)
## Now, we get the best value by examining accuracy for different cutoffs
cutoff <- seq(61, 70)
accuracy <- purrr::map_dbl(cutoff, function(x) {
  y_hat <- ifelse(train_set$height > x, "Male", "Female") %>%
    factor(levels = levels(test_set$sex))
  mean(y_hat == train_set$sex)
})
qplot(cutoff, accuracy, geom = c("point", "line"))
max(accuracy)
(best_cutoff <- cutoff[which.max(accuracy)])
## Test cutoff on test set
y_hat <- ifelse(test_set$height > best_cutoff, "Male", "Female") %>%
  factor(levels = levels(test_set$sex))
mean(y_hat == test_set$sex)
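## Follow-up check (added, not in the lecture notes): the full confusion
## matrix reports sensitivity and specificity per class, which shows how
## maximizing overall accuracy can favour the more prevalent class
caret::confusionMatrix(data = y_hat, reference = test_set$sex)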