Harvard PH125.8x: Section 2
# comp-check1.R
library(tidyverse)
library(MASS)
library(caret)
set.seed(1)
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2)
dat <- mvrnorm(n = 100, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
set.seed(1)
test_index <- createDataPartition(dat$y, list = FALSE)
train_set <- dat[-test_index,]
test_set <- dat[test_index,]
# Guessing using the average
avg <- mean(train_set$y)
avg
## Squared loss...
mean((avg - test_set$y) ^ 2)
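## For comparison with the regression results below (which report RMSE),
## the same baseline expressed as a root mean squared error:
sqrt(mean((avg - test_set$y) ^ 2))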
## Helper: simulate a bivariate normal dataset with the given size,
## means and covariance matrix
d <- function(size, means, Sigma, names) {
  set.seed(1)
  mvrnorm(n = size, mu = means, Sigma = Sigma) %>%
    data.frame() %>%
    setNames(names)
}
## Helper: mean and SD of the RMSE of a linear model over 100 random
## train/test partitions of `data`
f <- function(data) {
  rmse <- replicate(100, {
    test_index <- createDataPartition(data$y, list = FALSE)
    train_set <- data[-test_index, ]
    test_set <- data[test_index, ]
    ## Train a linear model
    fit <- lm(y ~ x, data = train_set)
    y_hat <- predict(fit, test_set)
    ## Root mean squared error
    sqrt(mean((y_hat - test_set$y) ^ 2))
  })
  structure(c(mean(rmse), sd(rmse)), names = c('mean', 'sd'))
}
data <- d(100, c(69, 69), Sigma, c('x', 'y'))
set.seed(1)
results <- f(data)
results
# Repeat using larger datasets
## Size of datasets
n <- c(100, 500, 1000, 5000, 10000)
names(n) <- as.character(n)
datasets <- map(n, function(size) {
  set.seed(1)
  d(size = size, means = c(69, 69), Sigma = Sigma, names = c('x', 'y'))
})
set.seed(1)
results <- do.call(rbind, map(datasets, f))
results
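## Optional sketch: how the RMSE and its spread change with sample size
## (assumes `results` is the matrix assembled above, with columns mean and sd)
data.frame(n = n, results) %>%
  ggplot(aes(n, mean)) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd)) +
  scale_x_log10() +
  labs(y = 'RMSE')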
# Q4: Make the correlation of x and y larger
set.seed(1)
n <- 100
Sigma <- 9 * matrix(c(1.0, 0.95, 0.95, 1.0), 2, 2)
dat <- mvrnorm(n = n, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
## We can just use our function 'f'
f(dat)
# Q6: Create data set
set.seed(1)
n <- 100
Sigma <- matrix(c(1.0, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.25, 1.0), 3, 3)
dat <- mvrnorm(n = n, c(0, 0, 0), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('y', 'x_1', 'x_2'))
set.seed(1)
test_index <- createDataPartition(dat$y, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
avg <- mean(train_set$y)
avg
## Test for the best model, i.e. the one with the lowest RMSE
fit_x_1 <- lm(y ~ x_1, data = train_set)
fit_x_2 <- lm(y ~ x_2, data = train_set)
fit_x12 <- lm(y ~ x_1 + x_2, data = train_set)
y_hat_x1 <- predict(fit_x_1, test_set)
y_hat_x2 <- predict(fit_x_2, test_set)
y_hat_x12 <- predict(fit_x12, test_set)
lossFx <- function(yhat) sqrt(mean((yhat - test_set$y)^2))  # RMSE
predList <- list(x1 = y_hat_x1, x2 = y_hat_x2, x_1_2 = y_hat_x12)
listOfRMSEs <- lapply(predList, lossFx)
vecOfRMSEs <- structure(unlist(listOfRMSEs), names = names(listOfRMSEs))
bestModel <- names(which.min(vecOfRMSEs))
cat('The best model is', bestModel, '\n')
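## As a quick baseline for context: the RMSE of simply guessing the
## training mean, computed with the same loss function
lossFx(avg)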
# comp-check2.R
library(tidyverse)
library(caret)
set.seed(2)
## Simulate a two-class dataset (a mixture of two normals) and split it
## into training and test sets
make_data <- function(n = 1000,
                      p = 0.5,
                      mu_0 = 0,
                      mu_1 = 2,
                      sigma_0 = 1,
                      sigma_1 = 1) {
  y <- rbinom(n, 1, p)
  f_0 <- rnorm(n, mu_0, sigma_0)
  f_1 <- rnorm(n, mu_1, sigma_1)
  x <- ifelse(y == 1, f_1, f_0)
  test_index <- createDataPartition(y, list = FALSE)
  list(
    train = data.frame(x = x, y = as.factor(y)) %>% slice(-test_index),
    test = data.frame(x = x, y = as.factor(y)) %>% slice(test_index)
  )
}
dat <- make_data()
dat$train %>%
  ggplot(aes(x, color = y)) +
  geom_density()
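## A quick numeric companion to the density plot: the class-conditional
## means and SDs in the training sample should sit near the mu_0/mu_1 and
## sigma_0/sigma_1 defaults of make_data()
dat$train %>%
  group_by(y) %>%
  summarise(mean = mean(x), sd = sd(x))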
# Generate 25 datasets with the difference between the class means ranging
# from 0 to 3, and plot accuracy vs mu_1
delta <- seq(0, 3, len = 25)
## First we compute accuracy using our initial dataset to
## develop a prototype computation. Since we have a binary
## outcome, we will use a logistic regression model (at
## this stage of the course that's all we know to use)
## Now, we start by training the model
fit <- glm(y ~ x, data = dat$train, family = 'binomial')
p_hat <- predict(fit, dat$test, type = 'response')
y_hat <- factor(ifelse(p_hat > 0.5, '1', '0'))
cfm <- confusionMatrix(y_hat, reference = dat$test$y)
cfm$overall['Accuracy']
## Extending this to the problem
acc <- map_dbl(delta, function(mu) {
  dat <- make_data(mu_1 = mu)
  fit <- glm(y ~ x, data = dat$train, family = 'binomial')
  p_hat <- predict(fit, dat$test, type = 'response')
  y_hat <- factor(ifelse(p_hat > 0.5, '1', '0'))
  cfm <- confusionMatrix(y_hat, reference = dat$test$y)
  cfm$overall['Accuracy']
})
qplot(delta, acc)
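## Optional: accuracy rises as the class means separate; a loess smoother
## makes the trend easier to see
qplot(delta, acc) + geom_smooth(method = 'loess')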
# comp-check3.R
# Smoothing
library(tidyverse)
library(lubridate)
library(pdftools)
fn <- system.file('extdata',
                  'RD-Mortality-Report_2015-18-180531.pdf',
                  package = 'dslabs')
## Create the data frame
dat <- map_df(str_split(pdf_text(fn), '\n'), function(s) {
  s <- str_trim(s)
  header_index <- str_which(s, '2015')[1]
  tmp <- str_split(s[header_index], '\\s+', simplify = TRUE)
  month <- tmp[1]
  header <- tmp[-1]
  tail_index <- str_which(s, 'Total')
  n <- str_count(s, '\\d+')
  out <- c(1:header_index,
           which(n == 1),
           which(n >= 28),
           tail_index:length(s))
  s[-out] %>%
    str_remove_all('[^\\d\\s]') %>%
    str_trim() %>%
    str_split_fixed('\\s+', n = 6) %>%
    .[, 1:5] %>%
    as_data_frame() %>%
    setNames(c('day', header)) %>%
    mutate(month = month,
           day = as.numeric(day)) %>%
    gather(year, deaths, -c(day, month)) %>%
    mutate(deaths = as.numeric(deaths))
}) %>%
  mutate(month = recode(month,
                        'JAN' = 1, 'FEB' = 2, 'MAR' = 3, 'APR' = 4,
                        'MAY' = 5, 'JUN' = 6, 'JUL' = 7, 'AGO' = 8,
                        'SEP' = 9, 'OCT' = 10, 'NOV' = 11, 'DEC' = 12)) %>%
  mutate(date = make_date(year, month, day)) %>%
  filter(date <= '2018-05-01')
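## Optional sanity check on the parsed table (the exact counts depend on
## the PDF parsing above): row count and any missing death counts
dat %>%
  summarise(rows = n(), missing_deaths = sum(is.na(deaths)))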
all_days <- diff(range(dat$date))   # total time span of the data, in days
span <- 60 / as.numeric(all_days)   # two-month smoothing window, as a proportion
## An earlier attempt, kept commented out for reference; the working
## version below converts `date` to a numeric vector before calling loess()
# fit <- loess(deaths ~ date, degree = 1L, data = dat, span = span)
# ggplot(dat, aes(date, deaths)) +
#   geom_point() +
#   geom_smooth(color = 'red', span = span, method = 'loess',
#               method.args = list(degree = 1))
fit <- dat %>%
  mutate(x = as.numeric(date)) %>%
  loess(deaths ~ x, data = ., span = span, degree = 1)
dat %>%
  mutate(smooth = predict(fit, as.numeric(date))) %>%
  ggplot() +
  geom_point(aes(date, deaths)) +
  geom_line(aes(date, smooth), lwd = 2, col = 2)
## Produce a plot with a curve fitted for each year
dat %>%
  mutate(smooth = predict(fit, as.numeric(date)), day = yday(date)) %>%
  ggplot(aes(day, smooth, col = year)) +
  geom_line(lwd = 2)
## Predict 2s and 7s in the mnist_27 dataset
library(broom)
library(dslabs)
mnist_27$train %>%
  glm(y ~ x_2, family = 'binomial', data = .) %>%
  tidy()
qplot(x_2, y, data = mnist_27$train)
## Fit a smoothing model using loess
dat2 <- mnist_27$train %>%
  mutate(y2 = ifelse(y == '7', 1, 0))  # code the 7s as 1, matching the given solution below
fit2 <- loess(y2 ~ x_2, data = dat2, family = 'symmetric', degree = 1)
## Fit a loess line to the above data
dat2 %>%
  mutate(smooth = predict(fit2, newdata = .)) %>%
  arrange(x_2) %>%   # sort by x_2 so the fitted line is drawn in order
  ggplot() +
  geom_point(aes(x_2, y2)) +
  geom_line(aes(x_2, smooth))
## Given solution
mnist_27$train %>%
  mutate(y = ifelse(y == '7', 1, 0)) %>%
  ggplot(aes(x_2, y)) +
  geom_smooth(method = 'loess')