Harvard PH125.8x: Section 2
# comp-check1.R
library(tidyverse)
library(MASS)
library(caret)
set.seed(1)
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2)
dat <- mvrnorm(n = 100, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
set.seed(1)
test_index <- createDataPartition(dat$y, list = FALSE)
train_set <- dat[-test_index,]
test_set <- dat[test_index,]
# Guessing using the average
avg <- mean(train_set$y)
avg
## Squared loss...
mean((avg - test_set$y) ^ 2)
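## For comparison with the regression results below (which report RMSE),
## the same baseline expressed as a root mean squared error:
sqrt(mean((avg - test_set$y) ^ 2))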
## Helper: simulate a bivariate normal dataset with the given size,
## means and covariance matrix
d <- function(size, means, Sigma, names) {
  set.seed(1)
  mvrnorm(n = size, mu = means, Sigma = Sigma) %>%
    data.frame() %>%
    setNames(names)
}
## Helper: mean and SD of the RMSE of a linear model over 100 random
## train/test partitions of `data`
f <- function(data) {
  rmse <- replicate(100, {
    test_index <- createDataPartition(data$y, list = FALSE)
    train_set <- data[-test_index, ]
    test_set <- data[test_index, ]
    ## Train a linear model
    fit <- lm(y ~ x, data = train_set)
    y_hat <- predict(fit, test_set)
    ## Root mean squared error
    sqrt(mean((y_hat - test_set$y) ^ 2))
  })
  structure(c(mean(rmse), sd(rmse)), names = c('mean', 'sd'))
}
data <- d(100, c(69, 69), Sigma, c('x', 'y'))
set.seed(1)
results <- f(data)
results
# Repeat using larger datasets
## Size of datasets
n <- c(100, 500, 1000, 5000, 10000)
names(n) <- as.character(n)
datasets <- map(n, function(size) {
  set.seed(1)
  d(size = size, means = c(69, 69), Sigma = Sigma, names = c('x', 'y'))
})
set.seed(1)
results <- do.call(rbind, map(datasets, f))
results
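## Optional sketch: how the RMSE and its spread change with sample size
## (assumes `results` is the matrix assembled above, with columns mean and sd)
data.frame(n = n, results) %>%
  ggplot(aes(n, mean)) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd)) +
  scale_x_log10() +
  labs(y = 'RMSE')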
# Q4: Make the correlation of x and y larger
set.seed(1)
n <- 100
Sigma <- 9 * matrix(c(1.0, 0.95, 0.95, 1.0), 2, 2)
dat <- mvrnorm(n = n, c(69, 69), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('x', 'y'))
## We can just use our function 'f'
f(dat)
# Q6: Create data set
set.seed(1)
n <- 100
Sigma <- matrix(c(1.0, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.25, 1.0), 3, 3)
dat <- mvrnorm(n = n, c(0, 0, 0), Sigma = Sigma) %>%
  data.frame() %>%
  setNames(c('y', 'x_1', 'x_2'))
set.seed(1)
test_index <- createDataPartition(dat$y, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
avg <- mean(train_set$y)
avg
## Test for the best model, i.e. the one with the lowest RMSE
fit_x_1 <- lm(y ~ x_1, data = train_set)
fit_x_2 <- lm(y ~ x_2, data = train_set)
fit_x12 <- lm(y ~ x_1 + x_2, data = train_set)
y_hat_x1 <- predict(fit_x_1, test_set)
y_hat_x2 <- predict(fit_x_2, test_set)
y_hat_x12 <- predict(fit_x12, test_set)
lossFx <- function(yhat) sqrt(mean((yhat - test_set$y)^2))  # RMSE
predList <- list(x1 = y_hat_x1, x2 = y_hat_x2, x_1_2 = y_hat_x12)
listOfRMSEs <- lapply(predList, lossFx)
vecOfRMSEs <- structure(unlist(listOfRMSEs), names = names(listOfRMSEs))
bestModel <- names(which.min(vecOfRMSEs))
cat('The best model is', bestModel, '\n')
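## As a quick baseline for context: the RMSE of simply guessing the
## training mean, computed with the same loss function
lossFx(avg)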
# comp-check2.R
library(tidyverse)
library(caret)
set.seed(2)
## Simulate a two-class dataset (a mixture of two normals) and split it
## into training and test sets
make_data <- function(n = 1000,
                      p = 0.5,
                      mu_0 = 0,
                      mu_1 = 2,
                      sigma_0 = 1,
                      sigma_1 = 1) {
  y <- rbinom(n, 1, p)
  f_0 <- rnorm(n, mu_0, sigma_0)
  f_1 <- rnorm(n, mu_1, sigma_1)
  x <- ifelse(y == 1, f_1, f_0)
  test_index <- createDataPartition(y, list = FALSE)
  list(
    train = data.frame(x = x, y = as.factor(y)) %>% slice(-test_index),
    test = data.frame(x = x, y = as.factor(y)) %>% slice(test_index)
  )
}
dat <- make_data()
dat$train %>%
  ggplot(aes(x, color = y)) +
  geom_density()
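## A quick numeric companion to the density plot: the class-conditional
## means and SDs in the training sample should sit near the mu_0/mu_1 and
## sigma_0/sigma_1 defaults of make_data()
dat$train %>%
  group_by(y) %>%
  summarise(mean = mean(x), sd = sd(x))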
# Generate 25 datasets with the difference between the class means ranging
# from 0 to 3, and plot accuracy vs mu_1
delta <- seq(0, 3, len = 25)
## First we compute accuracy using our initial dataset to
## develop a prototype computation. Since we have a binary
## outcome, we will use a logistic regression model (at
## this stage of the course that's all we know to use)
## Now, we start by training the model
fit <- glm(y ~ x, data = dat$train, family = 'binomial')
p_hat <- predict(fit, dat$test, type = 'response')
y_hat <- factor(ifelse(p_hat > 0.5, '1', '0'))
cfm <- confusionMatrix(y_hat, reference = dat$test$y)
cfm$overall['Accuracy']
## Extending this to the problem
acc <- map_dbl(delta, function(mu) {
  dat <- make_data(mu_1 = mu)
  fit <- glm(y ~ x, data = dat$train, family = 'binomial')
  p_hat <- predict(fit, dat$test, type = 'response')
  y_hat <- factor(ifelse(p_hat > 0.5, '1', '0'))
  cfm <- confusionMatrix(y_hat, reference = dat$test$y)
  cfm$overall['Accuracy']
})
qplot(delta, acc)
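## Optional: accuracy rises as the class means separate; a loess smoother
## makes the trend easier to see
qplot(delta, acc) + geom_smooth(method = 'loess')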
# comp-check3.R
# Smoothing
library(tidyverse)
library(lubridate)
library(pdftools)
fn <- system.file('extdata',
                  'RD-Mortality-Report_2015-18-180531.pdf',
                  package = 'dslabs')
## Create the data frame
dat <- map_df(str_split(pdf_text(fn), '\n'), function(s) {
  s <- str_trim(s)
  header_index <- str_which(s, '2015')[1]
  tmp <- str_split(s[header_index], '\\s+', simplify = TRUE)
  month <- tmp[1]
  header <- tmp[-1]
  tail_index <- str_which(s, 'Total')
  n <- str_count(s, '\\d+')
  out <- c(1:header_index,
           which(n == 1),
           which(n >= 28),
           tail_index:length(s))
  s[-out] %>%
    str_remove_all('[^\\d\\s]') %>%
    str_trim() %>%
    str_split_fixed('\\s+', n = 6) %>%
    .[, 1:5] %>%
    as_data_frame() %>%
    setNames(c('day', header)) %>%
    mutate(month = month,
           day = as.numeric(day)) %>%
    gather(year, deaths, -c(day, month)) %>%
    mutate(deaths = as.numeric(deaths))
}) %>%
  mutate(month = recode(month,
                        'JAN' = 1, 'FEB' = 2, 'MAR' = 3, 'APR' = 4,
                        'MAY' = 5, 'JUN' = 6, 'JUL' = 7, 'AGO' = 8,
                        'SEP' = 9, 'OCT' = 10, 'NOV' = 11, 'DEC' = 12)) %>%
  mutate(date = make_date(year, month, day)) %>%
  filter(date <= '2018-05-01')
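## Optional sanity check on the parsed table (the exact counts depend on
## the PDF parsing above): row count and any missing death counts
dat %>%
  summarise(rows = n(), missing_deaths = sum(is.na(deaths)))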
all_days <- diff(range(dat$date))   # total time span of the data, in days
span <- 60 / as.numeric(all_days)   # two-month smoothing window, as a proportion
## An earlier attempt, kept commented out for reference; the working
## version below converts `date` to a numeric vector before calling loess()
# fit <- loess(deaths ~ date, degree = 1L, data = dat, span = span)
# ggplot(dat, aes(date, deaths)) +
#   geom_point() +
#   geom_smooth(color = 'red', span = span, method = 'loess',
#               method.args = list(degree = 1))
fit <- dat %>%
  mutate(x = as.numeric(date)) %>%
  loess(deaths ~ x, data = ., span = span, degree = 1)
dat %>%
  mutate(smooth = predict(fit, as.numeric(date))) %>%
  ggplot() +
  geom_point(aes(date, deaths)) +
  geom_line(aes(date, smooth), lwd = 2, col = 2)
## Produce a plot with a curve fitted for each year
dat %>%
  mutate(smooth = predict(fit, as.numeric(date)), day = yday(date)) %>%
  ggplot(aes(day, smooth, col = year)) +
  geom_line(lwd = 2)
## Predict 2s and 7s in the mnist_27 dataset
library(broom)
library(dslabs)
mnist_27$train %>%
  glm(y ~ x_2, family = 'binomial', data = .) %>%
  tidy()
qplot(x_2, y, data = mnist_27$train)
## Fit a smoothing model using loess
dat2 <- mnist_27$train %>%
  mutate(y2 = ifelse(y == '7', 1, 0))  # code the 7s as 1, matching the given solution below
fit2 <- loess(y2 ~ x_2, data = dat2, family = 'symmetric', degree = 1)
## Fit a loess line to the above data
dat2 %>%
  mutate(smooth = predict(fit2, newdata = .)) %>%
  arrange(x_2) %>%   # sort by x_2 so the fitted line is drawn in order
  ggplot() +
  geom_point(aes(x_2, y2)) +
  geom_line(aes(x_2, smooth))
## Given solution
mnist_27$train %>%
  mutate(y = ifelse(y == '7', 1, 0)) %>%
  ggplot(aes(x_2, y)) +
  geom_smooth(method = 'loess')