Last active
May 19, 2020 18:30
-
-
Save BroVic/60839a3e11bafda996eb3d5eeddec2fa to your computer and use it in GitHub Desktop.
Harvard PH125.8x: Section 2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# comp-check1.R | |
library(tidyverse) | |
library(MASS) | |
library(caret) | |
set.seed(1) | |
Sigma <- 9 * matrix(c(1, 0.5, 0.5, 1), 2, 2) | |
dat <- mvrnorm(n = 100, c(69, 69), Sigma = Sigma) %>% | |
data.frame() %>% | |
setNames(c('x', 'y')) | |
set.seed(1) | |
test_index <- createDataPartition(dat$y, list = FALSE) | |
train_set <- dat[-test_index,] | |
test_set <- dat[test_index,] | |
# Guessing using the average | |
avg <- mean(train_set$y) | |
avg | |
## Squared loss... | |
mean((avg - test_set$y) ^ 2) | |
d <- function(size, means, Sigma, names) { | |
set.seed(1) | |
mvrnorm(n = size, mu = means, Sigma = Sigma) %>% | |
data.frame() %>% | |
setNames(names) | |
} | |
f <- function(data) { | |
rmse <- replicate(100, { | |
test_index <- createDataPartition(data$y, list = FALSE) | |
train_set <- data[-test_index,] | |
test_set <- data[test_index,] | |
## Train a linear model | |
fit <- lm(y ~ x, data = train_set) | |
y_hat <- predict(fit, test_set) | |
## Squared loss... | |
sqrt(mean((y_hat - test_set$y) ^ 2)) | |
}) | |
structure(c(mean(rmse), sd(rmse)), names = c('mean', 'sd')) | |
} | |
data <- d(100, c(69, 69), Sigma, c('x', 'y')) | |
set.seed(1) | |
results <- f(data) | |
results | |
# Repeat using larger datasets | |
## Size of datasets | |
n <- c(100, 500, 1000, 5000, 10000) | |
names(n) <- as.character(n) | |
results <- map(n, function(x) { | |
set.seed(1) | |
data <- d(size = x, means = c(69, 69), Sigma = Sigma, names = c('x', 'y')) | |
}) | |
set.seed(1) | |
results <- map(results, f) %>% | |
rbind() | |
results | |
# Q4: Make the correlation of x and y larger | |
set.seed(1) | |
n <- 100 | |
Sigma <- 9 * matrix(c(1.0, 0.95, 0.95, 1.0), 2, 2) | |
dat <- mvrnorm(n = 100, c(69, 69), Sigma = Sigma) %>% | |
data.frame() %>% | |
setNames(c('x', 'y')) | |
## We can just use our function 'f' | |
f(n, Sigma = Sigma) | |
# Q6: Create data set | |
set.seed(1) | |
n <- 1000 | |
Sigma <- matrix(c(1.0, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.25, 1.0), 3, 3) | |
dat <- mvrnorm(n = 100, c(0, 0, 0), Sigma = Sigma) %>% | |
data.frame() %>% | |
setNames(c('y', 'x_1', 'x_2')) | |
set.seed(1) | |
test_index <- createDataPartition(dat$y, list = FALSE) | |
train_set <- dat %>% slice(-test_index) | |
test_set <- dat %>% slice(test_index) | |
avg <- mean(train_set$y) | |
avg | |
## Test for best model i.e. lowest RMSE | |
fit_x_1 <- lm(y ~ x_1, data = train_set) | |
fit_x_2 <- lm(y ~ x_2, data = train_set) | |
fit_x12 <- lm(y ~ x_1 + x_2, data = train_set) | |
y_hat_x1 <- predict(fit_x_1, test_set) | |
y_hat_x2 <- predict(fit_x_2, test_set) | |
y_hat_x12 <- predict(fit_x12, test_set) | |
lossFx <- function(yhat) mean(sqrt((yhat - test_set$y)^2)) | |
predList <- list(x1 = y_hat_x1, x2 = y_hat_x2, x_1_2 = y_hat_x12) | |
listOfRMSEs <- lapply(predList, lossFx) | |
vecOfRMSEs <- structure(unlist(listOfRMSEs), names = names(listOfRMSEs)) | |
bestModel <- names(which.min(vecOfRMSEs)) | |
cat('The best model is', bestModel, '\n') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# comp-check2.R | |
library(tidyverse) | |
library(caret) | |
set.seed(2) | |
make_data <- | |
function(n = 1000, | |
p = 0.5, | |
mu_0 = 0, | |
mu_1 = 2, | |
sigma_0 = 1, | |
sigma_1 = 1) { | |
y <- rbinom(n, 1, p) | |
f_0 <- rnorm(n, mu_0, sigma_0) | |
f_1 <- rnorm(n, mu_1, sigma_1) | |
x <- ifelse(y == 1, f_1, f_0) | |
test_index <- createDataPartition(y, list = FALSE) | |
list( | |
train = data.frame(x = x, y = as.factor(y)) %>% slice(-test_index), | |
test = data.frame(x = x, y = as.factor(y)) %>% slice(test_index) | |
) | |
} | |
dat <- make_data() | |
dat$train %>% | |
ggplot(aes(x, color = y)) + | |
geom_density() | |
# Plot 25 different datasets with difference ranging from 0 to 3 | |
# and plot accuracy vs mu_1 | |
delta <- seq(0, 3, len = 25) | |
## First we compute accuracy using our initial dataset to | |
## develop a prototype computation. Since we have a binary | |
## outcome, we will use a logistic regression model (at | |
## this stage of the course that's all we know to use) | |
## Now, we start by training the model | |
fit <- glm(y ~ x, data = dat$train, family = 'binomial') | |
p_hat <- predict(fit, dat$test) | |
y_hat <- factor(ifelse(p_hat > 0.5, '1', '0')) | |
cfm <- confusionMatrix(y_hat, reference = dat$test$y) | |
cfm$overall['Accuracy'] | |
## Extending this to the problem | |
acc <- map_dbl(delta, function(x) { | |
dat <- make_data(mu_1 = x) | |
fit <- glm(y ~ x, data = dat$train, family = 'binomial') | |
p_hat <- predict(fit, dat$test, type = 'response') | |
y_hat <- factor(ifelse(p_hat > 0.5, '1', '0')) | |
cfm <- confusionMatrix(y_hat, reference = dat$test$y) | |
cfm$overall['Accuracy'] | |
}) | |
qplot(delta, acc) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# comp-check3.R | |
# Smoothing | |
library(tidyverse) | |
library(lubridate) | |
library(pdftools) | |
fn <- | |
system.file('extdata', | |
'RD-Mortality-Report_2015-18-180531.pdf', | |
package = 'dslabs') | |
## Create the data frame | |
dat <- map_df(str_split(pdf_text(fn), '\n'), function(s) { | |
s <- str_trim(s) | |
header_index <- str_which(s, '2015')[1] | |
tmp <- str_split(s[header_index], "\\s+", simplify = TRUE) | |
month <- tmp[1] | |
header <- tmp[-1] | |
tail_index <- str_which(s, 'Total') | |
n <- str_count(s, "\\d+") | |
out <- | |
c(1:header_index, | |
which(n == 1), | |
which(n >= 28), | |
tail_index:length(s)) | |
s[-out] %>% | |
str_remove_all('[^\\d\\s]') %>% | |
str_trim() %>% | |
str_split_fixed('\\s+', n = 6) %>% | |
.[, 1:5] %>% | |
as_data_frame() %>% | |
setNames(c('day', header)) %>% | |
mutate(month = month, | |
day = as.numeric(day)) %>% | |
gather(year, deaths, -c(day, month)) %>% | |
mutate(deaths = as.numeric(deaths)) | |
}) %>% | |
mutate( | |
month = recode( | |
month, | |
'JAN' = 1, | |
'FEB' = 2, | |
'MAR' = 3, | |
'APR' = 4, | |
'MAY' = 5, | |
'JUN' = 6, | |
'JUL' = 7, | |
'AGO' = 8, | |
'SEP' = 9, | |
'OCT' = 10, | |
'NOV' = 11, | |
'DEC' = 12 | |
) | |
) %>% | |
mutate(date = make_date(year, month, day)) %>% | |
filter(date <= '2018-05-01') | |
all_days <- diff(range(dat$date)) | |
span <- 60/as.numeric(all_days) | |
# fit <- | |
# loess( | |
# deaths ~ date, | |
# degree = 1L, | |
# data = dat, | |
# span = span | |
# ) | |
# ggplot(dat, aes(date, deaths)) + | |
# geom_point() + | |
# geom_smooth( | |
# color = 'red', | |
# span = span, | |
# method = 'loess', | |
# method.args = list(degree = 1) | |
# ) | |
fit <- dat %>% | |
mutate(x = as.numeric(date)) %>% | |
loess(deaths ~ x, data = ., span = span, degree = 1) | |
dat %>% | |
mutate(smooth = predict(fit, as.numeric(date))) %>% | |
ggplot() + | |
geom_point(aes(date, deaths)) + | |
geom_line(aes(date, smooth), lwd = 2, col = 2) | |
## Produce a plot with a curve fitted for each year | |
dat %>% | |
mutate(smooth = predict(fit, as.numeric(date)), day = yday(date)) %>% | |
ggplot(aes(day, smooth, col = year)) + | |
geom_line(lwd = 2) | |
## Predict 2s and 7s in the mnist_27 dataset | |
library(broom) | |
library(dslabs) | |
mnist_27$train %>% | |
glm(y ~ x_2, family = 'binomial', data = .) %>% | |
tidy() | |
qplot(x_2, y, data = mnist_27$train) | |
## Fit a smoothing model using loess | |
dat2 <- mnist_27$train %>% | |
mutate(y2 = recode(y, '2' = 1, '7' = 0)) | |
fit2 <- loess(y2 ~ x_2, data = dat2, family = 'symmetric', degree = 1) | |
## Fit a loess line to the above data | |
dat2 %>% | |
mutate(smooth = predict(fit2, newdata = .)) %>% | |
ggplot() + | |
geom_point(aes(x_2, y2)) + | |
geom_line(aes(x_2, smooth)) | |
## Given solution | |
mnist_27$train %>% | |
mutate(y = ifelse(y=="7", 1, 0)) %>% | |
ggplot(aes(x_2, y)) + | |
geom_smooth(method = "loess") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment