Instantly share code, notes, and snippets.

Embed
What would you like to do?
Intro to Splines
# This is the source code of Albert Y. Kim (rudeboybert)'s "Introduction to
# Splines" video.
# Lines 4-86 were run before the video and hidden from the viewer in order to
# simplify the content of the video.
library(tidyverse)
library(broom)
# Set options, random number generator seed value, & ggplot geom() + theme()
# arguments
options(max.print=25)
set.seed(76)
update_geom_defaults("point", list(size = 0.5))
update_geom_defaults("path", list(size = 1, colour="red"))
update_geom_defaults("line", list(size = 1, colour="blue"))
theme() +
theme_update(axis.title.y=element_blank())
# 1. Create multiple_df ggplot2 object which shows different spline fits with
# differing degrees of freedom for same "y = f(x) + epsilon" function as in the
# video
df_values <- c(2, 10, 25, 50)
f <- function(x){
f_x <- 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10
return(f_x)
}
values <- data_frame(
x = seq(from=0, to=1, length=500),
f_x = f(x),
epsilon = rnorm(500, 0, sd = 2),
y = f_x + epsilon
)
overall <- NULL
for(df in df_values){
overall <- smooth.spline(values$x, values$y, df=df) %>%
augment() %>%
mutate(df=df) %>%
bind_rows(overall)
}
multiple_df <- overall %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y)) +
geom_line(aes(y=.fitted)) +
facet_wrap(~df, nrow=2) +
labs(title="Splines fit w/ different degrees of freedom")
multiple_df
# 2. Create new "y = f(x) + epsilon" values data frame using new (piece-wise
# defined) function
f2 <- function(x){
ifelse(0 < x & x < 5*pi, -6*sin(x),
ifelse(5*pi < x & x < 25, 0,
ifelse(25 < x & x < 32, 0.02*(exp(x-25)-1),
ifelse(32 < x & x < 54, -x+32+0.02*(exp(7)-1), 10)
)
)
)
}
values <- data_frame(
x = runif(2000, min=0, max=60),
f_x = f2(x),
epsilon = rnorm(2000, 0, sd = 12),
y = f_x + epsilon
)
# Exercise, fitted, and truth plot
exercise <- values %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y)) +
labs(title="Points based on new y=f(x)+epsilon")
exercise
fitted <- smooth.spline(values$x, values$y, df=25) %>%
augment() %>%
mutate(df=10) %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y)) +
geom_line(aes(y=.fitted)) +
labs(title = "Spline curve (in blue) fit with degrees of freedom = 25")
fitted
truth <- fitted +
stat_function(fun = f2) +
labs(title = "True function f(x) in red")
truth
# Video starts here --------------------------------------------------------------
library(tidyverse)
library(broom)
# Here is a wacky function f(x)
f <- function(x){
f_x <- 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10
return(f_x)
}
# For 500 equally spaced values of x between 0 & 1, let's compute and plot f(x) in red.
# Recall that f(x) is the systematic component, or "the signal"
values <- data_frame(
x = seq(from=0, to=1, length=500),
f_x = f(x)
)
values %>%
ggplot(aes(x=x)) +
stat_function(fun = f)
# We now add the unsystematic error component epsilon to f(x) i.e. the noise, to
# obtain our y's, and hence our observed points in black (x, y)
values <- values %>%
mutate(
epsilon = rnorm(500, 0, sd = 2),
y = f_x + epsilon
)
values %>%
ggplot(aes(x=x)) +
stat_function(fun = f) +
geom_point(aes(y=y))
# But remember in real life, we won't know the red curve! If we did, then why
# are we doing any of this? All we observe are the black points. Let's "pretend"
# like we don't know what the red curve is!
values %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y))
# We now fit a 'smoothing spline'. Think of it as a piece of string with a
# specified amount of flexibility, where the flexibility is controlled by the
# "degrees of freedom" df. This blue curve is a "guess/estimate" of the red
# curve f(x), which recall, we are pretending we don't know. Also observe how we
# use the broom::augment() function to convert the output of smooth.spline to
# tidy data frame format.
smooth.spline(values$x, values$y, df=5) %>%
broom::augment() %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y)) +
geom_line(aes(y=.fitted))
# Play around with the df argument in smooth.spline() above.
# Now let's compare smoothing splines using four different values of the degrees
# of freedom in a plot I precomputed. Which do you think is best?
multiple_df
# I would say that df=10 roughly is best. df=2 is not quite flexible enough,
# where as df=50 seems to no longer fitting to signal (the true function f) and
# is now fitting to noise. In other words, it is overfitting to this particular
# data set.
multiple_df +
stat_function(fun = f)
# Exercise. Here are a set of points from a different f(x) and epsilon. With
# your finger trace what you think the true f(x) function looks like. In other
# words, separate the signal from the noise!"
exercise
# Let's fit a spline with 25 degrees of freedom. How close is this to the truth?
fitted
# Ready? Here is the truth! How close were you? Note the noise is normal with mean 0
# and sd = 12!
truth
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment