rudeboybert/splines.R

## splines.R
# This is the source code of Albert Y. Kim (rudeboybert)'s "Introduction to
# Splines" video.

# Lines 4-86 were run before the video and hidden from the viewer in order to
# simplify the content of the video.
library(tidyverse)
library(broom)

# Set options, random number generator seed value, & ggplot geom() + theme()
# arguments
options(max.print=25)
set.seed(76)

update_geom_defaults("point", list(size = 0.5))
update_geom_defaults("path", list(size = 1, colour="red"))
update_geom_defaults("line", list(size = 1, colour="blue"))
theme() +
  theme_update(axis.title.y=element_blank())

# 1. Create multiple_df ggplot2 object which shows different spline fits with
# differing degrees of freedom for same "y = f(x) + epsilon" function as in the
# video
df_values <- c(2, 10, 25, 50)
f <- function(x){
  f_x <- 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10
  return(f_x)
}
values <- data_frame(
  x = seq(from=0, to=1, length=500),
  f_x = f(x),
  epsilon = rnorm(500, 0, sd = 2),
  y = f_x + epsilon
)
overall <- NULL
for(df in df_values){
  overall <- smooth.spline(values$x, values$y, df=df) %>%
    augment() %>%
    mutate(df=df) %>%
    bind_rows(overall)
}
multiple_df <- overall %>%
  ggplot(aes(x=x)) +
  geom_point(aes(y=y)) +
  geom_line(aes(y=.fitted)) +
  facet_wrap(~df, nrow=2) +
  labs(title="Splines fit w/ different degrees of freedom")
multiple_df

# 2. Create new "y = f(x) + epsilon" values data frame using new (piece-wise
# defined) function
f2 <- function(x){
  ifelse(0 < x & x < 5*pi, -6*sin(x),
         ifelse(5*pi < x & x < 25,  0,
                ifelse(25 < x & x < 32, 0.02*(exp(x-25)-1),
                       ifelse(32 < x & x < 54, -x+32+0.02*(exp(7)-1), 10)
                )
         )
  )
}
values <- data_frame(
  x = runif(2000, min=0, max=60),
  f_x = f2(x),
  epsilon = rnorm(2000, 0, sd = 12),
  y = f_x + epsilon
)

# Exercise, fitted, and truth plot
exercise <- values %>%
  ggplot(aes(x=x)) +
  geom_point(aes(y=y)) +
  labs(title="Points based on new y=f(x)+epsilon")
exercise

fitted <- smooth.spline(values$x, values$y, df=25) %>%
  augment() %>%
  mutate(df=10) %>%
  ggplot(aes(x=x)) +
  geom_point(aes(y=y)) +
  geom_line(aes(y=.fitted)) +
  labs(title = "Spline curve (in blue) fit with degrees of freedom = 25")
fitted

truth <- fitted +
  stat_function(fun = f2) +
  labs(title = "True function f(x) in red")
truth


# Video starts here --------------------------------------------------------------
library(tidyverse)
library(broom)

# Here is a wacky function f(x)
f <- function(x){
  f_x <- 0.2*x^11*(10*(1-x))^6+10*(10*x)^3*(1-x)^10
  return(f_x)
}

# For 500 equally spaced values of x between 0 & 1, let's compute and plot f(x) in red.
# Recall that f(x) is the systematic component, or "the signal"
values <- data_frame(
  x = seq(from=0, to=1, length=500),
  f_x = f(x)
)
values %>%
  ggplot(aes(x=x)) +
  stat_function(fun = f)

# We now add the unsystematic error component epsilon to f(x) i.e. the noise, to
# obtain our y's, and hence our observed points in black (x, y)
values <- values %>%
  mutate(
    epsilon = rnorm(500, 0, sd = 2),
    y = f_x + epsilon
  )
values %>%
  ggplot(aes(x=x)) +
  stat_function(fun = f) +
  geom_point(aes(y=y))

# But remember in real life, we won't know the red curve! If we did, then why
# are we doing any of this? All we observe are the black points. Let's "pretend"
# like we don't know what the red curve is!
values %>%
  ggplot(aes(x=x)) +
  geom_point(aes(y=y))

# We now fit a 'smoothing spline'. Think of it as a piece of string with a
# specified amount of flexibility, where the flexibility is controlled by the
# "degrees of freedom" df. This blue curve is a "guess/estimate" of the red
# curve f(x), which recall, we are pretending we don't know. Also observe how we
# use the broom::augment() function to convert the output of smooth.spline to
# tidy data frame format.
smooth.spline(values$x, values$y, df=5) %>%
  broom::augment() %>%
  ggplot(aes(x=x)) +
  geom_point(aes(y=y)) +
  geom_line(aes(y=.fitted))

# Play around with the df argument in smooth.spline() above.

# Now let's compare smoothing splines using four different values of the degrees
# of freedom in a plot I precomputed. Which do you think is best?
multiple_df

# I would say that df=10 roughly is best. df=2 is not quite flexible enough,
# where as df=50 seems to no longer fitting to signal (the true function f) and
# is now fitting to noise. In other words, it is overfitting to this particular
# data set.
multiple_df +
  stat_function(fun = f)

# Exercise. Here are a set of points from a different f(x) and epsilon. With
# your finger trace what you think the true f(x) function looks like. In other
# words, separate the signal from the noise!"
exercise

# Let's fit a spline with 25 degrees of freedom. How close is this to the truth?
fitted

# Ready? Here is the truth! How close were you? Note the noise is normal with mean 0
# and sd = 12!
truth
	# This is the source code of Albert Y. Kim (rudeboybert)'s "Introduction to
	# Splines" video.

	# Lines 4-86 were run before the video and hidden from the viewer in order to
	# simplify the content of the video.
	library(tidyverse)
	library(broom)

	# Set options, random number generator seed value, & ggplot geom() + theme()
	# arguments
	options(max.print=25)
	set.seed(76)

	update_geom_defaults("point", list(size = 0.5))
	update_geom_defaults("path", list(size = 1, colour="red"))
	update_geom_defaults("line", list(size = 1, colour="blue"))
	theme() +
	theme_update(axis.title.y=element_blank())

	# 1. Create multiple_df ggplot2 object which shows different spline fits with
	# differing degrees of freedom for same "y = f(x) + epsilon" function as in the
	# video
	df_values <- c(2, 10, 25, 50)
	f <- function(x){
	f_x <- 0.2x^11(10(1-x))^6+10(10x)^3(1-x)^10
	return(f_x)
	}
	values <- data_frame(
	x = seq(from=0, to=1, length=500),
	f_x = f(x),
	epsilon = rnorm(500, 0, sd = 2),
	y = f_x + epsilon
	)
	overall <- NULL
	for(df in df_values){
	overall <- smooth.spline(values$x, values$y, df=df) %>%
	augment() %>%
	mutate(df=df) %>%
	bind_rows(overall)
	}
	multiple_df <- overall %>%
	ggplot(aes(x=x)) +
	geom_point(aes(y=y)) +
	geom_line(aes(y=.fitted)) +
	facet_wrap(~df, nrow=2) +
	labs(title="Splines fit w/ different degrees of freedom")
	multiple_df

	# 2. Create new "y = f(x) + epsilon" values data frame using new (piece-wise
	# defined) function
	f2 <- function(x){
	ifelse(0 < x & x < 5pi, -6sin(x),
	ifelse(5*pi < x & x < 25, 0,
	ifelse(25 < x & x < 32, 0.02*(exp(x-25)-1),
	ifelse(32 < x & x < 54, -x+32+0.02*(exp(7)-1), 10)
	)
	)
	)
	}
	values <- data_frame(
	x = runif(2000, min=0, max=60),
	f_x = f2(x),
	epsilon = rnorm(2000, 0, sd = 12),
	y = f_x + epsilon
	)

	# Exercise, fitted, and truth plot
	exercise <- values %>%
	ggplot(aes(x=x)) +
	geom_point(aes(y=y)) +
	labs(title="Points based on new y=f(x)+epsilon")
	exercise

	fitted <- smooth.spline(values$x, values$y, df=25) %>%
	augment() %>%
	mutate(df=10) %>%
	ggplot(aes(x=x)) +
	geom_point(aes(y=y)) +
	geom_line(aes(y=.fitted)) +
	labs(title = "Spline curve (in blue) fit with degrees of freedom = 25")
	fitted

	truth <- fitted +
	stat_function(fun = f2) +
	labs(title = "True function f(x) in red")
	truth



	# Video starts here --------------------------------------------------------------
	library(tidyverse)
	library(broom)

	# Here is a wacky function f(x)
	f <- function(x){
	f_x <- 0.2x^11(10(1-x))^6+10(10x)^3(1-x)^10
	return(f_x)
	}

	# For 500 equally spaced values of x between 0 & 1, let's compute and plot f(x) in red.
	# Recall that f(x) is the systematic component, or "the signal"
	values <- data_frame(
	x = seq(from=0, to=1, length=500),
	f_x = f(x)
	)
	values %>%
	ggplot(aes(x=x)) +
	stat_function(fun = f)

	# We now add the unsystematic error component epsilon to f(x) i.e. the noise, to
	# obtain our y's, and hence our observed points in black (x, y)
	values <- values %>%
	mutate(
	epsilon = rnorm(500, 0, sd = 2),
	y = f_x + epsilon
	)
	values %>%
	ggplot(aes(x=x)) +
	stat_function(fun = f) +
	geom_point(aes(y=y))

	# But remember in real life, we won't know the red curve! If we did, then why
	# are we doing any of this? All we observe are the black points. Let's "pretend"
	# like we don't know what the red curve is!
	values %>%
	ggplot(aes(x=x)) +
	geom_point(aes(y=y))

	# We now fit a 'smoothing spline'. Think of it as a piece of string with a
	# specified amount of flexibility, where the flexibility is controlled by the
	# "degrees of freedom" df. This blue curve is a "guess/estimate" of the red
	# curve f(x), which recall, we are pretending we don't know. Also observe how we
	# use the broom::augment() function to convert the output of smooth.spline to
	# tidy data frame format.
	smooth.spline(values$x, values$y, df=5) %>%
	broom::augment() %>%
	ggplot(aes(x=x)) +
	geom_point(aes(y=y)) +
	geom_line(aes(y=.fitted))

	# Play around with the df argument in smooth.spline() above.

	# Now let's compare smoothing splines using four different values of the degrees
	# of freedom in a plot I precomputed. Which do you think is best?
	multiple_df

	# I would say that df=10 roughly is best. df=2 is not quite flexible enough,
	# where as df=50 seems to no longer fitting to signal (the true function f) and
	# is now fitting to noise. In other words, it is overfitting to this particular
	# data set.
	multiple_df +
	stat_function(fun = f)

	# Exercise. Here are a set of points from a different f(x) and epsilon. With
	# your finger trace what you think the true f(x) function looks like. In other
	# words, separate the signal from the noise!"
	exercise

	# Let's fit a spline with 25 degrees of freedom. How close is this to the truth?
	fitted

	# Ready? Here is the truth! How close were you? Note the noise is normal with mean 0
	# and sd = 12!
	truth