Last active
March 17, 2017 09:20
-
-
Save markushuff/11db004e483b5678b2496aca91c14f83 to your computer and use it in GitHub Desktop.
How to calculate sensitivity d_a ("d sub a")? Step-by-step example from the Macmillan and Creelman book.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Calculation of d_a ("d sub a"). A step-by-step tutorial based on the book:
# Macmillan, N. A., & Creelman, C. D. (2004). Detection theory: A user's guide.
# Psychology Press.
#
# pp. 57ff

library(ggplot2)  # ROC and z-ROC plots below
library(plyr)     # loaded by the original script; kept for compatibility
# Create the rating data set: counts of "old"/"new" responses per confidence
# level, split by word frequency (low/high), 100 trials per condition.
frequency <- c(rep("low", 5), rep("high", 5))
old <- c(61, 15, 15, 5, 4, 37, 25, 18, 11, 9)
new <- c(2, 8, 35, 23, 30, 4, 18, 28, 21, 29)
# 1 = "sure old", 2 = "maybe old", 3 = "uncertain", 4 = "maybe new", 5 = "sure new"
answer <- c(rep(1:5, 2))
roc_test <- data.frame(frequency, old, new, answer)

# Convert the raw counts to proportions (hit and false-alarm rates).
roc_test$p_hit <- roc_test$old / 100
roc_test$p_fa <- roc_test$new / 100

# Create cumulative proportions.
# IMPORTANT: The data has to be sorted such that the "sure old" answer comes
# first. In this example (1,2,3,4,5). If the answer variable is different,
# e.g. 6 for "sure old", then the data frame has to be sorted accordingly
# (decreasing). Otherwise, the ROC curves look weird.
roc_test$p_cum_hit <- NA
roc_test$p_cum_fa <- NA
# One pass per condition instead of four near-identical assignment lines.
for (freq in c("low", "high")) {
  idx <- roc_test$frequency == freq
  roc_test$p_cum_hit[idx] <- cumsum(roc_test$p_hit[idx])
  roc_test$p_cum_fa[idx] <- cumsum(roc_test$p_fa[idx])
}

# Inspect the data.
roc_test

# Exclude the last step of the cumulative function.
# Note: if this step is not executed, the results will look different as
# compared to the book example.
roc_test <- roc_test[roc_test$answer != 5, ]
# Plot the cumulative ROC (hit rate vs. false-alarm rate) per frequency
# condition, with the chance diagonal for reference.
ggplot(roc_test, aes(p_cum_fa, p_cum_hit, color = frequency)) +
  geom_point() +
  geom_line() +
  geom_abline() +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
  theme_bw(base_size = 16)

# Plot the z-transformed data (z-ROC) with a fitted straight line per
# condition.
ggplot(roc_test, aes(qnorm(p_cum_fa), qnorm(p_cum_hit), color = frequency)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline() +
  coord_cartesian(xlim = c(-2, 2), ylim = c(-2, 2)) +
  theme_bw(base_size = 16)
# Deviance (-2 * sum of log-likelihoods) of a linear model on the
# z-transformed ROC points, for minimization with optim().
#
# par: numeric vector of length 3 — c(slope, intercept, error sd).
# x:   response values (z-transformed cumulative hit rates). Defaults to the
#      global `x_data` set up by the fitting loop below, so existing
#      optim(fn = lm.loss_x_y, ...) calls keep working unchanged.
# y:   predictor values (z-transformed cumulative false-alarm rates).
#      Defaults to the global `y_data`.
#
# Returns the deviance, or a very large penalty (1e7) when the candidate
# error sd is not strictly positive. This tells the optimization procedure to
# stay away from invalid (either statistically or psychologically) parameter
# values.
lm.loss_x_y <- function(par, x = x_data, y = y_data) {
  a.par <- par[1]      # the current slope
  b.par <- par[2]      # the current intercept
  err.sigma <- par[3]  # the current error standard deviation

  # NOTE(review): the original only handled err.sigma < 0 and err.sigma > 0,
  # so err.sigma == 0 left `deviance` undefined and the function errored.
  # <= 0 closes that gap (sd must be strictly positive for dnorm anyway).
  if (err.sigma <= 0) {
    return(10000000)
  }

  # Likelihood of each data point under the model x ~ Normal(a*y + b, sigma).
  # Here is where you specify the model and how you calculate likelihoods.
  likelihoods <- dnorm(x, mean = y * a.par + b.par, sd = err.sigma)
  # Deviance: -2 times the sum of the per-point log-likelihoods.
  -2 * sum(log(likelihoods))
}
# Fit slope and vertical intercept (d_2_prime) for each frequency condition
# by maximum likelihood: optim() minimizes the deviance from lm.loss_x_y.
# Results are collected in a list and bound once, instead of growing a data
# frame with rbind() inside the loop (and instead of the dummy-row-then-
# delete initialization of the original).
fit_results <- list()
for (freq in unique(roc_test$frequency)) {
  # lm.loss_x_y reads these globals: z-transformed cumulative hit and
  # false-alarm rates of the current condition.
  x_data <- qnorm(roc_test[roc_test$frequency == freq, ]$p_cum_hit)
  y_data <- qnorm(roc_test[roc_test$frequency == freq, ]$p_cum_fa)
  # Start values: slope 0, intercept 0, error sd 10.
  parameter.fits_d2_prime <- optim(par = c(0, 0, 10),
                                   fn = lm.loss_x_y,
                                   hessian = TRUE)
  fit_results[[freq]] <- data.frame(frequency = freq,
                                    slope = parameter.fits_d2_prime$par[1],
                                    d_2_prime = parameter.fits_d2_prime$par[2])
}
dat_roc_stat <- do.call(rbind, fit_results)
rownames(dat_roc_stat) <- NULL  # match the original's default row names
# Calculate d_a from the z-ROC slope (s) and vertical intercept (d_2_prime),
# per Macmillan & Creelman, Formula 3.4:
#   d_a = sqrt(2 / (1 + s^2)) * d_2_prime
dat_roc_stat$d_a <- sqrt(2 / (1 + dat_roc_stat$slope^2)) * dat_roc_stat$d_2_prime
# The results are similar to those in the book chapter.
dat_roc_stat
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment