dreidpath/hba1c-rbg.R

## hba1c-rbg.R
##########################################################################################################
#  Analysis of Blood glucose / HbA1C data to show that y ~ x is not the same as x ~ y.                   #
#  Author: Daniel D Reidpath, Pascale Allotey, & Mark R Diamond                                          #
#  Date: 18 October 2015                                                                                 #
##########################################################################################################
# This analysis supports a research note (doi: 10.4225/03/56239BD13C5E0)
#
#
# Load the data.  These are derived from Daramola (2012) and held on the CMU StatLib---Datasets Archive
data <- read.csv('http://lib.stat.cmu.edu/datasets/hba1c_bloodGlucose.dat', header=T, sep='\t', skip=30)
# Rename the random blood glucose variable 'mmol'  -- it keeps the unit of measure straight in my head
names(data)[2] <- 'rgb_mmol'
# Create an HbA1c variable as mmol/mol
data$mmolmol <- (data$hba1c-2.15)*10.929  # HbA1c expressed in IFCC units

# Set the random seed, so that anyone replicating the analysis will get the same results
set.seed(seed=12146)
# Randomly sample 20 of the 349 patients ... it will be easier to see what's happening in the graph with 20
data_20 <- data[sample(1:nrow(data), 20, replace=F), ]


##############
## Figure 1 ##
##############
# Plot the sample of 20 data points
plot(data_20$rgb_mmol, data_20$mmolmol,
     xlab='Blood Glucose (mmol/L)', xlim=c(0,30),
     ylab='', ylim=c(0,200))
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data_20$mmolmol ~ data_20$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)

# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data_20$rgb_mmol ~ data_20$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (220*yx.model$coef[2]+yx.model$coef[1])),c(-10, 220), lty=2)
# Demonstrate residuals y_e
points(data_20$rgb_mmol[6], data_20$mmolmol[6], pch=20, col='Black')
lines(c(data_20$rgb_mmol[6], (data_20$mmolmol[6]*yx.model$coef[2] + yx.model$coef[1])),
      c(data_20$mmolmol[6], data_20$mmolmol[6]),
      lty=3, col='Black', lwd=1)
text(21, 137, expression("y"[e]))

# Demonstrate residuals x_e
points(data_20$rgb_mmol[2], data_20$mmolmol[2], pch=20, col='Black')
lines(c(data_20$rgb_mmol[2], data_20$rgb_mmol[2]),
      c(data_20$mmolmol[2], data_20$rgb_mmol[2]*xy.model$coef[2] + xy.model$coef[1]),
      lty=3, col='Black', lwd=1)
text(28, 69.4, expression("x"[e]))

# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)


##############
## Figure 2 ##
##############
# Plot all 349 data points in light grey
plot(data$rgb_mmol, data$mmolmol,
     xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
     ylab='',
     ylim=c(0,250),
     col='Grey', pch=20)
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)

# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (300*yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)

# Mark the approximate point of intersection of the two lines
points(13.2, 80, pch=15, col='Black', cex=1.5)

# Label the line predicting HbA1c from blood glucose
text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
# Label the line predicting blood glucose from HbA1c
text(21, 220, "Predict Blood Glucose", cex=0.7)

# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)


##############
## Figure 3 ##
##############
# Plot all 349 data points in light grey  and add the errors for a point
plot(data$rgb_mmol, data$mmolmol,
     xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
     ylab='',
     ylim=c(0,250),
     col='Grey', pch=20)
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)

# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (300*yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)

# Mark the approximate point of intersection of the two lines
points(13.2, 80, pch=15, col='Black', cex=1.5)

# Label the line predicting HbA1c from blood glucose
text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
# Label the line predicting blood glucose from HbA1c
text(21, 220, "Predict Blood Glucose", cex=0.7)


# Demonstrate residuals y_e
points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
lines(c(data$rgb_mmol[348], (data$mmolmol[348]*yx.model$coef[2] + yx.model$coef[1])),
      c(data$mmolmol[348], data$mmolmol[348]),
      lty=3, col='Black', lwd=1)
text(32.5, 205, expression("y"[e]))

# Demonstrate residuals x_e
points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
lines(c(data$rgb_mmol[348], data$rgb_mmol[348]),
      c(data$mmolmol[348], data$rgb_mmol[348]*xy.model$coef[2] + xy.model$coef[1]),
      lty=3, col='Black', lwd=1)
text(36, 175, expression("x"[e]))


# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)
	##########################################################################################################
	# Analysis of Blood glucose / HbA1C data to show that y ~ x is not the same as x ~ y. #
	# Author: Daniel D Reidpath, Pascale Allotey, & Mark R Diamond #
	# Date: 18 October 2015 #
	##########################################################################################################
	# This analysis supports a research note (doi: 10.4225/03/56239BD13C5E0)
	#
	#
	# Load the data. These are derived from Daramola (2012) and held on the CMU StatLib---Datasets Archive
	data <- read.csv('http://lib.stat.cmu.edu/datasets/hba1c_bloodGlucose.dat', header=T, sep='\t', skip=30)
	# Rename the random blood glucose variable 'mmol' -- it keeps the unit of measure straight in my head
	names(data)[2] <- 'rgb_mmol'
	# Create an HbA1c variable as mmol/mol
	data$mmolmol <- (data$hba1c-2.15)*10.929 # HbA1c expressed in IFCC units

	# Set the random seed, so that anyone replicating the analysis will get the same results
	set.seed(seed=12146)
	# Randomly sample 20 of the 349 patients ... it will be easier to see what's happening in the graph with 20
	data_20 <- data[sample(1:nrow(data), 20, replace=F), ]


	##############
	## Figure 1 ##
	##############
	# Plot the sample of 20 data points
	plot(data_20$rgb_mmol, data_20$mmolmol,
	xlab='Blood Glucose (mmol/L)', xlim=c(0,30),
	ylab='', ylim=c(0,200))
	# Place the y-axis label nicely
	mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

	# Calculate the OLS regression model for hba1c ~ blood glucose
	xy.model <- lm(data_20$mmolmol ~ data_20$rgb_mmol)
	# Place the line of best fit
	abline(xy.model, lty=1)

	# Calculate the OLS regression model for blood glucose ~ hba1c
	yx.model <- lm(data_20$rgb_mmol ~ data_20$mmolmol)
	# Place the line of best fit
	lines(c((-10yx.model$coef[2]+yx.model$coef[1]), (220yx.model$coef[2]+yx.model$coef[1])),c(-10, 220), lty=2)
	# Demonstrate residuals y_e
	points(data_20$rgb_mmol[6], data_20$mmolmol[6], pch=20, col='Black')
	lines(c(data_20$rgb_mmol[6], (data_20$mmolmol[6]*yx.model$coef[2] + yx.model$coef[1])),
	c(data_20$mmolmol[6], data_20$mmolmol[6]),
	lty=3, col='Black', lwd=1)
	text(21, 137, expression("y"[e]))

	# Demonstrate residuals x_e
	points(data_20$rgb_mmol[2], data_20$mmolmol[2], pch=20, col='Black')
	lines(c(data_20$rgb_mmol[2], data_20$rgb_mmol[2]),
	c(data_20$mmolmol[2], data_20$rgb_mmol[2]*xy.model$coef[2] + xy.model$coef[1]),
	lty=3, col='Black', lwd=1)
	text(28, 69.4, expression("x"[e]))

	# Note that the R_sqr for the two models is identical
	summary(xy.model)
	summary(yx.model)


	##############
	## Figure 2 ##
	##############
	# Plot all 349 data points in light grey
	plot(data$rgb_mmol, data$mmolmol,
	xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
	ylab='',
	ylim=c(0,250),
	col='Grey', pch=20)
	# Place the y-axis label nicely
	mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

	# Calculate the OLS regression model for hba1c ~ blood glucose
	xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
	# Place the line of best fit
	abline(xy.model, lty=1)

	# Calculate the OLS regression model for blood glucose ~ hba1c
	yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
	# Place the line of best fit
	lines(c((-10yx.model$coef[2]+yx.model$coef[1]), (300yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)

	# Mark the approximate point of intersection of the two lines
	points(13.2, 80, pch=15, col='Black', cex=1.5)

	# Label the line predicting HbA1c from blood glucose
	text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
	# Label the line predicting blood glucose from HbA1c
	text(21, 220, "Predict Blood Glucose", cex=0.7)

	# Note that the R_sqr for the two models is identical
	summary(xy.model)
	summary(yx.model)


	##############
	## Figure 3 ##
	##############
	# Plot all 349 data points in light grey and add the errors for a point
	plot(data$rgb_mmol, data$mmolmol,
	xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
	ylab='',
	ylim=c(0,250),
	col='Grey', pch=20)
	# Place the y-axis label nicely
	mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))

	# Calculate the OLS regression model for hba1c ~ blood glucose
	xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
	# Place the line of best fit
	abline(xy.model, lty=1)

	# Calculate the OLS regression model for blood glucose ~ hba1c
	yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
	# Place the line of best fit
	lines(c((-10yx.model$coef[2]+yx.model$coef[1]), (300yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)

	# Mark the approximate point of intersection of the two lines
	points(13.2, 80, pch=15, col='Black', cex=1.5)

	# Label the line predicting HbA1c from blood glucose
	text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
	# Label the line predicting blood glucose from HbA1c
	text(21, 220, "Predict Blood Glucose", cex=0.7)



	# Demonstrate residuals y_e
	points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
	lines(c(data$rgb_mmol[348], (data$mmolmol[348]*yx.model$coef[2] + yx.model$coef[1])),
	c(data$mmolmol[348], data$mmolmol[348]),
	lty=3, col='Black', lwd=1)
	text(32.5, 205, expression("y"[e]))

	# Demonstrate residuals x_e
	points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
	lines(c(data$rgb_mmol[348], data$rgb_mmol[348]),
	c(data$mmolmol[348], data$rgb_mmol[348]*xy.model$coef[2] + xy.model$coef[1]),
	lty=3, col='Black', lwd=1)
	text(36, 175, expression("x"[e]))




	# Note that the R_sqr for the two models is identical
	summary(xy.model)
	summary(yx.model)