Skip to content

Instantly share code, notes, and snippets.

@dreidpath
Created October 18, 2015 13:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dreidpath/76bc76a1cecb1d454801 to your computer and use it in GitHub Desktop.
Save dreidpath/76bc76a1cecb1d454801 to your computer and use it in GitHub Desktop.
##########################################################################################################
# Analysis of Blood glucose / HbA1C data to show that y ~ x is not the same as x ~ y. #
# Author: Daniel D Reidpath, Pascale Allotey, & Mark R Diamond #
# Date: 18 October 2015 #
##########################################################################################################
# This analysis supports a research note (doi: 10.4225/03/56239BD13C5E0)
#
#
# Load the data. These are derived from Daramola (2012) and held on the CMU StatLib---Datasets Archive
data <- read.csv('http://lib.stat.cmu.edu/datasets/hba1c_bloodGlucose.dat', header=T, sep='\t', skip=30)
# Rename the random blood glucose variable 'mmol' -- it keeps the unit of measure straight in my head
names(data)[2] <- 'rgb_mmol'
# Create an HbA1c variable as mmol/mol
data$mmolmol <- (data$hba1c-2.15)*10.929 # HbA1c expressed in IFCC units
# Set the random seed, so that anyone replicating the analysis will get the same results
set.seed(seed=12146)
# Randomly sample 20 of the 349 patients ... it will be easier to see what's happening in the graph with 20
data_20 <- data[sample(1:nrow(data), 20, replace=F), ]
##############
## Figure 1 ##
##############
# Plot the sample of 20 data points
plot(data_20$rgb_mmol, data_20$mmolmol,
xlab='Blood Glucose (mmol/L)', xlim=c(0,30),
ylab='', ylim=c(0,200))
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))
# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data_20$mmolmol ~ data_20$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)
# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data_20$rgb_mmol ~ data_20$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (220*yx.model$coef[2]+yx.model$coef[1])),c(-10, 220), lty=2)
# Demonstrate residuals y_e
points(data_20$rgb_mmol[6], data_20$mmolmol[6], pch=20, col='Black')
lines(c(data_20$rgb_mmol[6], (data_20$mmolmol[6]*yx.model$coef[2] + yx.model$coef[1])),
c(data_20$mmolmol[6], data_20$mmolmol[6]),
lty=3, col='Black', lwd=1)
text(21, 137, expression("y"[e]))
# Demonstrate residuals x_e
points(data_20$rgb_mmol[2], data_20$mmolmol[2], pch=20, col='Black')
lines(c(data_20$rgb_mmol[2], data_20$rgb_mmol[2]),
c(data_20$mmolmol[2], data_20$rgb_mmol[2]*xy.model$coef[2] + xy.model$coef[1]),
lty=3, col='Black', lwd=1)
text(28, 69.4, expression("x"[e]))
# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)
##############
## Figure 2 ##
##############
# Plot all 349 data points in light grey
plot(data$rgb_mmol, data$mmolmol,
xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
ylab='',
ylim=c(0,250),
col='Grey', pch=20)
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))
# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)
# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (300*yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)
# Mark the approximate point of intersection of the two lines
points(13.2, 80, pch=15, col='Black', cex=1.5)
# Label the line predicting HbA1c from blood glucose
text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
# Label the line predicting blood glucose from HbA1c
text(21, 220, "Predict Blood Glucose", cex=0.7)
# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)
##############
## Figure 3 ##
##############
# Plot all 349 data points in light grey and add the errors for a point
plot(data$rgb_mmol, data$mmolmol,
xlab='Blood Glucose (mmol/L)', xlim=c(0,50),
ylab='',
ylim=c(0,250),
col='Grey', pch=20)
# Place the y-axis label nicely
mtext(side=2, line=2.3, expression(paste('HbA'['1c'],' (mmol/mol)')))
# Calculate the OLS regression model for hba1c ~ blood glucose
xy.model <- lm(data$mmolmol ~ data$rgb_mmol)
# Place the line of best fit
abline(xy.model, lty=1)
# Calculate the OLS regression model for blood glucose ~ hba1c
yx.model <- lm(data$rgb_mmol ~ data$mmolmol)
# Place the line of best fit
lines(c((-10*yx.model$coef[2]+yx.model$coef[1]), (300*yx.model$coef[2]+yx.model$coef[1])),c(-10, 300), lty=2)
# Mark the approximate point of intersection of the two lines
points(13.2, 80, pch=15, col='Black', cex=1.5)
# Label the line predicting HbA1c from blood glucose
text(40, 145, expression("Predict HbA"["1c"]), cex=0.7)
# Label the line predicting blood glucose from HbA1c
text(21, 220, "Predict Blood Glucose", cex=0.7)
# Demonstrate residuals y_e
points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
lines(c(data$rgb_mmol[348], (data$mmolmol[348]*yx.model$coef[2] + yx.model$coef[1])),
c(data$mmolmol[348], data$mmolmol[348]),
lty=3, col='Black', lwd=1)
text(32.5, 205, expression("y"[e]))
# Demonstrate residuals x_e
points(data$rgb_mmol[348], data$mmolmol[348], pch=20, col='Black')
lines(c(data$rgb_mmol[348], data$rgb_mmol[348]),
c(data$mmolmol[348], data$rgb_mmol[348]*xy.model$coef[2] + xy.model$coef[1]),
lty=3, col='Black', lwd=1)
text(36, 175, expression("x"[e]))
# Note that the R_sqr for the two models is identical
summary(xy.model)
summary(yx.model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment