Skip to content

Instantly share code, notes, and snippets.

@mustafaascha
Last active October 4, 2017 06:17
Show Gist options
  • Save mustafaascha/4fdf4a9971f957f6bd0dfe10b3563a12 to your computer and use it in GitHub Desktop.
Save mustafaascha/4fdf4a9971f957f6bd0dfe10b3563a12 to your computer and use it in GitHub Desktop.
#Notes from Clinical Information Systems, October 3 2017
# Contents ===================================================================
# 1. HPV frequencies, model
# 2. Vitamin D and depression
# Setup =====================================================================
script_depends_on <- c("tidyverse", "nhanesA")
for(pkg in script_depends_on) {
if (!pkg %in% installed.packages()) install.packages(pkg,dependencies = TRUE)
}
library(tidyverse)
library(nhanesA)
#Definitely check out the author's other work here:
# https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R
#Here, we'll get the demographics and laboratory data
# First check out which tables are available, among lab tests
nhanesTables("LAB", 2005)
#See each preceding line for relevant documentation
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm
vitamin_d <- nhanes("VID_D")
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm
demos <- nhanes("DEMO_D")
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm
hpv <- nhanes("HPVSWR_D")
#https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm
dpqh <- nhanes("DPQ_D")
# HPV and Demographics =======================================================
# Let's see what the frequencies of HPV seropositivity are.
# Remember, positive is represented by "1".
hpv_frequencies <-
vapply(hpv[,-1],
function(x) sum(x == 1, na.rm = TRUE) / length(x),
double(1))
#that's interesting. let's visualize it
hpv_frequencies
hpv_frequencies <-
data.frame(sero_frequencies = hpv_frequencies,
serotype = names(hpv_frequencies),
stringsAsFactors = FALSE)
#this will order things nicely in our plot
hpv_frequencies$serotype <- with(hpv_frequencies,
factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)]))
ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) +
geom_segment() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "HPV Serotype", y = "Proportion of subjects")
# That's interesting. Next, let's explore how demographics relate to HPV16 status.
# In class, we made a plot of HPV serostatus against age, colored by education
hpv_demos <- left_join(hpv, demos)
hpv_demos$Education <-
with(hpv_demos,
ifelse(DMDEDUC2 < 4, "No College",
ifelse(is.na(DMDEDUC2), NA, "College")))
hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA
hpv_demos$HPV16 <-
ifelse(hpv_demos$LBDR16 == 1, "Positive",
ifelse(is.na(hpv_demos$LBDR16), NA,
"Negative"))
ggplot(hpv_demos %>% filter(!is.na(Education)),
aes(x = HPV16, y = RIDAGEYR, fill = Education)) +
geom_boxplot() +
labs(x = "HPV16 Serostatus", y = "Age (Years)")
#we're making the assumption that non-positive non-missing observations are negatives
hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1,
ifelse(is.na(hpv_demos$LBDR16), NA, 0))
income_labels <-
c("$0 to $ 4,999", "$5,000 to $ 9,999",
"$10,000 to $14,999", "$15,000 to $19,999",
"$20,000 to $24,999", "$25,000 to $34,999",
"$35,000 to $44,999", "$45,000 to $54,999",
"$55,000 to $64,999", "$65,000 to $74,999",
"$75,000 and Over", "Over $20,000",
"Under $20,000")
hpv_demos$Household_Income <-
factor(hpv_demos$INDHHINC, levels = 1:13,
labels = income_labels)
hpv_demos$Ethnicity <-
factor(hpv_demos$RIDRETH1, levels = 1:5,
labels = c("Mexican American",
"Other Hispanic",
"Non-Hispanic White",
"Non-Hispanic Black",
"Other (including multiracial)"))
hpv_demos_model <-
glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos)
hpv_model_df <-
cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model))
hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")]
hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp)
names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High")
hpv_model_df
# Vitamin D and Depression ===========================================
# The depression questionnaire is comprised of ten questions that are
# rated on a 0 to 3 scale. For the purposes of this exercise, we'll
# consider that a suitable range to fit a linear model
dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x))
for_regression <-
data.frame(SEQN = dpqh$SEQN,
total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE))
#it's possible inner joins would have been better here
# read up on two-table verbs here:
# https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
for_regression <- inner_join(for_regression, demos)
for_regression <- inner_join(for_regression, vitamin_d)
income_labels <-
c("$0 to $ 4,999", "$5,000 to $ 9,999",
"$10,000 to $14,999", "$15,000 to $19,999",
"$20,000 to $24,999", "$25,000 to $34,999",
"$35,000 to $44,999", "$45,000 to $54,999",
"$55,000 to $64,999", "$65,000 to $74,999",
"$75,000 and Over", "Over $20,000",
"Under $20,000")
for_regression$Household_Income <-
factor(for_regression$INDHHINC, levels = 1:13,
labels = income_labels)
for_regression$Ethnicity <-
factor(for_regression$RIDRETH1, levels = 1:5,
labels = c("Mexican American",
"Other Hispanic",
"Non-Hispanic White",
"Non-Hispanic Black",
"Other (including multiracial)"))
for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA
for_regression$Education <-
with(for_regression,
ifelse(DMDEDUC2 < 4, "No College",
ifelse(is.na(DMDEDUC2), NA, "College")))
for_regression$Vitamin_D <- for_regression$LBDVIDMS
for_regression$Age <- for_regression$RIDAGEYR
depression_model <-
lm(total_depression_score ~
Household_Income + Ethnicity +
Education + Age + Vitamin_D,
data = for_regression)
summary(depression_model)
#Indeed, looking at the "Vitamin D" coefficient, it looks like each extra microgram of serum Vitamin D concentration is statistically significantly associated with a decrease in depression score of about 0.01. Since Vitamin D concentrations can range orders of magnitude, let's calculate how much lower someone's score would be if they had the highest Vitamin D concentration compared to the lowest concentration that we observe:
range_vitamin_d <-
range(for_regression$Vitamin_D, na.rm = TRUE)
vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d
vitamin_d_deltas[2] - vitamin_d_deltas[1]
#Looks like there is a change of about 1.66 points on the depression score. Wow!
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment