Last active
October 4, 2017 06:17
-
-
Save mustafaascha/4fdf4a9971f957f6bd0dfe10b3563a12 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Notes from Clinical Information Systems, October 3 2017 | |
# Contents =================================================================== | |
# 1. HPV frequencies, model | |
# 2. Vitamin D and depression | |
# Setup ===================================================================== | |
script_depends_on <- c("tidyverse", "nhanesA") | |
for(pkg in script_depends_on) { | |
if (!pkg %in% installed.packages()) install.packages(pkg,dependencies = TRUE) | |
} | |
library(tidyverse) | |
library(nhanesA) | |
#Definitely check out the author's other work here: | |
# https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R | |
#Here, we'll get the demographics and laboratory data | |
# First check out which tables are available, among lab tests | |
nhanesTables("LAB", 2005) | |
#See each preceding line for relevant documentation | |
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm | |
vitamin_d <- nhanes("VID_D") | |
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm | |
demos <- nhanes("DEMO_D") | |
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm | |
hpv <- nhanes("HPVSWR_D") | |
#https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm | |
dpqh <- nhanes("DPQ_D") | |
# HPV and Demographics ======================================================= | |
# Let's see what the frequencies of HPV seropositivity are. | |
# Remember, positive is represented by "1". | |
hpv_frequencies <- | |
vapply(hpv[,-1], | |
function(x) sum(x == 1, na.rm = TRUE) / length(x), | |
double(1)) | |
#that's interesting. let's visualize it | |
hpv_frequencies | |
hpv_frequencies <- | |
data.frame(sero_frequencies = hpv_frequencies, | |
serotype = names(hpv_frequencies), | |
stringsAsFactors = FALSE) | |
#this will order things nicely in our plot | |
hpv_frequencies$serotype <- with(hpv_frequencies, | |
factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)])) | |
ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) + | |
geom_segment() + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + | |
labs(x = "HPV Serotype", y = "Proportion of subjects") | |
# That's interesting. Next, let's explore how demographics relate to HPV16 status. | |
# In class, we made a plot of HPV serostatus against age, colored by education | |
hpv_demos <- left_join(hpv, demos) | |
hpv_demos$Education <- | |
with(hpv_demos, | |
ifelse(DMDEDUC2 < 4, "No College", | |
ifelse(is.na(DMDEDUC2), NA, "College"))) | |
hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA | |
hpv_demos$HPV16 <- | |
ifelse(hpv_demos$LBDR16 == 1, "Positive", | |
ifelse(is.na(hpv_demos$LBDR16), NA, | |
"Negative")) | |
ggplot(hpv_demos %>% filter(!is.na(Education)), | |
aes(x = HPV16, y = RIDAGEYR, fill = Education)) + | |
geom_boxplot() + | |
labs(x = "HPV16 Serostatus", y = "Age (Years)") | |
#we're making the assumption that non-positive non-missing observations are negatives | |
hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1, | |
ifelse(is.na(hpv_demos$LBDR16), NA, 0)) | |
income_labels <- | |
c("$0 to $ 4,999", "$5,000 to $ 9,999", | |
"$10,000 to $14,999", "$15,000 to $19,999", | |
"$20,000 to $24,999", "$25,000 to $34,999", | |
"$35,000 to $44,999", "$45,000 to $54,999", | |
"$55,000 to $64,999", "$65,000 to $74,999", | |
"$75,000 and Over", "Over $20,000", | |
"Under $20,000") | |
hpv_demos$Household_Income <- | |
factor(hpv_demos$INDHHINC, levels = 1:13, | |
labels = income_labels) | |
hpv_demos$Ethnicity <- | |
factor(hpv_demos$RIDRETH1, levels = 1:5, | |
labels = c("Mexican American", | |
"Other Hispanic", | |
"Non-Hispanic White", | |
"Non-Hispanic Black", | |
"Other (including multiracial)")) | |
hpv_demos_model <- | |
glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos) | |
hpv_model_df <- | |
cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model)) | |
hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")] | |
hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp) | |
names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High") | |
hpv_model_df | |
# Vitamin D and Depression =========================================== | |
# The depression questionnaire is comprised of ten questions that are | |
# rated on a 0 to 3 scale. For the purposes of this exercise, we'll | |
# consider that a suitable range to fit a linear model | |
dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x)) | |
for_regression <- | |
data.frame(SEQN = dpqh$SEQN, | |
total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE)) | |
#it's possible inner joins would have been better here | |
# read up on two-table verbs here: | |
# https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html | |
for_regression <- inner_join(for_regression, demos) | |
for_regression <- inner_join(for_regression, vitamin_d) | |
income_labels <- | |
c("$0 to $ 4,999", "$5,000 to $ 9,999", | |
"$10,000 to $14,999", "$15,000 to $19,999", | |
"$20,000 to $24,999", "$25,000 to $34,999", | |
"$35,000 to $44,999", "$45,000 to $54,999", | |
"$55,000 to $64,999", "$65,000 to $74,999", | |
"$75,000 and Over", "Over $20,000", | |
"Under $20,000") | |
for_regression$Household_Income <- | |
factor(for_regression$INDHHINC, levels = 1:13, | |
labels = income_labels) | |
for_regression$Ethnicity <- | |
factor(for_regression$RIDRETH1, levels = 1:5, | |
labels = c("Mexican American", | |
"Other Hispanic", | |
"Non-Hispanic White", | |
"Non-Hispanic Black", | |
"Other (including multiracial)")) | |
for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA | |
for_regression$Education <- | |
with(for_regression, | |
ifelse(DMDEDUC2 < 4, "No College", | |
ifelse(is.na(DMDEDUC2), NA, "College"))) | |
for_regression$Vitamin_D <- for_regression$LBDVIDMS | |
for_regression$Age <- for_regression$RIDAGEYR | |
depression_model <- | |
lm(total_depression_score ~ | |
Household_Income + Ethnicity + | |
Education + Age + Vitamin_D, | |
data = for_regression) | |
summary(depression_model) | |
#Indeed, looking at the "Vitamin D" coefficient, it looks like each extra microgram of serum Vitamin D concentration is statistically significantly associated with a decrease in depression score of about 0.01. Since Vitamin D concentrations can range orders of magnitude, let's calculate how much lower someone's score would be if they had the highest Vitamin D concentration compared to the lowest concentration that we observe: | |
range_vitamin_d <- | |
range(for_regression$Vitamin_D, na.rm = TRUE) | |
vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d | |
vitamin_d_deltas[2] - vitamin_d_deltas[1] | |
#Looks like there is a change of about 1.66 points on the depression score. Wow! | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment