mustafaascha/Clinical-Info-Systems-2017-10-03

## Clinical-Info-Systems-2017-10-03
#Notes from Clinical Information Systems, October 3 2017

# Contents ===================================================================

# 1. HPV frequencies, model

# 2. Vitamin D and depression

# Setup =====================================================================

script_depends_on <- c("tidyverse", "nhanesA")
for(pkg in script_depends_on) {
  if (!pkg %in% installed.packages())  install.packages(pkg,dependencies = TRUE)
}

library(tidyverse)
library(nhanesA)

#Definitely check out the author's other work here:
#  https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R

#Here, we'll get the demographics and laboratory data

# First check out which tables are available, among lab tests
nhanesTables("LAB", 2005)

#See each preceding line for relevant documentation
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm
vitamin_d <- nhanes("VID_D")
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm
demos <- nhanes("DEMO_D")
#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm
hpv <- nhanes("HPVSWR_D")
#https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm
dpqh <- nhanes("DPQ_D")

# HPV and Demographics =======================================================
# Let's see what the frequencies of HPV seropositivity are.
      # Remember, positive is represented by "1".

hpv_frequencies <-
  vapply(hpv[,-1],
         function(x) sum(x == 1, na.rm = TRUE) / length(x),
         double(1))

#that's interesting. let's visualize it
hpv_frequencies

hpv_frequencies <-
  data.frame(sero_frequencies = hpv_frequencies,
             serotype = names(hpv_frequencies),
             stringsAsFactors = FALSE)

#this will order things nicely in our plot
hpv_frequencies$serotype <- with(hpv_frequencies,
       factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)]))

ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) +
  geom_segment() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "HPV Serotype", y = "Proportion of subjects")


# That's interesting. Next, let's explore how demographics relate to HPV16 status.
# In class, we made a plot of HPV serostatus against age, colored by education

hpv_demos <- left_join(hpv, demos)

hpv_demos$Education <-
  with(hpv_demos,
       ifelse(DMDEDUC2 < 4, "No College",
              ifelse(is.na(DMDEDUC2), NA, "College")))

hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA

hpv_demos$HPV16 <-
  ifelse(hpv_demos$LBDR16 == 1, "Positive",
         ifelse(is.na(hpv_demos$LBDR16), NA,
                "Negative"))

ggplot(hpv_demos %>% filter(!is.na(Education)),
       aes(x = HPV16, y = RIDAGEYR, fill = Education)) +
  geom_boxplot() +
  labs(x = "HPV16 Serostatus", y = "Age (Years)")


#we're making the assumption that non-positive non-missing observations are negatives
hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1,
                          ifelse(is.na(hpv_demos$LBDR16), NA, 0))

income_labels <-
  c("$0 to $ 4,999",    "$5,000 to $ 9,999",
    "$10,000 to $14,999",     "$15,000 to $19,999",
    "$20,000 to $24,999",     "$25,000 to $34,999",
    "$35,000 to $44,999",     "$45,000 to $54,999",
    "$55,000 to $64,999",     "$65,000 to $74,999",
    "$75,000 and Over",       "Over $20,000",
    "Under $20,000")
hpv_demos$Household_Income <-
  factor(hpv_demos$INDHHINC, levels = 1:13,
         labels = income_labels)

hpv_demos$Ethnicity <-
  factor(hpv_demos$RIDRETH1, levels = 1:5,
         labels = c("Mexican American",
                    "Other Hispanic",
                    "Non-Hispanic White",
                    "Non-Hispanic Black",
                    "Other (including multiracial)"))

hpv_demos_model <-
  glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos)

hpv_model_df <-
  cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model))
hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")]
hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp)
names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High")
hpv_model_df


# Vitamin D and Depression ===========================================

# The depression questionnaire is comprised of ten questions that are
#     rated on a 0 to 3 scale. For the purposes of this exercise, we'll
#     consider that a suitable range to fit a linear model

dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x))

for_regression <-
  data.frame(SEQN = dpqh$SEQN,
             total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE))

#it's possible inner joins would have been better here
#  read up on two-table verbs here:
#  https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
for_regression <- inner_join(for_regression, demos)
for_regression <- inner_join(for_regression, vitamin_d)

income_labels <-
  c("$0 to $ 4,999",    "$5,000 to $ 9,999",
    "$10,000 to $14,999",     "$15,000 to $19,999",
    "$20,000 to $24,999",     "$25,000 to $34,999",
    "$35,000 to $44,999",     "$45,000 to $54,999",
    "$55,000 to $64,999",     "$65,000 to $74,999",
    "$75,000 and Over",       "Over $20,000",
    "Under $20,000")
for_regression$Household_Income <-
  factor(for_regression$INDHHINC, levels = 1:13,
         labels = income_labels)

for_regression$Ethnicity <-
  factor(for_regression$RIDRETH1, levels = 1:5,
         labels = c("Mexican American",
                    "Other Hispanic",
                    "Non-Hispanic White",
                    "Non-Hispanic Black",
                    "Other (including multiracial)"))

for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA

for_regression$Education <-
  with(for_regression,
       ifelse(DMDEDUC2 < 4, "No College",
              ifelse(is.na(DMDEDUC2), NA, "College")))


for_regression$Vitamin_D <- for_regression$LBDVIDMS
for_regression$Age <- for_regression$RIDAGEYR

depression_model <-
  lm(total_depression_score ~
       Household_Income + Ethnicity +
       Education + Age + Vitamin_D,
     data = for_regression)

summary(depression_model)

#Indeed, looking at the "Vitamin D" coefficient, it looks like each extra microgram of serum Vitamin D concentration is statistically significantly associated with a decrease in depression score of about 0.01. Since Vitamin D concentrations can range orders of magnitude, let's calculate how much lower someone's score would be if they had the highest Vitamin D concentration compared to the lowest concentration that we observe:

range_vitamin_d <-
  range(for_regression$Vitamin_D, na.rm = TRUE)

vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d

vitamin_d_deltas[2] - vitamin_d_deltas[1]

#Looks like there is a change of about 1.66 points on the depression score. Wow!
	#Notes from Clinical Information Systems, October 3 2017

	# Contents ===================================================================

	# 1. HPV frequencies, model

	# 2. Vitamin D and depression

	# Setup =====================================================================

	script_depends_on <- c("tidyverse", "nhanesA")
	for(pkg in script_depends_on) {
	if (!pkg %in% installed.packages()) install.packages(pkg,dependencies = TRUE)
	}

	library(tidyverse)
	library(nhanesA)

	#Definitely check out the author's other work here:
	# https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R

	#Here, we'll get the demographics and laboratory data

	# First check out which tables are available, among lab tests
	nhanesTables("LAB", 2005)

	#See each preceding line for relevant documentation
	#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm
	vitamin_d <- nhanes("VID_D")
	#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm
	demos <- nhanes("DEMO_D")
	#https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm
	hpv <- nhanes("HPVSWR_D")
	#https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm
	dpqh <- nhanes("DPQ_D")

	# HPV and Demographics =======================================================
	# Let's see what the frequencies of HPV seropositivity are.
	# Remember, positive is represented by "1".

	hpv_frequencies <-
	vapply(hpv[,-1],
	function(x) sum(x == 1, na.rm = TRUE) / length(x),
	double(1))

	#that's interesting. let's visualize it
	hpv_frequencies

	hpv_frequencies <-
	data.frame(sero_frequencies = hpv_frequencies,
	serotype = names(hpv_frequencies),
	stringsAsFactors = FALSE)

	#this will order things nicely in our plot
	hpv_frequencies$serotype <- with(hpv_frequencies,
	factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)]))

	ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) +
	geom_segment() +
	theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
	labs(x = "HPV Serotype", y = "Proportion of subjects")


	# That's interesting. Next, let's explore how demographics relate to HPV16 status.
	# In class, we made a plot of HPV serostatus against age, colored by education

	hpv_demos <- left_join(hpv, demos)

	hpv_demos$Education <-
	with(hpv_demos,
	ifelse(DMDEDUC2 < 4, "No College",
	ifelse(is.na(DMDEDUC2), NA, "College")))

	hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA

	hpv_demos$HPV16 <-
	ifelse(hpv_demos$LBDR16 == 1, "Positive",
	ifelse(is.na(hpv_demos$LBDR16), NA,
	"Negative"))

	ggplot(hpv_demos %>% filter(!is.na(Education)),
	aes(x = HPV16, y = RIDAGEYR, fill = Education)) +
	geom_boxplot() +
	labs(x = "HPV16 Serostatus", y = "Age (Years)")


	#we're making the assumption that non-positive non-missing observations are negatives
	hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1,
	ifelse(is.na(hpv_demos$LBDR16), NA, 0))

	income_labels <-
	c("$0 to $ 4,999", "$5,000 to $ 9,999",
	"$10,000 to $14,999", "$15,000 to $19,999",
	"$20,000 to $24,999", "$25,000 to $34,999",
	"$35,000 to $44,999", "$45,000 to $54,999",
	"$55,000 to $64,999", "$65,000 to $74,999",
	"$75,000 and Over", "Over $20,000",
	"Under $20,000")
	hpv_demos$Household_Income <-
	factor(hpv_demos$INDHHINC, levels = 1:13,
	labels = income_labels)

	hpv_demos$Ethnicity <-
	factor(hpv_demos$RIDRETH1, levels = 1:5,
	labels = c("Mexican American",
	"Other Hispanic",
	"Non-Hispanic White",
	"Non-Hispanic Black",
	"Other (including multiracial)"))

	hpv_demos_model <-
	glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos)

	hpv_model_df <-
	cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model))
	hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")]
	hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp)
	names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High")
	hpv_model_df


	# Vitamin D and Depression ===========================================

	# The depression questionnaire is comprised of ten questions that are
	# rated on a 0 to 3 scale. For the purposes of this exercise, we'll
	# consider that a suitable range to fit a linear model

	dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x))

	for_regression <-
	data.frame(SEQN = dpqh$SEQN,
	total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE))

	#it's possible inner joins would have been better here
	# read up on two-table verbs here:
	# https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
	for_regression <- inner_join(for_regression, demos)
	for_regression <- inner_join(for_regression, vitamin_d)

	income_labels <-
	c("$0 to $ 4,999", "$5,000 to $ 9,999",
	"$10,000 to $14,999", "$15,000 to $19,999",
	"$20,000 to $24,999", "$25,000 to $34,999",
	"$35,000 to $44,999", "$45,000 to $54,999",
	"$55,000 to $64,999", "$65,000 to $74,999",
	"$75,000 and Over", "Over $20,000",
	"Under $20,000")
	for_regression$Household_Income <-
	factor(for_regression$INDHHINC, levels = 1:13,
	labels = income_labels)

	for_regression$Ethnicity <-
	factor(for_regression$RIDRETH1, levels = 1:5,
	labels = c("Mexican American",
	"Other Hispanic",
	"Non-Hispanic White",
	"Non-Hispanic Black",
	"Other (including multiracial)"))

	for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA

	for_regression$Education <-
	with(for_regression,
	ifelse(DMDEDUC2 < 4, "No College",
	ifelse(is.na(DMDEDUC2), NA, "College")))


	for_regression$Vitamin_D <- for_regression$LBDVIDMS
	for_regression$Age <- for_regression$RIDAGEYR

	depression_model <-
	lm(total_depression_score ~
	Household_Income + Ethnicity +
	Education + Age + Vitamin_D,
	data = for_regression)

	summary(depression_model)

	#Indeed, looking at the "Vitamin D" coefficient, it looks like each extra microgram of serum Vitamin D concentration is statistically significantly associated with a decrease in depression score of about 0.01. Since Vitamin D concentrations can range orders of magnitude, let's calculate how much lower someone's score would be if they had the highest Vitamin D concentration compared to the lowest concentration that we observe:

	range_vitamin_d <-
	range(for_regression$Vitamin_D, na.rm = TRUE)

	vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d

	vitamin_d_deltas[2] - vitamin_d_deltas[1]

	#Looks like there is a change of about 1.66 points on the depression score. Wow!