mustafaascha/Clinical-Info-Systems-2018-10-23

## Clinical-Info-Systems-2018-10-23
#' Notes from Clinical Information Systems, October 2 2018
#'
# Contents ========================================
#'
#' From 2018
#' 1. Human immunodeficiency status versus white
#'    blood cell counts, visualization and modeling
#' 2. Body mass index versus dietary habits (salt intake
#'    and salmon consumption)
#' 3. Income and fast-food consumption
#'
#' From 2017:
#' 3. HPV frequencies, modeling
#' 4. Vitamin D and depression

# Preamble =========================================
#'
#' In class, someone asked about why we use quotation marks.
#'
#' The answer would be that quotation marks indicate the 'string'
#' (some arbitrary sequence), whereas unquoted things refer to
#' objects that exist in the environments listed in R's search path.
#' The search path is where R looks for things, see yours here:
search()

#' If I assigned the word "ggplot2" to the object name dplyr,
#' I could load the library "ggplot2" but the code that I
#' write would appear as though it refers to dplyr.
#'
#' Note: dplyr and ggplot2 are two very useful packages, you
#' should download/learn about them
#'
#' Here's what that means:
dplyr <- "ggplot2"

#' We see what we might expect, if we print the value:
dplyr

#' This function works as appears, it loads dplyr:
library(dplyr)

#' This function doesn't exactly work as appears, it loads ggplot2:
library(dplyr, character.only = TRUE)

#' This fails because there's no ggplot2 object, but it still looks like we want dplyr:
try(eval(as.name(dplyr)))

#' Functions can access these names and/or their values:
show_name_value <- function(z) {
  message(deparse(substitute(z)))
  message(z)
}

show_name_value(dplyr)

#' Point:
#' Understanding the differences between values and names,
#' along with how R looks for objects, will meaningfully help you
#' reason about R code.
#'
#' See here for more: https://adv-r.hadley.nz/

# Setup =============================================

# This is where we install and load relevant R packages
script_depends_on <- c("nhanesA", "dplyr", "ggplot2", "purrr")

for (pkg in script_depends_on) {
  if (!(pkg %in% installed.packages()))
    install.packages(pkg, dependencies = TRUE)
}

library("nhanesA")
library("dplyr")
library("ggplot2")
library("purrr")

#' Definitely check out the nhanesA package author's other work here:
#' https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R

#' First we should consider checking out which tables are available. Uncomment and run the following lines to see what is out there.
#' nhanesTables("LAB", 2005)
#' nhanesTables("DEMO", 2016)
#' nhanesTables("DIET", 2016)
#' nhanesTables("EXAM", 2016)

#' Here, we might check which variables are contained in our tables. Again, uncomment the lines to see what they do.
#nhanesTableVars("DEMO", "DEMO_I")
#nhanesTableVars("LAB", "HIV_I")
#nhanesTableVars("LAB", "CBC_I")

# HIV and WBC ===========================================

#' First, we retrieve the data. Note that the links point to data documentation
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HIV_I.htm
hiv <- nhanes("HIV_I")
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/CBC_I.htm
cbc <- nhanes("CBC_I")

#' Batch recoding is super convenient!!
#' This is necessary because the data arrive in a format that's convenient for computers, but not humans. As a result, we'll have to convert the codes (generally an integer) to more meaningful representations.
#' First, we retrieve the codes and their meanings. We do this by identifying the name of each variable (also called a column) in the hiv data, and asking 'nhanesTranslate' to create an object that shows the code and its meaningful representation. 'lapply' looks at each variable name and does this translation for us, using a function we created on-the-fly.
hiv_translations <-
  lapply(names(hiv)[-1], function(z) {
    translation <- nhanesTranslate("HIV_I", z)[[1]]
    names(translation) <- c(z, paste(z, "v", sep = "_"))
    translation[[z]] <- as.numeric(translation[[z]])
    translation
    })

#' Then, we join each dataframe to the main 'hiv' dataframe,
#' and select only those variables that have been translated to their
#' values. Don't worry about the 'reduce' function, just focus on the
#' fact that we're pulling the values (whose name we indicated end with a '_v')
#' and the patient ID number
hiv <-
  reduce(hiv_translations, left_join, .init = hiv) %>%
  select(SEQN, ends_with("_v"))

hiv_cbc <-
  left_join(hiv, cbc) %>%
  rename(WBC_Count = LBXWBCSI,
         HIV_Status = LBXHIVC_v)

#' Now we're ready to do some analysis!
#' For this example, we'll stick with exploratory data visualization.
#' for a quick plot, we use qplot
qplot(hiv_cbc$LBXHGB) +
  labs(x = "Hemoglobin concentration (g/dL)",
       y = "Count")

#' for a more involved plot, we use ggplot
hiv_cbc %>%
  # First, we have to fix one of the variables so it is pretty on the plot
  modify_at("HIV_Status",
            function(z) {
              z <- strsplit(z, " ")
              map_chr(z, function(z) paste(z, collapse = "\n"))
            }) %>%
  ggplot(aes(y = WBC_Count, x = HIV_Status, fill = HIV_Status)) +
  geom_boxplot() +
  scale_fill_grey() +
  theme(axis.text.x = element_text(color = "black"))

# Body mass index and dietary habits =============================

#' First, we retrieve the data. Note that the links point to data documentation
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DR1TOT_I.htm
diets <- nhanes("DR1TOT_I")
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BMX_I.htm
bmx   <- nhanes("BMX_I")

#' Batch recoding -- super convenient!!
#' Next, we define the columns whose values we will change
diet_names <-
  c(
    "DR1DAY", "DR1LANG", "DR1MRESP", "DR1HELP",
    "DBQ095Z", "DBD100", "DRQSPREP", "DRD370JQ",
    "DRD370M", "DRD370MQ", "DRD370N", "DRD370NQ",
    "DRD370Q", "DRD370QQ", "DRD370R"
  )

#' Then, we retrieve the codes and their corresponding values
diets_translations <-
  lapply(diet_names, function(z) {
    translation <-
      nhanesTranslate(nh_table = "DR1TOT_I", colnames = z)[[1]]

    names(translation) <- c(z, paste(z, "v", sep = "_"))
    translation[[z]] <- as.numeric(translation[[z]])

    translation
  })

#' Then we join the dataframes containing translations with the main 'diets' dataframe
diets <-
  reduce(diets_translations, left_join, .init = diets) %>%
  select(SEQN, ends_with("_v"))

#' next, we put the datasets together
bm_diets <-
  left_join(bmx, diets) %>%
  rename(Eats_Salmon = DRD370M_v,
         Adds_Salt   = DBD100_v,
         BMI = BMXBMI) %>%
  modify_at("Adds_Salt",
            function(z) {
              factor(z,
                     levels = c("Rarely",
                                 "Occasionally",
                                 "Very often",
                                 "Missing", "Don't know"))
              })

#' We take only a sample because using the whole dataset is prohibitive for laptops
bm_diets %>%
  sample_n(5e4) %>%
  ggplot(aes(x = Adds_Salt, y = BMI)) +
  geom_jitter(alpha = 0.5) +
  labs(
    x = "How often {do you/does SP} add ordinary \nsalt to {your/his/her/SP's} food at the table?",
    y = "Body Mass Index"
  )

bm_diets  %>%
  sample_n(5e4) %>%
  ggplot(aes(x = Eats_Salmon, y = BMI)) +
  geom_jitter(alpha = 0.5) +
  labs(
    x = "Ate salmon within the past 30 days",
    y = "Body Mass Index"
  )

#' Income and Fast-food intake ================================================
#' A recent study found that income may be related to fast food intake. Let's see if we can recapitulate that conclusion using the NHANES data.

#' First, we retrieve diet information--this requires two tables, the difference between which is described in the NHANES documentation (always refer to the documentation when you have a question about the data!). Once you have done analysis, you should really put together documentation describing what you did.
diet_quest <- nhanes("DR1TOT_I")
diet_indiv <- nhanes("DR1IFF_I")
#' Here, we select only the relevant columns from our diet questionnaire
intake_q <- select(diet_indiv, SEQN, DR1FS)

#' Next, we retrieve income information and select relevant columns from that table
income <- nhanes("INQ_I")
just_income <- select(income, SEQN, IND235)
#' Here, we put food intake and income data together
income_intake <- left_join(intake_q, income, by = "SEQN")

#' Next, we have to obtain the translations between computer and real-world representations of these data
#' Since the income data aren't expressed in dollars, we have to find a table to translate the coded values into their meaningful representations
lookup_income <- nhanesTranslate("INQ_I", "IND235")[[1]]
names(lookup_income) <- c("IND235", "ID235_V")
lookup_food <- nhanesTranslate("DR1IFF_I", "DR1FS")[[1]]
names(lookup_food) <- c("DR1FS", "DR1FS_V")
#' These modifications are necessary to join our translations
lookup_income <- modify(lookup_income, as.character)
lookup_food <- modify(lookup_food, as.character)
income_intake <- modify(income_intake, as.character)

#' Next, we join the translations to their values
income_intake <- left_join(income_intake, lookup_income)
income_intake <- left_join(income_intake, lookup_food)
#' And then we select only the relevant variables (which, in addition to the patient id/SEQN, are represented here by a '_V' at the end of their name)
income_intake_v <- select(income_intake, SEQN, ends_with("_V"))
#' Because we want to draw a picture, and we have categories that follow a clear order in their magnitude, we have to tell the computer what those income levels' order actually is
income_intake_v$ID235_V <-
  factor(income_intake_v$ID235_V,
         levels = c(
           '$0 - $399',
           '$400 - $799',
           '$800 - $1,249',
           '$1,250 - $1,649',
           '$1,650 - $2,099',
           '$2,100 - $2,899',
           '$2,900 - $3,749',
           '$3,750 - $4,599',
           '$4,600 - $5,399',
           '$5,400 - $6,249',
           '$6,250 - $8,399',
           '$8,400 and over',
           'Dont know',
           'NA',
           'Refused'
         ))

#' Finally, we draw a picture to see if there's any visible difference between fast food consumption frequency and income.
income_intake_v %>%
  distinct(SEQN, .keep_all = TRUE) %>%
ggplot(
       aes(x = ID235_V, y = DR1FS_V)) +
  geom_jitter(alpha = I(1/10)) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1))

#' HPV and Demographics =======================================================
#' Let's see what the frequencies of HPV seropositivity are.
#' Remember, positive is represented by "1".

#' First, we retrieve the data. Note that the links point to data documentation
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm
demos <- nhanes("DEMO_D")
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm
hpv <- nhanes("HPVSWR_D")

hpv_frequencies <-
  vapply(hpv[,-1],
         function(x) sum(x == 1, na.rm = TRUE) / length(x),
         double(1))

#' that's interesting. let's visualize it
hpv_frequencies <-
  data.frame(sero_frequencies = hpv_frequencies,
             serotype = names(hpv_frequencies),
             stringsAsFactors = FALSE)

#' this will order things nicely in our plot
hpv_frequencies$serotype <-
  with(hpv_frequencies,
      factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)]))

ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) +
  geom_segment() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "HPV Serotype", y = "Proportion of subjects")


# That's interesting. Next, let's explore how demographics relate to HPV16 status.
# In class, we made a plot of HPV serostatus against age, colored by education

hpv_demos <- left_join(hpv, demos)

hpv_demos$Education <-
  with(hpv_demos,
       ifelse(DMDEDUC2 < 4, "No College",
              ifelse(is.na(DMDEDUC2), NA, "College")))

hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA

hpv_demos$HPV16 <-
  ifelse(hpv_demos$LBDR16 == 1, "Positive",
         ifelse(is.na(hpv_demos$LBDR16), NA,
                "Negative"))

ggplot(hpv_demos %>% filter(!is.na(Education)),
       aes(x = HPV16, y = RIDAGEYR, fill = Education)) +
  geom_boxplot() +
  labs(x = "HPV16 Serostatus", y = "Age (Years)")


#we're making the assumption that non-positive non-missing observations are negatives
hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1,
                          ifelse(is.na(hpv_demos$LBDR16), NA, 0))

income_labels <-
  c("$0 to $ 4,999",    "$5,000 to $ 9,999",
    "$10,000 to $14,999",     "$15,000 to $19,999",
    "$20,000 to $24,999",     "$25,000 to $34,999",
    "$35,000 to $44,999",     "$45,000 to $54,999",
    "$55,000 to $64,999",     "$65,000 to $74,999",
    "$75,000 and Over",       "Over $20,000",
    "Under $20,000")

hpv_demos$Household_Income <-
  factor(hpv_demos$INDHHINC, levels = 1:13,
         labels = income_labels)

hpv_demos$Ethnicity <-
  factor(hpv_demos$RIDRETH1, levels = 1:5,
         labels = c("Mexican American",
                    "Other Hispanic",
                    "Non-Hispanic White",
                    "Non-Hispanic Black",
                    "Other (including multiracial)"))

hpv_demos_model <-
  glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos)

hpv_model_df <-
  cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model))
hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")]
hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp)
names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High")
hpv_model_df


# Vitamin D and Depression ===========================================

#' First, we retrieve the data. Note that the links point to data documentation
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm
vitamin_d <- nhanes("VID_D")
#' https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm
dpqh <- nhanes("DPQ_D")

#' The depression questionnaire is comprised of ten questions that are
#'     rated on a 0 to 3 scale. For the purposes of this exercise, we'll
#'     consider that a suitable range to fit a linear model

dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x))

for_regression <-
  data.frame(SEQN = dpqh$SEQN,
             total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE))

#it's possible inner joins would have been better here
#  read up on two-table verbs here:
#  https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
for_regression <- inner_join(for_regression, demos)
for_regression <- inner_join(for_regression, vitamin_d)

income_labels <-
  c("$0 to $ 4,999",    "$5,000 to $ 9,999",
    "$10,000 to $14,999",     "$15,000 to $19,999",
    "$20,000 to $24,999",     "$25,000 to $34,999",
    "$35,000 to $44,999",     "$45,000 to $54,999",
    "$55,000 to $64,999",     "$65,000 to $74,999",
    "$75,000 and Over",       "Over $20,000",
    "Under $20,000")
for_regression$Household_Income <-
  factor(for_regression$INDHHINC, levels = 1:13,
         labels = income_labels)

for_regression$Ethnicity <-
  factor(for_regression$RIDRETH1, levels = 1:5,
         labels = c("Mexican American",
                    "Other Hispanic",
                    "Non-Hispanic White",
                    "Non-Hispanic Black",
                    "Other (including multiracial)"))

for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA

for_regression$Education <-
  with(for_regression,
       ifelse(DMDEDUC2 < 4, "No College",
              ifelse(is.na(DMDEDUC2), NA, "College")))


for_regression$Vitamin_D <- for_regression$LBDVIDMS
for_regression$Age <- for_regression$RIDAGEYR

depression_model <-
  lm(total_depression_score ~
       Household_Income + Ethnicity +
       Education + Age + Vitamin_D,
     data = for_regression)

summary(depression_model)

#' Indeed, looking at the "Vitamin D" coefficient, it looks like each
#' extra microgram of serum Vitamin D concentration is statistically
#' significantly associated with a decrease in depression score of about 0.01.
#' Since Vitamin D concentrations can range orders of magnitude, let's calculate
#' how much lower someone's score would be if they had the highest Vitamin D
#' concentration compared to the lowest concentration that we observe:

range_vitamin_d <-
  range(for_regression$Vitamin_D, na.rm = TRUE)

vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d

vitamin_d_deltas[2] - vitamin_d_deltas[1]

#Looks like there is a change of about 1.66 points on the depression score. Wow!
	#' Notes from Clinical Information Systems, October 2 2018
	#'
	# Contents ========================================
	#'
	#' From 2018
	#' 1. Human immunodeficiency status versus white
	#' blood cell counts, visualization and modeling
	#' 2. Body mass index versus dietary habits (salt intake
	#' and salmon consumption)
	#' 3. Income and fast-food consumption
	#'
	#' From 2017:
	#' 3. HPV frequencies, modeling
	#' 4. Vitamin D and depression

	# Preamble =========================================
	#'
	#' In class, someone asked about why we use quotation marks.
	#'
	#' The answer would be that quotation marks indicate the 'string'
	#' (some arbitrary sequence), whereas unquoted things refer to
	#' objects that exist in the environments listed in R's search path.
	#' The search path is where R looks for things, see yours here:
	search()

	#' If I assigned the word "ggplot2" to the object name dplyr,
	#' I could load the library "ggplot2" but the code that I
	#' write would appear as though it refers to dplyr.
	#'
	#' Note: dplyr and ggplot2 are two very useful packages, you
	#' should download/learn about them
	#'
	#' Here's what that means:
	dplyr <- "ggplot2"

	#' We see what we might expect, if we print the value:
	dplyr

	#' This function works as appears, it loads dplyr:
	library(dplyr)

	#' This function doesn't exactly work as appears, it loads ggplot2:
	library(dplyr, character.only = TRUE)

	#' This fails because there's no ggplot2 object, but it still looks like we want dplyr:
	try(eval(as.name(dplyr)))

	#' Functions can access these names and/or their values:
	show_name_value <- function(z) {
	message(deparse(substitute(z)))
	message(z)
	}

	show_name_value(dplyr)

	#' Point:
	#' Understanding the differences between values and names,
	#' along with how R looks for objects, will meaningfully help you
	#' reason about R code.
	#'
	#' See here for more: https://adv-r.hadley.nz/

	# Setup =============================================

	# This is where we install and load relevant R packages
	script_depends_on <- c("nhanesA", "dplyr", "ggplot2", "purrr")

	for (pkg in script_depends_on) {
	if (!(pkg %in% installed.packages()))
	install.packages(pkg, dependencies = TRUE)
	}

	library("nhanesA")
	library("dplyr")
	library("ggplot2")
	library("purrr")

	#' Definitely check out the nhanesA package author's other work here:
	#' https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R

	#' First we should consider checking out which tables are available. Uncomment and run the following lines to see what is out there.
	#' nhanesTables("LAB", 2005)
	#' nhanesTables("DEMO", 2016)
	#' nhanesTables("DIET", 2016)
	#' nhanesTables("EXAM", 2016)

	#' Here, we might check which variables are contained in our tables. Again, uncomment the lines to see what they do.
	#nhanesTableVars("DEMO", "DEMO_I")
	#nhanesTableVars("LAB", "HIV_I")
	#nhanesTableVars("LAB", "CBC_I")

	# HIV and WBC ===========================================

	#' First, we retrieve the data. Note that the links point to data documentation
	#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HIV_I.htm
	hiv <- nhanes("HIV_I")
	#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/CBC_I.htm
	cbc <- nhanes("CBC_I")

	#' Batch recoding is super convenient!!
	#' This is necessary because the data arrive in a format that's convenient for computers, but not humans. As a result, we'll have to convert the codes (generally an integer) to more meaningful representations.
	#' First, we retrieve the codes and their meanings. We do this by identifying the name of each variable (also called a column) in the hiv data, and asking 'nhanesTranslate' to create an object that shows the code and its meaningful representation. 'lapply' looks at each variable name and does this translation for us, using a function we created on-the-fly.
	hiv_translations <-
	lapply(names(hiv)[-1], function(z) {
	translation <- nhanesTranslate("HIV_I", z)[[1]]
	names(translation) <- c(z, paste(z, "v", sep = "_"))
	translation[[z]] <- as.numeric(translation[[z]])
	translation
	})

	#' Then, we join each dataframe to the main 'hiv' dataframe,
	#' and select only those variables that have been translated to their
	#' values. Don't worry about the 'reduce' function, just focus on the
	#' fact that we're pulling the values (whose name we indicated end with a '_v')
	#' and the patient ID number
	hiv <-
	reduce(hiv_translations, left_join, .init = hiv) %>%
	select(SEQN, ends_with("_v"))

	hiv_cbc <-
	left_join(hiv, cbc) %>%
	rename(WBC_Count = LBXWBCSI,
	HIV_Status = LBXHIVC_v)

	#' Now we're ready to do some analysis!
	#' For this example, we'll stick with exploratory data visualization.
	#' for a quick plot, we use qplot
	qplot(hiv_cbc$LBXHGB) +
	labs(x = "Hemoglobin concentration (g/dL)",
	y = "Count")

	#' for a more involved plot, we use ggplot
	hiv_cbc %>%
	# First, we have to fix one of the variables so it is pretty on the plot
	modify_at("HIV_Status",
	function(z) {
	z <- strsplit(z, " ")
	map_chr(z, function(z) paste(z, collapse = "\n"))
	}) %>%
	ggplot(aes(y = WBC_Count, x = HIV_Status, fill = HIV_Status)) +
	geom_boxplot() +
	scale_fill_grey() +
	theme(axis.text.x = element_text(color = "black"))

	# Body mass index and dietary habits =============================

	#' First, we retrieve the data. Note that the links point to data documentation
	#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DR1TOT_I.htm
	diets <- nhanes("DR1TOT_I")
	#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BMX_I.htm
	bmx <- nhanes("BMX_I")

	#' Batch recoding -- super convenient!!
	#' Next, we define the columns whose values we will change
	diet_names <-
	c(
	"DR1DAY", "DR1LANG", "DR1MRESP", "DR1HELP",
	"DBQ095Z", "DBD100", "DRQSPREP", "DRD370JQ",
	"DRD370M", "DRD370MQ", "DRD370N", "DRD370NQ",
	"DRD370Q", "DRD370QQ", "DRD370R"
	)

	#' Then, we retrieve the codes and their corresponding values
	diets_translations <-
	lapply(diet_names, function(z) {
	translation <-
	nhanesTranslate(nh_table = "DR1TOT_I", colnames = z)[[1]]

	names(translation) <- c(z, paste(z, "v", sep = "_"))
	translation[[z]] <- as.numeric(translation[[z]])

	translation
	})

	#' Then we join the dataframes containing translations with the main 'diets' dataframe
	diets <-
	reduce(diets_translations, left_join, .init = diets) %>%
	select(SEQN, ends_with("_v"))

	#' next, we put the datasets together
	bm_diets <-
	left_join(bmx, diets) %>%
	rename(Eats_Salmon = DRD370M_v,
	Adds_Salt = DBD100_v,
	BMI = BMXBMI) %>%
	modify_at("Adds_Salt",
	function(z) {
	factor(z,
	levels = c("Rarely",
	"Occasionally",
	"Very often",
	"Missing", "Don't know"))
	})

	#' We take only a sample because using the whole dataset is prohibitive for laptops
	bm_diets %>%
	sample_n(5e4) %>%
	ggplot(aes(x = Adds_Salt, y = BMI)) +
	geom_jitter(alpha = 0.5) +
	labs(
	x = "How often {do you/does SP} add ordinary \nsalt to {your/his/her/SP's} food at the table?",
	y = "Body Mass Index"
	)

	bm_diets %>%
	sample_n(5e4) %>%
	ggplot(aes(x = Eats_Salmon, y = BMI)) +
	geom_jitter(alpha = 0.5) +
	labs(
	x = "Ate salmon within the past 30 days",
	y = "Body Mass Index"
	)

	#' Income and Fast-food intake ================================================
	#' A recent study found that income may be related to fast food intake. Let's see if we can recapitulate that conclusion using the NHANES data.

	#' First, we retrieve diet information--this requires two tables, the difference between which is described in the NHANES documentation (always refer to the documentation when you have a question about the data!). Once you have done analysis, you should really put together documentation describing what you did.
	diet_quest <- nhanes("DR1TOT_I")
	diet_indiv <- nhanes("DR1IFF_I")
	#' Here, we select only the relevant columns from our diet questionnaire
	intake_q <- select(diet_indiv, SEQN, DR1FS)

	#' Next, we retrieve income information and select relevant columns from that table
	income <- nhanes("INQ_I")
	just_income <- select(income, SEQN, IND235)
	#' Here, we put food intake and income data together
	income_intake <- left_join(intake_q, income, by = "SEQN")

	#' Next, we have to obtain the translations between computer and real-world representations of these data
	#' Since the income data aren't expressed in dollars, we have to find a table to translate the coded values into their meaningful representations
	lookup_income <- nhanesTranslate("INQ_I", "IND235")[[1]]
	names(lookup_income) <- c("IND235", "ID235_V")
	lookup_food <- nhanesTranslate("DR1IFF_I", "DR1FS")[[1]]
	names(lookup_food) <- c("DR1FS", "DR1FS_V")
	#' These modifications are necessary to join our translations
	lookup_income <- modify(lookup_income, as.character)
	lookup_food <- modify(lookup_food, as.character)
	income_intake <- modify(income_intake, as.character)

	#' Next, we join the translations to their values
	income_intake <- left_join(income_intake, lookup_income)
	income_intake <- left_join(income_intake, lookup_food)
	#' And then we select only the relevant variables (which, in addition to the patient id/SEQN, are represented here by a '_V' at the end of their name)
	income_intake_v <- select(income_intake, SEQN, ends_with("_V"))
	#' Because we want to draw a picture, and we have categories that follow a clear order in their magnitude, we have to tell the computer what those income levels' order actually is
	income_intake_v$ID235_V <-
	factor(income_intake_v$ID235_V,
	levels = c(
	'$0 - $399',
	'$400 - $799',
	'$800 - $1,249',
	'$1,250 - $1,649',
	'$1,650 - $2,099',
	'$2,100 - $2,899',
	'$2,900 - $3,749',
	'$3,750 - $4,599',
	'$4,600 - $5,399',
	'$5,400 - $6,249',
	'$6,250 - $8,399',
	'$8,400 and over',
	'Dont know',
	'NA',
	'Refused'
	))

	#' Finally, we draw a picture to see if there's any visible difference between fast food consumption frequency and income.
	income_intake_v %>%
	distinct(SEQN, .keep_all = TRUE) %>%
	ggplot(
	aes(x = ID235_V, y = DR1FS_V)) +
	geom_jitter(alpha = I(1/10)) +
	theme(axis.text.x = element_text(angle = 45, hjust = 1))

	#' HPV and Demographics =======================================================
	#' Let's see what the frequencies of HPV seropositivity are.
	#' Remember, positive is represented by "1".

	#' First, we retrieve the data. Note that the links point to data documentation
	#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm
	demos <- nhanes("DEMO_D")
	#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm
	hpv <- nhanes("HPVSWR_D")

	hpv_frequencies <-
	vapply(hpv[,-1],
	function(x) sum(x == 1, na.rm = TRUE) / length(x),
	double(1))

	#' that's interesting. let's visualize it
	hpv_frequencies <-
	data.frame(sero_frequencies = hpv_frequencies,
	serotype = names(hpv_frequencies),
	stringsAsFactors = FALSE)

	#' this will order things nicely in our plot
	hpv_frequencies$serotype <-
	with(hpv_frequencies,
	factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)]))

	ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) +
	geom_segment() +
	theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
	labs(x = "HPV Serotype", y = "Proportion of subjects")


	# That's interesting. Next, let's explore how demographics relate to HPV16 status.
	# In class, we made a plot of HPV serostatus against age, colored by education

	hpv_demos <- left_join(hpv, demos)

	hpv_demos$Education <-
	with(hpv_demos,
	ifelse(DMDEDUC2 < 4, "No College",
	ifelse(is.na(DMDEDUC2), NA, "College")))

	hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA

	hpv_demos$HPV16 <-
	ifelse(hpv_demos$LBDR16 == 1, "Positive",
	ifelse(is.na(hpv_demos$LBDR16), NA,
	"Negative"))

	ggplot(hpv_demos %>% filter(!is.na(Education)),
	aes(x = HPV16, y = RIDAGEYR, fill = Education)) +
	geom_boxplot() +
	labs(x = "HPV16 Serostatus", y = "Age (Years)")


	#we're making the assumption that non-positive non-missing observations are negatives
	hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1,
	ifelse(is.na(hpv_demos$LBDR16), NA, 0))

	income_labels <-
	c("$0 to $ 4,999", "$5,000 to $ 9,999",
	"$10,000 to $14,999", "$15,000 to $19,999",
	"$20,000 to $24,999", "$25,000 to $34,999",
	"$35,000 to $44,999", "$45,000 to $54,999",
	"$55,000 to $64,999", "$65,000 to $74,999",
	"$75,000 and Over", "Over $20,000",
	"Under $20,000")

	hpv_demos$Household_Income <-
	factor(hpv_demos$INDHHINC, levels = 1:13,
	labels = income_labels)

	hpv_demos$Ethnicity <-
	factor(hpv_demos$RIDRETH1, levels = 1:5,
	labels = c("Mexican American",
	"Other Hispanic",
	"Non-Hispanic White",
	"Non-Hispanic Black",
	"Other (including multiracial)"))

	hpv_demos_model <-
	glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos)

	hpv_model_df <-
	cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model))
	hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")]
	hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp)
	names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High")
	hpv_model_df


	# Vitamin D and Depression ===========================================

	#' First, we retrieve the data. Note that the links point to data documentation
	#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm
	vitamin_d <- nhanes("VID_D")
	#' https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm
	dpqh <- nhanes("DPQ_D")

	#' The depression questionnaire is comprised of ten questions that are
	#' rated on a 0 to 3 scale. For the purposes of this exercise, we'll
	#' consider that a suitable range to fit a linear model

	dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x))

	for_regression <-
	data.frame(SEQN = dpqh$SEQN,
	total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE))

	#it's possible inner joins would have been better here
	# read up on two-table verbs here:
	# https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html
	for_regression <- inner_join(for_regression, demos)
	for_regression <- inner_join(for_regression, vitamin_d)

	income_labels <-
	c("$0 to $ 4,999", "$5,000 to $ 9,999",
	"$10,000 to $14,999", "$15,000 to $19,999",
	"$20,000 to $24,999", "$25,000 to $34,999",
	"$35,000 to $44,999", "$45,000 to $54,999",
	"$55,000 to $64,999", "$65,000 to $74,999",
	"$75,000 and Over", "Over $20,000",
	"Under $20,000")
	for_regression$Household_Income <-
	factor(for_regression$INDHHINC, levels = 1:13,
	labels = income_labels)

	for_regression$Ethnicity <-
	factor(for_regression$RIDRETH1, levels = 1:5,
	labels = c("Mexican American",
	"Other Hispanic",
	"Non-Hispanic White",
	"Non-Hispanic Black",
	"Other (including multiracial)"))

	for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA

	for_regression$Education <-
	with(for_regression,
	ifelse(DMDEDUC2 < 4, "No College",
	ifelse(is.na(DMDEDUC2), NA, "College")))


	for_regression$Vitamin_D <- for_regression$LBDVIDMS
	for_regression$Age <- for_regression$RIDAGEYR

	depression_model <-
	lm(total_depression_score ~
	Household_Income + Ethnicity +
	Education + Age + Vitamin_D,
	data = for_regression)

	summary(depression_model)

	#' Indeed, looking at the "Vitamin D" coefficient, it looks like each
	#' extra microgram of serum Vitamin D concentration is statistically
	#' significantly associated with a decrease in depression score of about 0.01.
	#' Since Vitamin D concentrations can range orders of magnitude, let's calculate
	#' how much lower someone's score would be if they had the highest Vitamin D
	#' concentration compared to the lowest concentration that we observe:

	range_vitamin_d <-
	range(for_regression$Vitamin_D, na.rm = TRUE)

	vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d

	vitamin_d_deltas[2] - vitamin_d_deltas[1]

	#Looks like there is a change of about 1.66 points on the depression score. Wow!