Created
November 5, 2018 15:59
-
-
Save mustafaascha/2ee5c1de6a2b1e844483b95188630a99 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Notes from Clinical Information Systems, October 2 2018 | |
#' | |
# Contents ======================================== | |
#' | |
#' From 2018 | |
#' 1. Human immunodeficiency status versus white | |
#' blood cell counts, visualization and modeling | |
#' 2. Body mass index versus dietary habits (salt intake | |
#' and salmon consumption) | |
#' 3. Income and fast-food consumption | |
#' | |
#' From 2017: | |
#' 3. HPV frequencies, modeling | |
#' 4. Vitamin D and depression | |
# Preamble ========================================= | |
#' | |
#' In class, someone asked about why we use quotation marks. | |
#' | |
#' The answer would be that quotation marks indicate the 'string' | |
#' (some arbitrary sequence), whereas unquoted things refer to | |
#' objects that exist in the environments listed in R's search path. | |
#' The search path is where R looks for things, see yours here: | |
search() | |
#' If I assigned the word "ggplot2" to the object name dplyr, | |
#' I could load the library "ggplot2" but the code that I | |
#' write would appear as though it refers to dplyr. | |
#' | |
#' Note: dplyr and ggplot2 are two very useful packages, you | |
#' should download/learn about them | |
#' | |
#' Here's what that means: | |
dplyr <- "ggplot2" | |
#' We see what we might expect, if we print the value: | |
dplyr | |
#' This function works as appears, it loads dplyr: | |
library(dplyr) | |
#' This function doesn't exactly work as appears, it loads ggplot2: | |
library(dplyr, character.only = TRUE) | |
#' This fails because there's no ggplot2 object, but it still looks like we want dplyr: | |
try(eval(as.name(dplyr))) | |
#' Functions can access these names and/or their values: | |
show_name_value <- function(z) { | |
message(deparse(substitute(z))) | |
message(z) | |
} | |
show_name_value(dplyr) | |
#' Point: | |
#' Understanding the differences between values and names, | |
#' along with how R looks for objects, will meaningfully help you | |
#' reason about R code. | |
#' | |
#' See here for more: https://adv-r.hadley.nz/ | |
# Setup ============================================= | |
# This is where we install and load relevant R packages | |
script_depends_on <- c("nhanesA", "dplyr", "ggplot2", "purrr") | |
for (pkg in script_depends_on) { | |
if (!(pkg %in% installed.packages())) | |
install.packages(pkg, dependencies = TRUE) | |
} | |
library("nhanesA") | |
library("dplyr") | |
library("ggplot2") | |
library("purrr") | |
#' Definitely check out the nhanesA package author's other work here: | |
#' https://github.com/cjendres1/nhanes/blob/master/vignettes/Introducing_nhanesA.R | |
#' First we should consider checking out which tables are available. Uncomment and run the following lines to see what is out there. | |
#' nhanesTables("LAB", 2005) | |
#' nhanesTables("DEMO", 2016) | |
#' nhanesTables("DIET", 2016) | |
#' nhanesTables("EXAM", 2016) | |
#' Here, we might check which variables are contained in our tables. Again, uncomment the lines to see what they do. | |
#nhanesTableVars("DEMO", "DEMO_I") | |
#nhanesTableVars("LAB", "HIV_I") | |
#nhanesTableVars("LAB", "CBC_I") | |
# HIV and WBC =========================================== | |
#' First, we retrieve the data. Note that the links point to data documentation | |
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/HIV_I.htm | |
hiv <- nhanes("HIV_I") | |
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/CBC_I.htm | |
cbc <- nhanes("CBC_I") | |
#' Batch recoding is super convenient!! | |
#' This is necessary because the data arrive in a format that's convenient for computers, but not humans. As a result, we'll have to convert the codes (generally an integer) to more meaningful representations. | |
#' First, we retrieve the codes and their meanings. We do this by identifying the name of each variable (also called a column) in the hiv data, and asking 'nhanesTranslate' to create an object that shows the code and its meaningful representation. 'lapply' looks at each variable name and does this translation for us, using a function we created on-the-fly. | |
hiv_translations <- | |
lapply(names(hiv)[-1], function(z) { | |
translation <- nhanesTranslate("HIV_I", z)[[1]] | |
names(translation) <- c(z, paste(z, "v", sep = "_")) | |
translation[[z]] <- as.numeric(translation[[z]]) | |
translation | |
}) | |
#' Then, we join each dataframe to the main 'hiv' dataframe, | |
#' and select only those variables that have been translated to their | |
#' values. Don't worry about the 'reduce' function, just focus on the | |
#' fact that we're pulling the values (whose name we indicated end with a '_v') | |
#' and the patient ID number | |
hiv <- | |
reduce(hiv_translations, left_join, .init = hiv) %>% | |
select(SEQN, ends_with("_v")) | |
hiv_cbc <- | |
left_join(hiv, cbc) %>% | |
rename(WBC_Count = LBXWBCSI, | |
HIV_Status = LBXHIVC_v) | |
#' Now we're ready to do some analysis! | |
#' For this example, we'll stick with exploratory data visualization. | |
#' for a quick plot, we use qplot | |
qplot(hiv_cbc$LBXHGB) + | |
labs(x = "Hemoglobin concentration (g/dL)", | |
y = "Count") | |
#' for a more involved plot, we use ggplot | |
hiv_cbc %>% | |
# First, we have to fix one of the variables so it is pretty on the plot | |
modify_at("HIV_Status", | |
function(z) { | |
z <- strsplit(z, " ") | |
map_chr(z, function(z) paste(z, collapse = "\n")) | |
}) %>% | |
ggplot(aes(y = WBC_Count, x = HIV_Status, fill = HIV_Status)) + | |
geom_boxplot() + | |
scale_fill_grey() + | |
theme(axis.text.x = element_text(color = "black")) | |
# Body mass index and dietary habits ============================= | |
#' First, we retrieve the data. Note that the links point to data documentation | |
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DR1TOT_I.htm | |
diets <- nhanes("DR1TOT_I") | |
#' https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/BMX_I.htm | |
bmx <- nhanes("BMX_I") | |
#' Batch recoding -- super convenient!! | |
#' Next, we define the columns whose values we will change | |
diet_names <- | |
c( | |
"DR1DAY", "DR1LANG", "DR1MRESP", "DR1HELP", | |
"DBQ095Z", "DBD100", "DRQSPREP", "DRD370JQ", | |
"DRD370M", "DRD370MQ", "DRD370N", "DRD370NQ", | |
"DRD370Q", "DRD370QQ", "DRD370R" | |
) | |
#' Then, we retrieve the codes and their corresponding values | |
diets_translations <- | |
lapply(diet_names, function(z) { | |
translation <- | |
nhanesTranslate(nh_table = "DR1TOT_I", colnames = z)[[1]] | |
names(translation) <- c(z, paste(z, "v", sep = "_")) | |
translation[[z]] <- as.numeric(translation[[z]]) | |
translation | |
}) | |
#' Then we join the dataframes containing translations with the main 'diets' dataframe | |
diets <- | |
reduce(diets_translations, left_join, .init = diets) %>% | |
select(SEQN, ends_with("_v")) | |
#' next, we put the datasets together | |
bm_diets <- | |
left_join(bmx, diets) %>% | |
rename(Eats_Salmon = DRD370M_v, | |
Adds_Salt = DBD100_v, | |
BMI = BMXBMI) %>% | |
modify_at("Adds_Salt", | |
function(z) { | |
factor(z, | |
levels = c("Rarely", | |
"Occasionally", | |
"Very often", | |
"Missing", "Don't know")) | |
}) | |
#' We take only a sample because using the whole dataset is prohibitive for laptops | |
bm_diets %>% | |
sample_n(5e4) %>% | |
ggplot(aes(x = Adds_Salt, y = BMI)) + | |
geom_jitter(alpha = 0.5) + | |
labs( | |
x = "How often {do you/does SP} add ordinary \nsalt to {your/his/her/SP's} food at the table?", | |
y = "Body Mass Index" | |
) | |
bm_diets %>% | |
sample_n(5e4) %>% | |
ggplot(aes(x = Eats_Salmon, y = BMI)) + | |
geom_jitter(alpha = 0.5) + | |
labs( | |
x = "Ate salmon within the past 30 days", | |
y = "Body Mass Index" | |
) | |
#' Income and Fast-food intake ================================================ | |
#' A recent study found that income may be related to fast food intake. Let's see if we can recapitulate that conclusion using the NHANES data. | |
#' First, we retrieve diet information--this requires two tables, the difference between which is described in the NHANES documentation (always refer to the documentation when you have a question about the data!). Once you have done analysis, you should really put together documentation describing what you did. | |
diet_quest <- nhanes("DR1TOT_I") | |
diet_indiv <- nhanes("DR1IFF_I") | |
#' Here, we select only the relevant columns from our diet questionnaire | |
intake_q <- select(diet_indiv, SEQN, DR1FS) | |
#' Next, we retrieve income information and select relevant columns from that table | |
income <- nhanes("INQ_I") | |
just_income <- select(income, SEQN, IND235) | |
#' Here, we put food intake and income data together | |
income_intake <- left_join(intake_q, income, by = "SEQN") | |
#' Next, we have to obtain the translations between computer and real-world representations of these data | |
#' Since the income data aren't expressed in dollars, we have to find a table to translate the coded values into their meaningful representations | |
lookup_income <- nhanesTranslate("INQ_I", "IND235")[[1]] | |
names(lookup_income) <- c("IND235", "ID235_V") | |
lookup_food <- nhanesTranslate("DR1IFF_I", "DR1FS")[[1]] | |
names(lookup_food) <- c("DR1FS", "DR1FS_V") | |
#' These modifications are necessary to join our translations | |
lookup_income <- modify(lookup_income, as.character) | |
lookup_food <- modify(lookup_food, as.character) | |
income_intake <- modify(income_intake, as.character) | |
#' Next, we join the translations to their values | |
income_intake <- left_join(income_intake, lookup_income) | |
income_intake <- left_join(income_intake, lookup_food) | |
#' And then we select only the relevant variables (which, in addition to the patient id/SEQN, are represented here by a '_V' at the end of their name) | |
income_intake_v <- select(income_intake, SEQN, ends_with("_V")) | |
#' Because we want to draw a picture, and we have categories that follow a clear order in their magnitude, we have to tell the computer what those income levels' order actually is | |
income_intake_v$ID235_V <- | |
factor(income_intake_v$ID235_V, | |
levels = c( | |
'$0 - $399', | |
'$400 - $799', | |
'$800 - $1,249', | |
'$1,250 - $1,649', | |
'$1,650 - $2,099', | |
'$2,100 - $2,899', | |
'$2,900 - $3,749', | |
'$3,750 - $4,599', | |
'$4,600 - $5,399', | |
'$5,400 - $6,249', | |
'$6,250 - $8,399', | |
'$8,400 and over', | |
'Dont know', | |
'NA', | |
'Refused' | |
)) | |
#' Finally, we draw a picture to see if there's any visible difference between fast food consumption frequency and income. | |
income_intake_v %>% | |
distinct(SEQN, .keep_all = TRUE) %>% | |
ggplot( | |
aes(x = ID235_V, y = DR1FS_V)) + | |
geom_jitter(alpha = I(1/10)) + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) | |
#' HPV and Demographics ======================================================= | |
#' Let's see what the frequencies of HPV seropositivity are. | |
#' Remember, positive is represented by "1". | |
#' First, we retrieve the data. Note that the links point to data documentation | |
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/DEMO_D.htm | |
demos <- nhanes("DEMO_D") | |
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/HPVSWR_D.htm | |
hpv <- nhanes("HPVSWR_D") | |
hpv_frequencies <- | |
vapply(hpv[,-1], | |
function(x) sum(x == 1, na.rm = TRUE) / length(x), | |
double(1)) | |
#' that's interesting. let's visualize it | |
hpv_frequencies <- | |
data.frame(sero_frequencies = hpv_frequencies, | |
serotype = names(hpv_frequencies), | |
stringsAsFactors = FALSE) | |
#' this will order things nicely in our plot | |
hpv_frequencies$serotype <- | |
with(hpv_frequencies, | |
factor(serotype, levels = serotype[order(sero_frequencies, decreasing = FALSE)])) | |
ggplot(hpv_frequencies, aes(x = serotype, xend = serotype, y = 0, yend = sero_frequencies)) + | |
geom_segment() + | |
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + | |
labs(x = "HPV Serotype", y = "Proportion of subjects") | |
# That's interesting. Next, let's explore how demographics relate to HPV16 status. | |
# In class, we made a plot of HPV serostatus against age, colored by education | |
hpv_demos <- left_join(hpv, demos) | |
hpv_demos$Education <- | |
with(hpv_demos, | |
ifelse(DMDEDUC2 < 4, "No College", | |
ifelse(is.na(DMDEDUC2), NA, "College"))) | |
hpv_demos$Education[hpv_demos$DMDEDUC2 > 5] <- NA | |
hpv_demos$HPV16 <- | |
ifelse(hpv_demos$LBDR16 == 1, "Positive", | |
ifelse(is.na(hpv_demos$LBDR16), NA, | |
"Negative")) | |
ggplot(hpv_demos %>% filter(!is.na(Education)), | |
aes(x = HPV16, y = RIDAGEYR, fill = Education)) + | |
geom_boxplot() + | |
labs(x = "HPV16 Serostatus", y = "Age (Years)") | |
#we're making the assumption that non-positive non-missing observations are negatives | |
hpv_demos$HPV16 <- ifelse(hpv_demos$LBDR16 == 1, 1, | |
ifelse(is.na(hpv_demos$LBDR16), NA, 0)) | |
income_labels <- | |
c("$0 to $ 4,999", "$5,000 to $ 9,999", | |
"$10,000 to $14,999", "$15,000 to $19,999", | |
"$20,000 to $24,999", "$25,000 to $34,999", | |
"$35,000 to $44,999", "$45,000 to $54,999", | |
"$55,000 to $64,999", "$65,000 to $74,999", | |
"$75,000 and Over", "Over $20,000", | |
"Under $20,000") | |
hpv_demos$Household_Income <- | |
factor(hpv_demos$INDHHINC, levels = 1:13, | |
labels = income_labels) | |
hpv_demos$Ethnicity <- | |
factor(hpv_demos$RIDRETH1, levels = 1:5, | |
labels = c("Mexican American", | |
"Other Hispanic", | |
"Non-Hispanic White", | |
"Non-Hispanic Black", | |
"Other (including multiracial)")) | |
hpv_demos_model <- | |
glm(HPV16 ~ RIDAGEYR + Household_Income + Ethnicity, data = hpv_demos) | |
hpv_model_df <- | |
cbind(broom::tidy(hpv_demos_model), confint(hpv_demos_model)) | |
hpv_model_df <- hpv_model_df[,c("term", "estimate", "2.5 %", "97.5 %")] | |
hpv_model_df[,2:4] <- lapply(hpv_model_df[,2:4], exp) | |
names(hpv_model_df) <- c("term", "Odds_Ratio", "Conf.Low", "Conf.High") | |
hpv_model_df | |
# Vitamin D and Depression =========================================== | |
#' First, we retrieve the data. Note that the links point to data documentation | |
#' https://wwwn.cdc.gov/nchs/nhanes/2005-2006/VID_D.htm | |
vitamin_d <- nhanes("VID_D") | |
#' https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DPQ_H.htm | |
dpqh <- nhanes("DPQ_D") | |
#' The depression questionnaire is comprised of ten questions that are | |
#' rated on a 0 to 3 scale. For the purposes of this exercise, we'll | |
#' consider that a suitable range to fit a linear model | |
dpqh[,-1] <- lapply(dpqh[,-1], function(x) ifelse(x > 3, NA, x)) | |
for_regression <- | |
data.frame(SEQN = dpqh$SEQN, | |
total_depression_score = rowSums(dpqh[,-1], na.rm = TRUE)) | |
#it's possible inner joins would have been better here | |
# read up on two-table verbs here: | |
# https://cran.r-project.org/web/packages/dplyr/vignettes/two-table.html | |
for_regression <- inner_join(for_regression, demos) | |
for_regression <- inner_join(for_regression, vitamin_d) | |
income_labels <- | |
c("$0 to $ 4,999", "$5,000 to $ 9,999", | |
"$10,000 to $14,999", "$15,000 to $19,999", | |
"$20,000 to $24,999", "$25,000 to $34,999", | |
"$35,000 to $44,999", "$45,000 to $54,999", | |
"$55,000 to $64,999", "$65,000 to $74,999", | |
"$75,000 and Over", "Over $20,000", | |
"Under $20,000") | |
for_regression$Household_Income <- | |
factor(for_regression$INDHHINC, levels = 1:13, | |
labels = income_labels) | |
for_regression$Ethnicity <- | |
factor(for_regression$RIDRETH1, levels = 1:5, | |
labels = c("Mexican American", | |
"Other Hispanic", | |
"Non-Hispanic White", | |
"Non-Hispanic Black", | |
"Other (including multiracial)")) | |
for_regression$Education[for_regression$DMDEDUC2 > 5] <- NA | |
for_regression$Education <- | |
with(for_regression, | |
ifelse(DMDEDUC2 < 4, "No College", | |
ifelse(is.na(DMDEDUC2), NA, "College"))) | |
for_regression$Vitamin_D <- for_regression$LBDVIDMS | |
for_regression$Age <- for_regression$RIDAGEYR | |
depression_model <- | |
lm(total_depression_score ~ | |
Household_Income + Ethnicity + | |
Education + Age + Vitamin_D, | |
data = for_regression) | |
summary(depression_model) | |
#' Indeed, looking at the "Vitamin D" coefficient, it looks like each | |
#' extra microgram of serum Vitamin D concentration is statistically | |
#' significantly associated with a decrease in depression score of about 0.01. | |
#' Since Vitamin D concentrations can range orders of magnitude, let's calculate | |
#' how much lower someone's score would be if they had the highest Vitamin D | |
#' concentration compared to the lowest concentration that we observe: | |
range_vitamin_d <- | |
range(for_regression$Vitamin_D, na.rm = TRUE) | |
vitamin_d_deltas <- depression_model$coefficients["Vitamin_D"] * range_vitamin_d | |
vitamin_d_deltas[2] - vitamin_d_deltas[1] | |
#Looks like there is a change of about 1.66 points on the depression score. Wow! | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment