## laurakwiley/jt_processmulttests_makehistogram.R

## jt_processmulttests_makehistogram.R
library(magrittr)
library(tidyr)
library(dplyr)
library(ggplot2)

# Example input. Every (PT_ID, DATETIME) visit appears on two rows: one row
# carries the raw MoCA_total string, the other the raw Total_Correct string,
# and the column that does not apply to a row is NA.
# Built directly with tibble() (tbl_df() is deprecated), which also keeps the
# text columns as character vectors instead of factors.
data <- tibble(
  PT_ID = c(23, 23, 23, 23, 100, 100, 100, 100),
  DATETIME = c(
    "1/1/2013 9:38", "8/9/2014 15:42", "1/1/2013 9:38", "8/9/2014 15:42",
    "12/21/2011 12:10", "8/9/2013 11:18", "12/21/2011 12:10", "8/9/2013 11:18"
  ),
  MoCA_total = c(
    "MHSMOCA:MOCATOTAL = 25", "MHSMOCA:MOCATOTAL = 22", NA, NA,
    "MHSMOCA:MOCATOTAL = 21", "MHSMOCA:MOCATOTAL = 25", NA, NA
  ),
  Total_Correct = c(
    NA, NA, "MHSWLM:CORRECT = 15", "MHSWLM:CORRECT = 20",
    NA, NA, "MHSWLM:CORRECT = 18", "MHSWLM:CORRECT = 24"
  )
)

# Clean up datatypes: parse DATETIME ("month/day/year hour:minute" text) into a
# date-time and make sure both raw score columns are plain character vectors.
# mdy_hm() lives in lubridate, which this script never attaches, so the call is
# namespace-qualified; an unqualified call fails with "could not find function".
data %<>%
  mutate(
    DATETIME = lubridate::mdy_hm(DATETIME),
    MoCA_total = as.character(MoCA_total),
    Total_Correct = as.character(Total_Correct)
  )

# Split each raw result string at "=": the left part is the test label, which
# is redundant with the column name and is dropped; the right part is the
# score, which convert = TRUE turns from text into a number.
data <- data %>%
  separate(MoCA_total, c("text1", "MoCA_total"), sep = "=", convert = TRUE) %>%
  separate(
    Total_Correct, c("text2", "Total_Correct"),
    sep = "=", convert = TRUE
  ) %>%
  select(-c(text1, text2))

# Reshape to a tidy, long data frame with one row per observation.
# gather() stacks the selected columns: "test" holds the name of the column a
# value came from and "score" holds the value itself.
# Missing values are uninformative here (each input row fills only one of the
# two test columns), so na.rm = TRUE drops the empty rows.
data <- data %>%
  gather("test", "score", MoCA_total:Total_Correct, na.rm = TRUE)

# Number the visits for each patient and test in chronological order.
# visit_id runs from 1 to the number of results that patient has for that test
# (e.g. if patient 23 had 4 MoCA_total tests, visit_id would be 1, 2, 3, 4).
# row_number() is used instead of 1:length(PT_ID) because the latter yields
# c(1, 0) for an empty group / zero-row input and makes mutate() fail.
# If you really want pairs of tests you can replace row_number() with
# rep(c(1, 2), times = n() / 2), which gives 1, 2, 1, 2 -- fair warning, this
# will cause problems further down the analysis pipeline.
# Note: the result stays grouped by PT_ID and test, as before.
data %<>%
  group_by(PT_ID, test) %>%
  arrange(DATETIME) %>%
  mutate(visit_id = row_number())

# Add the between-visit difference for each test and plot histograms.
# This is the conceptually simplest (if not the most elegant) route: go wide,
# subtract, then go long again. The pipeline is not assigned back to `data`,
# so the underlying data object is left untouched.
data %>%
  # DATETIME differs between visits, so it has to go before spreading.
  select(-DATETIME) %>%
  # Combine test and visit number into one key, e.g. "MoCA_total.1".
  unite(test_visit, test, visit_id, sep = ".") %>%
  # Long -> wide: one column per test/visit combination.
  spread(test_visit, score) %>%
  # Difference between the second and the first visit for each test.
  mutate(
    MoCA_total.diff = MoCA_total.2 - MoCA_total.1,
    Total_Correct.diff = Total_Correct.2 - Total_Correct.1
  ) %>%
  # Wide -> long again, now including the two .diff columns.
  gather(test_visit, score, MoCA_total.1:Total_Correct.diff) %>%
  # Split the key back into the test name and the visit label (1, 2 or diff).
  separate(test_visit, c("test", "visit"), sep = "\\.") %>%
  # Histogram of score, one panel per visit/difference (rows) and test
  # (columns). Filter before plotting if you want these separately.
  ggplot(aes(x = score)) +
  geom_histogram() +
  facet_grid(visit ~ test)
	library(magrittr)
	library(tidyr)
	library(dplyr)
	library(ggplot2)

	# Example input. Every (PT_ID, DATETIME) visit appears on two rows: one row
	# carries the raw MoCA_total string, the other the raw Total_Correct string,
	# and the column that does not apply to a row is NA.
	# Built directly with tibble() (tbl_df() is deprecated), which also keeps the
	# text columns as character vectors instead of factors.
	data <- tibble(
	  PT_ID = c(23, 23, 23, 23, 100, 100, 100, 100),
	  DATETIME = c(
	    "1/1/2013 9:38", "8/9/2014 15:42", "1/1/2013 9:38", "8/9/2014 15:42",
	    "12/21/2011 12:10", "8/9/2013 11:18", "12/21/2011 12:10", "8/9/2013 11:18"
	  ),
	  MoCA_total = c(
	    "MHSMOCA:MOCATOTAL = 25", "MHSMOCA:MOCATOTAL = 22", NA, NA,
	    "MHSMOCA:MOCATOTAL = 21", "MHSMOCA:MOCATOTAL = 25", NA, NA
	  ),
	  Total_Correct = c(
	    NA, NA, "MHSWLM:CORRECT = 15", "MHSWLM:CORRECT = 20",
	    NA, NA, "MHSWLM:CORRECT = 18", "MHSWLM:CORRECT = 24"
	  )
	)

	# Clean up datatypes: parse DATETIME ("month/day/year hour:minute" text) into a
	# date-time and make sure both raw score columns are plain character vectors.
	# mdy_hm() lives in lubridate, which this script never attaches, so the call is
	# namespace-qualified; an unqualified call fails with "could not find function".
	data %<>%
	  mutate(
	    DATETIME = lubridate::mdy_hm(DATETIME),
	    MoCA_total = as.character(MoCA_total),
	    Total_Correct = as.character(Total_Correct)
	  )

	# Split each raw result string at "=": the left part is the test label, which
	# is redundant with the column name and is dropped; the right part is the
	# score, which convert = TRUE turns from text into a number.
	data <- data %>%
	  separate(MoCA_total, c("text1", "MoCA_total"), sep = "=", convert = TRUE) %>%
	  separate(
	    Total_Correct, c("text2", "Total_Correct"),
	    sep = "=", convert = TRUE
	  ) %>%
	  select(-c(text1, text2))

	# Reshape to a tidy, long data frame with one row per observation.
	# gather() stacks the selected columns: "test" holds the name of the column a
	# value came from and "score" holds the value itself.
	# Missing values are uninformative here (each input row fills only one of the
	# two test columns), so na.rm = TRUE drops the empty rows.
	data <- data %>%
	  gather("test", "score", MoCA_total:Total_Correct, na.rm = TRUE)

	# Number the visits for each patient and test in chronological order.
	# visit_id runs from 1 to the number of results that patient has for that test
	# (e.g. if patient 23 had 4 MoCA_total tests, visit_id would be 1, 2, 3, 4).
	# row_number() is used instead of 1:length(PT_ID) because the latter yields
	# c(1, 0) for an empty group / zero-row input and makes mutate() fail.
	# If you really want pairs of tests you can replace row_number() with
	# rep(c(1, 2), times = n() / 2), which gives 1, 2, 1, 2 -- fair warning, this
	# will cause problems further down the analysis pipeline.
	# Note: the result stays grouped by PT_ID and test, as before.
	data %<>%
	  group_by(PT_ID, test) %>%
	  arrange(DATETIME) %>%
	  mutate(visit_id = row_number())

	# Add the between-visit difference for each test and plot histograms.
	# This is the conceptually simplest (if not the most elegant) route: go wide,
	# subtract, then go long again. The pipeline is not assigned back to `data`,
	# so the underlying data object is left untouched.
	data %>%
	  # DATETIME differs between visits, so it has to go before spreading.
	  select(-DATETIME) %>%
	  # Combine test and visit number into one key, e.g. "MoCA_total.1".
	  unite(test_visit, test, visit_id, sep = ".") %>%
	  # Long -> wide: one column per test/visit combination.
	  spread(test_visit, score) %>%
	  # Difference between the second and the first visit for each test.
	  mutate(
	    MoCA_total.diff = MoCA_total.2 - MoCA_total.1,
	    Total_Correct.diff = Total_Correct.2 - Total_Correct.1
	  ) %>%
	  # Wide -> long again, now including the two .diff columns.
	  gather(test_visit, score, MoCA_total.1:Total_Correct.diff) %>%
	  # Split the key back into the test name and the visit label (1, 2 or diff).
	  separate(test_visit, c("test", "visit"), sep = "\\.") %>%
	  # Histogram of score, one panel per visit/difference (rows) and test
	  # (columns). Filter before plotting if you want these separately.
	  ggplot(aes(x = score)) +
	  geom_histogram() +
	  facet_grid(visit ~ test)