Skip to content

Instantly share code, notes, and snippets.

@laurakwiley
Created December 5, 2015 21:33
Show Gist options
  • Save laurakwiley/4bfc2a657d57280b0924 to your computer and use it in GitHub Desktop.
Save laurakwiley/4bfc2a657d57280b0924 to your computer and use it in GitHub Desktop.
library(magrittr)
library(tidyr)
library(dplyr)
library(ggplot2)
# Example input: one row per patient / timestamp / test, holding the raw
# "TESTNAME:FIELD = <number>" strings. Each row carries a value for only one of
# the two tests; the other test's column is NA.
# stringsAsFactors = FALSE keeps the text columns as character on every R
# version, and as_tibble() replaces the deprecated tbl_df().
data <- data.frame(
  PT_ID = c(23, 23, 23, 23, 100, 100, 100, 100),
  DATETIME = c(
    "1/1/2013 9:38", "8/9/2014 15:42", "1/1/2013 9:38", "8/9/2014 15:42",
    "12/21/2011 12:10", "8/9/2013 11:18", "12/21/2011 12:10", "8/9/2013 11:18"
  ),
  MoCA_total = c(
    "MHSMOCA:MOCATOTAL = 25", "MHSMOCA:MOCATOTAL = 22", NA, NA,
    "MHSMOCA:MOCATOTAL = 21", "MHSMOCA:MOCATOTAL = 25", NA, NA
  ),
  Total_Correct = c(
    NA, NA, "MHSWLM:CORRECT = 15", "MHSWLM:CORRECT = 20",
    NA, NA, "MHSWLM:CORRECT = 18", "MHSWLM:CORRECT = 24"
  ),
  stringsAsFactors = FALSE
) %>%
  as_tibble()
# Clean up datatypes.
# DATETIME is parsed from "month/day/year hour:minute" text into POSIXct (UTC)
# with base R, so the script does not depend on lubridate's mdy_hm(), which was
# called without lubridate ever being loaded. The as.character() calls guard
# against the columns arriving as factors.
data %<>%
  mutate(
    DATETIME = as.POSIXct(
      as.character(DATETIME),
      format = "%m/%d/%Y %H:%M",
      tz = "UTC"
    ),
    MoCA_total = as.character(MoCA_total),
    Total_Correct = as.character(Total_Correct)
  )
# Pull the numeric score out of each "TESTNAME:FIELD = <number>" string.
# Splitting on "=" leaves the test label on the left (redundant, since the
# column name already identifies the test) and the score on the right;
# convert = TRUE asks tidyr to turn the score text into a number.
data <- separate(
  data,
  col = MoCA_total,
  into = c("text1", "MoCA_total"),
  sep = "=",
  convert = TRUE
)
data <- separate(
  data,
  col = Total_Correct,
  into = c("text2", "Total_Correct"),
  sep = "=",
  convert = TRUE
)
# Drop the two throwaway label columns produced by the splits.
data <- select(data, -text1, -text2)
# Reshape to a tidy (long) data frame: one row per observation.
# gather() builds a key column ("test") holding the names of the selected
# columns and a value column ("score") holding their contents.
# Missing scores carry no information here, so na.rm = TRUE drops those rows.
data <- gather(
  data,
  key = "test",
  value = "score",
  MoCA_total:Total_Correct,
  na.rm = TRUE
)
# Assign a visit number for each person and test.
# Rows are sorted by patient, test and timestamp first, so that within every
# (PT_ID, test) group the visits are numbered chronologically: 1, 2, ..., n
# (e.g. a patient with 4 MoCA_total results gets visit ids 1 through 4).
# The sort names the grouping columns explicitly because arrange() on a grouped
# tibble ignores the groups in current dplyr.
# If you really want pairs of tests you can replace "seq_along(PT_ID)" with
# "rep(c(1, 2), times = length(PT_ID) / 2)", which yields 1, 2, 1, 2 -- fair
# warning, that will cause problems further down this analysis pipeline.
data %<>%
  arrange(PT_ID, test, DATETIME) %>%
  group_by(PT_ID, test) %>%
  mutate(visit_id = seq_along(PT_ID)) %>% # seq_along() is safe for empty input
  ungroup() # do not leak the grouping into later steps
# Compute the between-visit difference for each test and plot histograms.
# There are a few ways to do this; this is the conceptually simplest, if least
# elegant, one: go wide, subtract, go long again.
# The whole pipeline is built in one go and is *not* assigned back to `data`,
# so the underlying data stays untouched.
# NOTE: the subtraction assumes every patient has exactly two visits per test
# (columns *.1 and *.2); additional visits are carried along but not differenced.
data %>%
  # Drop any grouping left over from earlier steps so that unite() is not asked
  # to consume a grouping column.
  ungroup() %>%
  # The differing DATETIMEs would keep visits on separate rows after spread().
  select(-DATETIME) %>%
  # Join test and visit id so each test/visit pair is a unique key per patient.
  unite(col = test_visit, test, visit_id, sep = ".") %>%
  # Long -> wide: one column per test/visit pair.
  spread(key = test_visit, value = score) %>%
  # Second visit minus first visit, per test.
  mutate(
    MoCA_total.diff = MoCA_total.2 - MoCA_total.1,
    Total_Correct.diff = Total_Correct.2 - Total_Correct.1
  ) %>%
  # Wide -> long again; everything except the patient id is a score column.
  gather(key = test_visit, value = score, -PT_ID) %>%
  # Split the combined key back into test and visit ("1", "2" or "diff").
  separate(col = test_visit, into = c("test", "visit"), sep = "\\.") %>%
  # Histograms with score on the x axis, one panel per visit/difference (rows)
  # and test (columns). Filter before plotting if you want them separately.
  ggplot() +
  geom_histogram(aes(x = score)) +
  facet_grid(visit ~ test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment