Created
January 22, 2016 12:09
-
-
Save dantalus/3db6756ebce1e9c0e260 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Useful libraries #### | |
library(readxl) # excel | |
library(plyr) # Tidy data | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) # Plot data | |
library(RColorBrewer) | |
library(ggrepel) | |
library(gmodels) # Describe data | |
library(xtable) | |
library("utils") # Read scripts from github | |
library(devtools) | |
# Keep/migrate useful functions here | |
source_url("https://raw.githubusercontent.com/dantalus/Rcode/master/propMiss.R") | |
source_url("https://gist.githubusercontent.com/dantalus/25237be808003673521b/raw/d251a4aab55f1cf0044484f2226213920463b6c5/plotData.R") | |
# Read in the data ####
# Received from Joe Eustace by email Nov 10, 2015
# `base` keeps the untouched import; all cleaning happens on `data`.
base <- read_excel("data/Triplicate Analysis 130215 b.xlsx")

# Get rid of blank columns ####
# Identified by inspection: keep column 3 through the second-to-last column.
data <- base[, seq(3, ncol(base) - 1)]
# Save variable labels ####
# Keep the original spreadsheet headers before the columns are renamed.
varlabs <- colnames(data)

# Tidy variable names ####
# Columns are renamed by position, so the order below must match the order of
# the columns kept from the spreadsheet.
clean_names <- c(
  "id",
  "measurement",
  "date.m",
  "time.m",
  "setting",
  "medications",
  "comments",
  "operator.1",
  "ambient.temp",
  "core.temp",
  "instatemp",
  "blank.1",
  "blank.2",
  "nasopharyngeal",
  "unit.number",
  "notes",
  "downloaded",
  "operator.2",
  "patient",
  "tbody",
  "time.date",
  "core.temp.logged",
  "data.header",
  "measurement.id",
  "unit",
  "raw.1",
  "raw.2",
  "raw.3",
  "raw.4",
  "surface.temp.patient",
  "internal.unit.ambient.temp",
  "external.unit.ambient.temp",
  "vcc1",
  "vcc2",
  "body.temperature.recorded.in.unit",
  "blank.3",
  "sensor.id",
  "blank.4",
  "blank.5",
  "algorithm",
  "fw",
  "header"
)

# A positional rename mislabels columns if the spreadsheet layout changes, so
# stop here unless there is exactly one name per column.
stopifnot(length(clean_names) == ncol(data))
colnames(data) <- clean_names
rm(clean_names)
# Fix measurement IDs ####
# Strip every "#" from the measurement labels, then convert to integer.
data$measurement <- data$measurement %>%
  gsub("#", "", .) %>%
  as.integer()

# View(data[is.na(data$id), ]) # 1 missing id to fix
data$id <- replace(data$id, is.na(data$id), 22)

# Remove rows with all missing data ####
# View(data[is.na(data$date.m), ]) # 6 obs with no data
# A missing date.m is used as the marker for an empty row.
data <- data[which(!is.na(data$date.m)), ]
# Tidy time ####
# There are several errors to correct, and changes to POSIX class. These need
# to be right because measures are in triplicate, and triplets are identified
# by shared times.
data$time.date <- gsub("2014", "2015", data$time.date) # Correction to date

# By inspection, there is an apparent error in these day values.
# Fixes are applied top to bottom. The order matters for ids 10 and 11: "/08"
# must become "/09" before "/06" becomes "/08", otherwise the freshly
# corrected "/08" values would be shifted a second time.
day_fixes <- list(
  list(id = 7,  from = "/06", to = "/08"),
  list(id = 8,  from = "/06", to = "/08"),
  list(id = 10, from = "/08", to = "/09"),
  list(id = 10, from = "/06", to = "/08"),
  list(id = 11, from = "/08", to = "/09"),
  list(id = 11, from = "/06", to = "/08"),
  list(id = 12, from = "/06", to = "/08")
)
for (fix in day_fixes) {
  rows <- data$id == fix$id
  data$time.date[rows] <- gsub(fix$from, fix$to, data$time.date[rows])
}
rm(day_fixes, fix, rows)

# Parse in GMT, the same zone used for new.time further down, so the two
# clocks are directly comparable and the result does not depend on the local
# time zone of the machine running the script.
data$time.date <- as.POSIXct(data$time.date,
                             format = " %Y/%m/%d %H:%M:%S",
                             tz = "GMT")

# By inspection, these are way off, and the new.time variable (below) fits in
# with other times.
data$time.date[data$id == 2 & data$measurement %in% 28:30] <- NA # TODO
# By inspection, two times were swapped in error - swap them to correct obs.
# The same error was not in the $time.date variable.
is_m66 <- data$id == 21 & data$measurement == 66
is_m67 <- data$id == 21 & data$measurement == 67
time_m66 <- data$time.m[is_m66]
time_m67 <- data$time.m[is_m67]
data$time.m[is_m66] <- time_m67
data$time.m[is_m67] <- time_m66
rm(is_m66, is_m67, time_m66, time_m67)

# Format with two decimals so a trailing "00" is not dropped, then replace
# "." with ":".
data$time.m <- gsub("\\.", "\\:", sprintf("%.2f", data$time.m))

# Add leading 0 as needed (4-character values such as "9:30")
needs_zero <- nchar(data$time.m) == 4
data$time.m[needs_zero] <- paste0("0", data$time.m[needs_zero])
rm(needs_zero)

# Append seconds; missing times were formatted as "NA", so turn the resulting
# "NA:00" back into a real NA.
data$time.m <- paste0(data$time.m, ":00")
data$time.m[data$time.m %in% "NA:00"] <- NA # Correction

# By inspection, there is an apparent error in these hour values
for (m in 49:51) {
  rows <- data$id == 21 & data$measurement == m
  data$time.m[rows] <- gsub("04:", "16:", data$time.m[rows])
}
rm(m, rows)

data$date.m <- data$date.m %>%
  gsub("2105", "2015", .) %>% # Correction
  gsub("-10-", "-01-", .)     # Correction TODO

# By inspection, there is an apparent error in these month values
for (patient_id in 44:46) {
  rows <- data$id == patient_id
  data$date.m[rows] <- gsub("-01-", "-02-", data$date.m[rows])
}
rm(patient_id, rows)
# Put it all together and change to POSIX class.
data$new.time <- as.POSIXct(paste(data$date.m, data$time.m),
                            format = "%Y-%m-%d %H:%M:%S",
                            tz = "GMT")
# plot(data$new.time, data$time.date) # A few outliers for time TODO

# Measurements are in triplicate, identified by shared times. Create a new
# numeric variable reflecting each set of measurements, sequentially, within
# each patient.
# ave() applies the ranking within each id and writes every result back to
# the row it came from, so this works whatever the row order or id values are.
# Within a patient, the earliest distinct time gets 1, the next 2, and so on;
# rows with a missing new.time get NA.
data$set.number <- ave(
  as.numeric(data$new.time),
  data$id,
  FUN = function(times) match(times, sort(unique(times)))
)
# Arrange by person, time, measure ####
data <- data %>%
  arrange(id, set.number, measurement)

# Split temp variable ####
# tbody is text holding both readings: characters 1-5 are parsed as the
# Celsius value and characters 12-17 as the Fahrenheit value.
data$body.temp.c <- data$tbody %>%
  substr(1, 5) %>%
  as.numeric()
data$body.temp.f <- data$tbody %>%
  substr(12, 17) %>%
  as.numeric()
# plot(((data$body.temp.f - 32) * 5/9), data$body.temp.c) # Correct
# Correct measurement numbers ####
# There are a number of errors in the measurement numbers (repeats, omissions,
# out of order). There are also some missing observations where we expect a
# triplet of measures, but only see a pair in dataset.
# What is the total number of measurements for each person, and are they
# numbered sequentially, in sets of 3? ####
# ggplot(data, aes(x = id, y = measurement)) + geom_point(size = 1)
# Lots of variation in number of measurements, and gaps in ID 2 and 21.
# What is the total number of measurements for each person, and are they
# numbered sequentially?
# group_by(data, id) %>%
#   summarize(length.measure = length(measurement),
#             max.measure = max(measurement)) %>%
#   View() # 2, 8, 11, 12, 13, 21 don't match
#
# Findings per patient:
# View(filter(data, id == 2)) # 106 missing, but they are in sets of 3, so just
#   re-number
# View(filter(data, id == 8)) # Re-number, lots of repeats, but in order and in
#   sets of 3.
# filter(data, id == 8) %>% ggplot(aes(x = new.time, y = measurement)) +
#   geom_line()
# View(filter(data, id == 11)) # Re-number, lots of repeats, but in order and
#   in sets of 3.
# View(filter(data, id == 12)) # Missing number 21, of set 19, 20, 21 TODO
# View(filter(data, id == 13)) # Re-number, lots of repeats, but in order and
#   in sets of 3.
# View(filter(data, id == 21)) # Switch times for 66 and 67 (above); skips 37,
#   38, 39; otherwise in sets of 3
#
# Re-number the measurements of the affected patients 1..n in the current row
# order (data was arranged by id, set.number, measurement above).
# expected_rows records how many rows each patient had when the data was
# inspected. If a count differs, the manual findings above no longer apply, so
# stop rather than silently recycle or truncate the new numbers.
renumber_ids <- c(2, 8, 11, 13, 21)
expected_rows <- c(114L, 72L, 21L, 24L, 78L)
for (k in seq_along(renumber_ids)) {
  rows <- which(data$id == renumber_ids[k])
  stopifnot(length(rows) == expected_rows[k])
  data$measurement[rows] <- seq_along(rows)
}
rm(renumber_ids, expected_rows, k, rows)
# Is everything in groups of 3?
# group_by(data, id) %>%
#   summarize(length.measure = length(measurement) / 3,
#             max.measure = max(measurement)) %>%
#   View()
# 47, 12 are not multiples of 3
# View(filter(data, id == 47)) # Missing measure to go with 7, 8; but otherwise
#   numbered sequentially TODO
# View(filter(data, id == 12)) # Missing number 21, of set 19, 20, 21 TODO

# Attach each patient's number of measurements (length.measure) and highest
# measurement number (max.measure) to every one of that patient's rows.
data <- data %>%
  group_by(id) %>%
  summarise(
    length.measure = length(measurement),
    max.measure = max(measurement)
  ) %>%
  full_join(data, by = "id")
# Do measurement ids and time match linearly? | |
# ggplot(data, aes(x = new.time, y = measurement, group = id, | |
# color = factor(id))) + | |
# geom_point() + | |
# geom_line() + | |
# ylim(0, 30) + | |
# scale_color_discrete(guide = F) + | |
# facet_wrap(~id) # ids 44, 21 were off. Went back and corrected above | |
# ggplot(filter(data, id == 44), aes(x = new.time, y = measurement)) + | |
# geom_point() | |
# filter(data, id == 44) %>% select(measurement, new.time, date.m) %>% View() | |
# ggplot(filter(data, id == 21), aes(x = new.time, y = measurement)) + | |
# geom_point() | |
# filter(data, id == 21) %>% select(measurement, new.time, date.m) %>% View() | |
# Is there a difference between $ID and $Patient? ####
# plot(data$id, data$patient) # No, they are redundant

# Tidy character values ####
# View(data[, sapply(data, class) == 'character'])
# lapply(data[, sapply(data, class) == 'character'], table)
# Each recode below uses %in%, which is FALSE for NA, so missing values are
# left untouched.

# Medications
data$medications[data$medications %in% "OxygenMask"] <- "Oxygen Mask"
# Does NA for medications reflect NO? TODO

# Comments
data$comments[
  data$comments %in% c("Start Bypass", "Before Bypass")
] <- "Before bypass"
data$comments[data$comments %in% "bypass"] <- "Bypass"
data$comments[
  data$comments %in% c("Cooling Blanket", "Cooling Cloth",
                       "Cooling pad forehead")
] <- "Cooling"
data$comments[
  data$comments %in% c("Heatiing blanket", "Heating Blanket")
] <- "Heating blanket"
data$comments[data$comments %in% "Pre-Op"] <- "Pre-op"
# Can we combine other comments? TODO

# Operator.1
data$operator.1[
  data$operator.1 %in% "Adds Nasopharyngeal Reading+N576"
] <- "Birgitta Gleeson"

# Ambient.temp
# Swap decimal commas for points, convert to numeric, round to 2 places.
data$ambient.temp <- round(
  as.numeric(gsub("\\,", "\\.", data$ambient.temp)),
  2
)

# Setting
# table(data$setting) # TODO fix with Joe
# levels(factor(data$setting))
data$setting[data$setting %in% "Theater"] <- "Theatre"
data$setting[data$setting %in% "Pre-OP"] <- "Pre-op"

# Measurement id
# ggplot(data, aes(x = factor(measurement), y = measurement.id)) +
#   geom_point() +
#   facet_wrap(~id)
# Appears pointless TODO
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment