Skip to content

Instantly share code, notes, and snippets.

@dantalus
Created January 22, 2016 12:09
Show Gist options
  • Save dantalus/3db6756ebce1e9c0e260 to your computer and use it in GitHub Desktop.
Save dantalus/3db6756ebce1e9c0e260 to your computer and use it in GitHub Desktop.
# Useful libraries ####
library(readxl) # excel
library(plyr) # Tidy data
library(dplyr)
library(tidyr)
library(ggplot2) # Plot data
library(RColorBrewer)
library(ggrepel)
library(gmodels) # Describe data
library(xtable)
library("utils") # Read scripts from github
library(devtools)
# Keep/migrate useful functions here
# These two calls download R scripts and execute them in this session.
# NOTE(review): the first URL points at the `master` branch, so the code that
# runs can change between sessions; the second URL appears to be pinned to a
# specific gist revision. devtools::source_url() accepts a `sha1` argument that
# can be used to verify the downloaded file - consider supplying it.
# NOTE(review): presumably these define propMiss() and plotData(); neither is
# called in the visible part of this script - confirm they are still needed.
source_url("https://raw.githubusercontent.com/dantalus/Rcode/master/propMiss.R")
source_url("https://gist.githubusercontent.com/dantalus/25237be808003673521b/raw/d251a4aab55f1cf0044484f2226213920463b6c5/plotData.R")
# Read in the data ####
# Received from Joe Eustace by email Nov 10, 2015.
# `base` keeps the untouched import; all cleaning is done on `data`.
base <- read_excel("data/Triplicate Analysis 130215 b.xlsx")
# Get rid of blank columns ####
# Identified by inspection: keep column 3 through the second-to-last column.
data <- base[, 3:(ncol(base) - 1)]
# Save variable labels ####
# Keep the original spreadsheet headers so they stay available as labels.
varlabs <- colnames(data)
# Tidy variable names ####
# Every column is renamed explicitly, by position. An earlier tolower()/gsub()
# pass over the names was overwritten immediately by this assignment, so it
# has been dropped as dead code.
clean.names <- c(
  "id", "measurement", "date.m", "time.m", "setting", "medications",
  "comments", "operator.1", "ambient.temp", "core.temp", "instatemp",
  "blank.1", "blank.2", "nasopharyngeal", "unit.number", "notes",
  "downloaded", "operator.2", "patient", "tbody", "time.date",
  "core.temp.logged", "data.header", "measurement.id", "unit",
  "raw.1", "raw.2", "raw.3", "raw.4",
  "surface.temp.patient", "internal.unit.ambient.temp",
  "external.unit.ambient.temp", "vcc1", "vcc2",
  "body.temperature.recorded.in.unit", "blank.3", "sensor.id",
  "blank.4", "blank.5", "algorithm", "fw", "header"
)
# The names are positional, so a different column count would silently
# mislabel the data; fail loudly instead.
stopifnot(length(clean.names) == ncol(data))
colnames(data) <- clean.names
rm(clean.names)
# Fix measurement IDs ####
# Strip any "#" characters from the measurement label and store as integer.
data$measurement <- data$measurement %>%
  gsub("#", "", .) %>%
  as.integer()
# View(data[is.na(data$id), ]) # 1 missing id to fix
# Rows with a missing id are assigned to id 22.
data$id <- replace(data$id, is.na(data$id), 22)
# Remove rows with all missing data ####
# View(data[is.na(data$date.m), ]) # 6 obs with no data
# A missing date.m is used as the marker for an empty row.
data <- data[which(!is.na(data$date.m)), ]
# Tidy time ####
# There are several errors to correct before converting to POSIX class. These
# need to be right because measures are in triplicate, and triplets are
# identified by shared times.
data$time.date <- gsub("2014", "2015", data$time.date) # Correction to date
# By inspection, there is an apparent error in these day values.
# For ids 10 and 11 the "/08" -> "/09" correction has to run before the
# "/06" -> "/08" one, otherwise the freshly corrected "/08" values would be
# shifted a second time.
fix.08 <- data$id %in% c(10, 11)
data$time.date[fix.08] <- gsub("/08", "/09", data$time.date[fix.08])
fix.06 <- data$id %in% c(7, 8, 10, 11, 12)
data$time.date[fix.06] <- gsub("/06", "/08", data$time.date[fix.06])
rm(fix.08, fix.06)
# The format string starts with a space because the raw values do.
# NOTE(review): no tz is passed, so the session time zone is used, whereas
# new.time (below) is built with tz = "GMT" - confirm this is intended.
data$time.date <- data$time.date %>%
  as.POSIXct(format = " %Y/%m/%d %H:%M:%S")
# By inspection, these are way off, and the new.time variable (below) fits in
# with other times, so blank the logged time for id 2, measurements 28-30.
data$time.date[data$id == 2 & data$measurement %in% 28:30] <- NA # TODO
# By inspection, two times were swapped in error - swap them to correct obs.
# The same error was not in the $time.date variable.
is.66 <- data$id == 21 & data$measurement == 66
is.67 <- data$id == 21 & data$measurement == 67
# Work on a copy so both reads see the original, un-swapped values.
swapped <- data$time.m
swapped[is.66] <- data$time.m[is.67]
swapped[is.67] <- data$time.m[is.66]
data$time.m <- swapped
rm(is.66, is.67, swapped)
# time.m arrives as a number (hours.minutes); turn it into "HH:MM:00" text.
# Two fixed decimals keep any trailing "00" minutes from being dropped.
hhmm <- sprintf("%.2f", data$time.m)
hhmm <- gsub("\\.", "\\:", hhmm) # Replace "." with ":"
# Four characters means a single-digit hour: add the leading 0.
short <- nchar(hhmm) == 4
hhmm[short] <- paste0("0", hhmm[short])
hhmm <- paste0(hhmm, ":00")
# Missing times were formatted as the text "NA"; turn them back into real NA.
hhmm[hhmm == "NA:00"] <- NA # Correction
data$time.m <- hhmm
rm(hhmm, short)
# By inspection, there is an apparent error in these hour values: for id 21,
# measurements 49-51, an hour of 04 is corrected to 16.
# The pattern is anchored and sub() replaces once, so only the leading hour
# field is rewritten. An unanchored gsub("04:", "16:") would also rewrite a
# minutes field of "04" (e.g. "04:04:00" would become "16:16:00").
# which() drops any NA comparisons so the subscripted assignment cannot fail.
pm.rows <- which(data$id == 21 & data$measurement %in% 49:51)
data$time.m[pm.rows] <- sub("^04:", "16:", data$time.m[pm.rows])
rm(pm.rows)
# Corrections applied to every row: year typo "2105", and month "-10-"
# rewritten to "-01-".
data$date.m <- data$date.m %>%
  gsub("2105", "2015", .) %>% # Correction
  gsub("-10-", "-01-", .) # Correction TODO
# By inspection, there is an apparent error in these month values
feb.rows <- data$id %in% c(44, 45, 46)
data$date.m[feb.rows] <- gsub("-01-", "-02-", data$date.m[feb.rows])
rm(feb.rows)
# Put it all together and change to POSIX class.
# The corrected date and time strings are pasted together (space separated)
# and parsed as GMT into a new column.
stamp <- paste(data$date.m, data$time.m)
data$new.time <- as.POSIXct(stamp, format = "%Y-%m-%d %H:%M:%S", tz = "GMT")
rm(stamp)
# plot(data$new.time, data$time.date) # A few outliers for time TODO
# Measurements are in triplicate, identified by shared times. Create a new
# numeric variable reflecting each set of measurements, sequentially, within
# each patient.
# Within each id, factor() orders the distinct new.time values, so its integer
# codes number the sets 1, 2, 3, ... in time order (NA times stay NA).
# ave() runs the function once per id and writes each result back to the rows
# it came from. The result is therefore correct whatever the row order is,
# and does not depend on ids being sorted or lying in 1..nrow(data).
data$set.number <- ave(
  seq_along(data$id), data$id,
  FUN = function(rows) as.numeric(factor(data$new.time[rows]))
)
# Arrange by person, time, measure ####
data <- data %>% arrange(id, set.number, measurement)
# Split temp variable ####
# tbody holds two readings in one string at fixed positions: characters 1-5
# are stored as the Celsius value and characters 12-17 as the Fahrenheit value.
data$body.temp.c <- data$tbody %>%
  substr(1, 5) %>%
  as.numeric()
data$body.temp.f <- data$tbody %>%
  substr(12, 17) %>%
  as.numeric()
# plot(((data$body.temp.f - 32) * 5/9), data$body.temp.c) # Correct
# Correct measurement numbers ####
# There are a number of errors in the measurement numbers (repeats, omissions,
# out of order). There are also some missing observations where we expect a
# triplet of measures, but only see a pair in dataset.
# What is the total number of measurements for each person, and are they
# numbered sequentially, in sets of 3? ####
# ggplot(data, aes(x = id, y = measurement)) + geom_point(size = 1)
# Lots of variation in number of measurements, and gaps in ID 2 and 21.
# group_by(data, id) %>%
# summarize(length.measure = length(measurement),
# max.measure = max(measurement)) %>%
# View() # 2, 8, 11, 12, 13, 21 don't match
#
# Findings per id, each checked with View(filter(data, id == <id>)):
#   2  - 106 missing, but they are in sets of 3, so just re-number
#   8  - Re-number, lots of repeats, but in order and in sets of 3.
#        filter(data, id == 8) %>%
#          ggplot(aes(x = new.time, y = measurement)) + geom_line()
#   11 - Re-number, lots of repeats, but in order and in sets of 3.
#   12 - Missing number 21, of set 19, 20, 21 TODO
#   13 - Re-number, lots of repeats, but in order and in sets of 3.
#   21 - Switch times for 66 and 67 (above); skips 37, 38, 39; otherwise in
#        sets of 3
#
# Re-number the affected ids 1..n in their current row order (rows were
# arranged by id, set.number, measurement above). The counts are the number of
# rows expected for each id.
renumber.ids <- c(2, 8, 11, 13, 21)
renumber.n <- c(114L, 72L, 21L, 24L, 78L)
for (k in seq_along(renumber.ids)) {
  data$measurement[data$id == renumber.ids[k]] <- seq_len(renumber.n[k])
}
rm(renumber.ids, renumber.n, k)
# Is everything in groups of 3?
# group_by(data, id) %>%
# summarize(length.measure = length(measurement) / 3,
# max.measure = max(measurement)) %>%
# View()
# 47, 12 are not multiples of 3
# View(filter(data, id == 47)) # Missing measure to go with 7, 8; but otherwise
# numbered sequentially TODO
# View(filter(data, id == 12)) # Missing number 21, of set 19, 20, 21 TODO
# Attach per-id summaries to every row: the number of measurements
# (length.measure) and the highest measurement number (max.measure).
# The summary is the left-hand table of the join, so id and the two new
# columns come first in the result.
per.id <- data %>%
  group_by(id) %>%
  summarize(length.measure = length(measurement),
            max.measure = max(measurement))
data <- full_join(per.id, data, by = c("id"))
rm(per.id)
# Do measurement ids and time match linearly?
# ggplot(data, aes(x = new.time, y = measurement, group = id,
# color = factor(id))) +
# geom_point() +
# geom_line() +
# ylim(0, 30) +
# scale_color_discrete(guide = F) +
# facet_wrap(~id) # ids 44, 21 were off. Went back and corrected above
# ggplot(filter(data, id == 44), aes(x = new.time, y = measurement)) +
# geom_point()
# filter(data, id == 44) %>% select(measurement, new.time, date.m) %>% View()
# ggplot(filter(data, id == 21), aes(x = new.time, y = measurement)) +
# geom_point()
# filter(data, id == 21) %>% select(measurement, new.time, date.m) %>% View()
# Is there a difference between $ID and $Patient? ####
# plot(data$id, data$patient) # No, they are redundant
# Tidy character values ####
# View(data[, sapply(data, class) == 'character'])
# lapply(data[, sapply(data, class) == 'character'], table)
# Medications
# %in% is FALSE for NA, so missing values are left untouched.
data$medications[data$medications %in% "OxygenMask"] <- "Oxygen Mask"
# Does NA for medications reflect NO? TODO
# Comments
# Collapse spelling/case variants onto one label each. Names are the raw
# values, elements are their replacements; applied one at a time, in order.
# Matching uses %in%, which is FALSE for NA, so missing comments stay NA.
comment.fixes <- c(
  "Start Bypass" = "Before bypass",
  "Before Bypass" = "Before bypass",
  "bypass" = "Bypass",
  "Cooling Blanket" = "Cooling",
  "Cooling Cloth" = "Cooling",
  "Cooling pad forehead" = "Cooling",
  "Heatiing blanket" = "Heating blanket",
  "Heating Blanket" = "Heating blanket",
  "Pre-Op" = "Pre-op"
)
for (old in names(comment.fixes)) {
  data$comments[data$comments %in% old] <- comment.fixes[[old]]
}
rm(comment.fixes, old)
# Can we combine other comments? TODO
# Operator.1
# This one non-name value in operator.1 is replaced by the operator's name;
# NA entries are not matched by %in% and stay NA.
odd.value <- "Adds Nasopharyngeal Reading+N576"
data$operator.1[data$operator.1 %in% odd.value] <- "Birgitta Gleeson"
rm(odd.value)
# Ambient.temp
# Swap decimal commas for points, convert to numeric, round to 2 places.
data$ambient.temp <- round(
  as.numeric(gsub("\\,", "\\.", data$ambient.temp)),
  2
)
# Setting
# table(data$setting) # TODO fix with Joe
# levels(factor(data$setting))
# Harmonise spelling/case; %in% skips NA so missing settings stay NA.
data$setting[data$setting %in% "Theater"] <- "Theatre"
data$setting[data$setting %in% "Pre-OP"] <- "Pre-op"
# Measurement id
# ggplot(data, aes(x = factor(measurement), y = measurement.id)) +
# geom_point() +
# facet_wrap(~id)
# Appears pointless TODO
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment