dantalus/thermomters_cleaning.R

## thermomters_cleaning.R
# Useful libraries ####

  library(readxl)         # excel

  library(plyr)           # Tidy data
  library(dplyr)
  library(tidyr)

  library(ggplot2)        # Plot data
  library(RColorBrewer)
  library(ggrepel)

  library(gmodels)        # Describe data
  library(xtable)

  library("utils")        # Read scripts from github
  library(devtools)

# Keep/migrate useful functions here
  source_url("https://raw.githubusercontent.com/dantalus/Rcode/master/propMiss.R")
  source_url("https://gist.githubusercontent.com/dantalus/25237be808003673521b/raw/d251a4aab55f1cf0044484f2226213920463b6c5/plotData.R")


# Read in the data ####
# Received from Joe Eustace by email Nov 10, 2015

  # `base` keeps the untouched import; all cleaning is done on the copy `data`.
  base <- read_excel("data/Triplicate Analysis 130215 b.xlsx")
  data <- base


# Get rid of blank columns ####
# Identified by inspection

  # Keep column 3 through the second-to-last column.
  data <- data[, seq(from = 3, to = ncol(data) - 1)]


# Save variable labels ####

  # Spreadsheet headers as imported, captured before any renaming.
  varlabs <- names(data)


# Tidy variable names ####

  # Lower-case the headers, then replace each space with a dot.
  colnames(data) <- gsub(" ", ".", tolower(colnames(data)))

# Clean variable names more
# The tidied headers are replaced by a fixed set of 42 short names, assigned
# purely by column position.

  clean_names <- c(
    "id", "measurement", "date.m", "time.m",
    "setting", "medications", "comments", "operator.1",
    "ambient.temp", "core.temp", "instatemp",
    "blank.1", "blank.2",
    "nasopharyngeal", "unit.number", "notes", "downloaded",
    "operator.2", "patient", "tbody", "time.date",
    "core.temp.logged", "data.header", "measurement.id", "unit",
    "raw.1", "raw.2", "raw.3", "raw.4",
    "surface.temp.patient",
    "internal.unit.ambient.temp",
    "external.unit.ambient.temp",
    "vcc1", "vcc2",
    "body.temperature.recorded.in.unit",
    "blank.3", "sensor.id", "blank.4", "blank.5",
    "algorithm", "fw", "header"
  )
  colnames(data) <- clean_names
  rm(clean_names)


# Fix measurement IDs ####

  # Remove every "#" so the measurement number can be stored as an integer.
  data$measurement <- data$measurement %>%
    gsub("#", "", .) %>%
    as.integer()

# View(data[is.na(data$id), ]) # 1 missing id to fix

  # Rows with a missing id are assigned to patient 22.
  no_id <- is.na(data$id)
  data$id[no_id] <- 22
  rm(no_id)


# Remove rows with all missing data ####

# View(data[is.na(data$date.m), ]) # 6 obs with no data

  # Rows without a measurement date are dropped.
  has_date <- !is.na(data$date.m)
  data <- data[has_date, ]
  rm(has_date)


# Tidy time ####

# There are several errors to correct, and changes to POSIX class. These need
# to be right because measures are in triplicate, and triplets are identified
# by shared times.

  # Correction to date: every "2014" in the logged timestamp becomes "2015".
  data$time.date <- gsub("2014", "2015", data$time.date)

# By inspection, there is an apparent error in these day values
  # Patients 7, 8 and 12: "/06" is rewritten to "/08".
  for (pid in c(7, 8, 12)) {
    rows <- data$id == pid
    data$time.date[rows] <- gsub("/06", "/08", data$time.date[rows])
  }

  # Patients 10 and 11: "/08" is moved to "/09" first, so that values produced
  # by the following "/06" -> "/08" step are not shifted a second time.
  for (pid in c(10, 11)) {
    rows <- data$id == pid
    data$time.date[rows] <- gsub("/08", "/09", data$time.date[rows])
    data$time.date[rows] <- gsub("/06", "/08", data$time.date[rows])
  }
  rm(pid, rows)

  # Parse the logged timestamp. The time zone is pinned to GMT so the result
  # does not depend on the machine running the script and is directly
  # comparable with new.time, which is also parsed with tz = "GMT" below.
  data$time.date <- as.POSIXct(data$time.date,
                               format = "   %Y/%m/%d   %H:%M:%S",
                               tz = "GMT")

# By inspection, these are way off, and the new.time variable (below) fits in
# with other times.
  # Blank the logged timestamp for patient 2, measurements 28, 29 and 30. TODO
  data$time.date[data$id == 2 & data$measurement %in% 28:30] <- NA

# By inspection, two times were swapped in error - swap them to correct obs.
# The same error was not in the $time.date variable.
  is_66 <- data$id == 21 & data$measurement == 66
  is_67 <- data$id == 21 & data$measurement == 67
  time_66 <- data$time.m[is_66]

  data$time.m[is_66] <- data$time.m[is_67]
  data$time.m[is_67] <- time_66
  rm(is_66, is_67, time_66)

  # Print with two decimals so a trailing "00" is not dropped, then replace
  # "." with ":" (presumably the times are stored as H.MM numbers - confirm).
  hhmm <- gsub("\\.", "\\:", sprintf("%.2f", data$time.m))

  # Add a leading 0 to four-character values, e.g. "9:30" becomes "09:30".
  short <- nchar(hhmm) == 4
  hhmm[short] <- paste0("0", hhmm[short])

  # Append the seconds. Missing times were printed as the text "NA" above and
  # are turned back into real NAs here.
  hhmm <- paste0(hhmm, ":00")
  hhmm[hhmm == "NA:00"] <- NA

  data$time.m <- hhmm
  rm(hhmm, short)


# By inspection, there is an apparent error in these hour values
  # Patient 21, measurements 49, 50 and 51: "04:" is rewritten to "16:".
  for (m in c(49, 50, 51)) {
    rows <- data$id == 21 & data$measurement == m
    data$time.m[rows] <- gsub("04:", "16:", data$time.m[rows])
  }

  # Corrections applied to every row: year "2105" -> "2015", then
  # month "-10-" -> "-01-" (TODO).
  data$date.m <- gsub("-10-", "-01-", gsub("2105", "2015", data$date.m))

# By inspection, there is an apparent error in these month values
  # Patients 44, 45 and 46: "-01-" is rewritten to "-02-".
  for (pid in c(44, 45, 46)) {
    rows <- data$id == pid
    data$date.m[rows] <- gsub("-01-", "-02-", data$date.m[rows])
  }
  rm(m, pid, rows)


# Put it all together and change to POSIX class.
  stamp <- paste(data$date.m, data$time.m)
  data$new.time <- as.POSIXct(stamp, format = "%Y-%m-%d %H:%M:%S", tz = "GMT")
  rm(stamp)

# plot(data$new.time, data$time.date) # A few outliers for time TODO


# Measurements are in triplicate, identified by shared times. Create a new
# numeric variable reflecting each set of measurements, sequentially, within
# each patient.

  # Within each patient, number the distinct new.time values in increasing
  # order (1 = earliest). Rows sharing a time get the same set number and a
  # missing time stays NA.
  # ave() writes each group's result back to the rows it came from, so this
  # does not depend on the rows being sorted by id, nor on the ids being the
  # integers 1, 2, 3, ... (the earlier loop over row numbers needed both).
  data$set.number <- ave(
    as.numeric(data$new.time),
    data$id,
    FUN = function(times) as.numeric(factor(times))
  )


# Arrange by person, time, measure ####

  data <- data %>%
    arrange(id, set.number, measurement)


# Split temp variable ####

  # tbody carries both readings in one string: characters 1-5 are taken as the
  # Celsius value and characters 12-17 as the Fahrenheit value.
  data$body.temp.c <- data$tbody %>% substr(1, 5) %>% as.numeric()
  data$body.temp.f <- data$tbody %>% substr(12, 17) %>% as.numeric()

# plot(((data$body.temp.f - 32) * 5/9), data$body.temp.c) # Correct


# Correct measurement numbers ####
# There are a number of errors in the measurement numbers (repeats, omissions,
# out of order). There are also some missing observations where we expect a
# triplet of measures, but only see a pair in the dataset.

# What is the total number of measurements for each person, and are they
# numbered sequentially, in sets of 3? ####

# ggplot(data, aes(x = id, y = measurement)) + geom_point(size = 1)
# Lots of variation in number of measurements, and gaps in ID 2 and 21.

# What is the total number of measurements for each person, and are they
# numbered sequentially?

# group_by(data, id) %>%
# summarize(length.measure = length(measurement),
#           max.measure = max(measurement)) %>%
# View() # 2, 8, 11, 12, 13, 21 don't match

# Patients whose measurement numbers are replaced by a plain 1..n sequence.
# Findings from View(filter(data, id == <id>)):
#   id 2  - 106 missing, but they are in sets of 3, so just re-number
#   id 8  - lots of repeats, but in order and in sets of 3
#           (also checked with: filter(data, id == 8) %>%
#              ggplot(aes(x = new.time, y = measurement)) + geom_line())
#   id 11 - lots of repeats, but in order and in sets of 3
#   id 13 - lots of repeats, but in order and in sets of 3
#   id 21 - switch times for 66 and 67 (above); skips 37, 38, 39; otherwise
#           in sets of 3
# id 12 is not re-numbered: missing number 21, of set 19, 20, 21 TODO

  # Names are patient ids; each value is the length of the replacement
  # sequence, which has to equal the number of rows that patient has.
  n_rows <- c("2" = 114, "8" = 72, "11" = 21, "13" = 24, "21" = 78)
  for (pid in names(n_rows)) {
    data$measurement[data$id == as.numeric(pid)] <- seq_len(n_rows[[pid]])
  }
  rm(n_rows, pid)

# Is everything in groups of 3?

# group_by(data, id) %>%
# summarize(length.measure = length(measurement) / 3,
#           max.measure = max(measurement)) %>%
# View()

# 47, 12 are not multiples of 3

# View(filter(data, id == 47)) # Missing measure to go with 7, 8; but otherwise
                               # numbered sequentially TODO

# View(filter(data, id == 12)) # Missing number 21, of set 19, 20, 21 TODO


  # Per-patient summary (number of rows and highest measurement number),
  # attached to every row of that patient by joining back on id. The two
  # summary columns come right after id, ahead of the original columns.
  per_patient <- data %>%
    group_by(id) %>%
    summarize(length.measure = length(measurement),
              max.measure = max(measurement))
  data <- full_join(per_patient, data, by = c("id"))
  rm(per_patient)


# Do measurement ids and time match linearly?

# ggplot(data, aes(x = new.time, y = measurement, group = id,
#                  color = factor(id))) +
#   geom_point() +
#   geom_line() +
#   ylim(0, 30) +
#   scale_color_discrete(guide = F) +
#   facet_wrap(~id)  # ids 44, 21 were off. Went back and corrected above

# ggplot(filter(data, id == 44), aes(x = new.time, y = measurement)) +
#   geom_point()

# filter(data, id == 44) %>% select(measurement, new.time, date.m) %>% View()

# ggplot(filter(data, id == 21), aes(x = new.time, y = measurement)) +
#   geom_point()

# filter(data, id == 21) %>% select(measurement, new.time, date.m) %>% View()


# Is there a difference between $ID and $Patient? ####

# plot(data$id, data$patient) # No, they are redundant


# Tidy character values ####

# View(data[, sapply(data, class) == 'character'])

# lapply(data[, sapply(data, class) == 'character'], table)

# Medications

  data$medications[!is.na(data$medications) &
                   data$medications == "OxygenMask"] <- "Oxygen Mask"

# Does NA for medications reflect NO? TODO

# Comments

  # Old spelling -> harmonised label, applied in the order listed. No new
  # label is itself one of the old spellings, so the recodes do not interact.
  comment_recode <- c(
    "Start Bypass"         = "Before bypass",
    "Before Bypass"        = "Before bypass",
    "bypass"               = "Bypass",
    "Cooling Blanket"      = "Cooling",
    "Cooling Cloth"        = "Cooling",
    "Cooling pad forehead" = "Cooling",
    "Heatiing blanket"     = "Heating blanket",
    "Heating Blanket"      = "Heating blanket",
    "Pre-Op"               = "Pre-op"
  )
  for (old in names(comment_recode)) {
    rows <- !is.na(data$comments) & data$comments == old
    data$comments[rows] <- comment_recode[[old]]
  }
  rm(comment_recode, old, rows)

# Can we combine other comments? TODO

# Operator.1

  # One operator cell holds a stray note instead of a name; replace it.
  data$operator.1[!is.na(data$operator.1) &
                  data$operator.1 == "Adds Nasopharyngeal Reading+N576"] <-
    "Birgitta Gleeson"

# Ambient.temp

  # Decimal commas become decimal points before converting to a number, which
  # is then rounded to 2 decimal places.
  data$ambient.temp <- round(
    as.numeric(gsub("\\,", "\\.", data$ambient.temp)),
    2
  )


# Setting

# table(data$setting) # TODO fix with Joe
# levels(factor(data$setting))

  data$setting[!is.na(data$setting) & data$setting == "Theater"] <- "Theatre"
  data$setting[!is.na(data$setting) & data$setting == "Pre-OP"] <- "Pre-op"


# Measurement id

# ggplot(data, aes(x = factor(measurement), y = measurement.id)) +
#   geom_point() +
#   facet_wrap(~id)

# Appears pointless TODO
	# Useful libraries ####

	library(readxl) # excel

	library(plyr) # Tidy data
	library(dplyr)
	library(tidyr)

	library(ggplot2) # Plot data
	library(RColorBrewer)
	library(ggrepel)

	library(gmodels) # Describe data
	library(xtable)

	library("utils") # Read scripts from github
	library(devtools)

	# Keep/migrate useful functions here
	source_url("https://raw.githubusercontent.com/dantalus/Rcode/master/propMiss.R")
	source_url("https://gist.githubusercontent.com/dantalus/25237be808003673521b/raw/d251a4aab55f1cf0044484f2226213920463b6c5/plotData.R")


	# Read in the data ####
	# Received from Joe Eustace by email Nov 10, 2015

	# `base` keeps the untouched import; all cleaning is done on the copy `data`.
	base <- read_excel("data/Triplicate Analysis 130215 b.xlsx")
	data <- base



	# Get rid of blank columns ####
	# Identified by inspection

	# Keep column 3 through the second-to-last column.
	data <- data[, seq(from = 3, to = ncol(data) - 1)]



	# Save variable labels ####

	# Spreadsheet headers as imported, captured before any renaming.
	varlabs <- names(data)



	# Tidy variable names ####

	# Lower-case the headers, then replace each space with a dot.
	colnames(data) <- gsub(" ", ".", tolower(colnames(data)))

	# Clean variable names more
	# The tidied headers are replaced by a fixed set of 42 short names, assigned
	# purely by column position.

	clean_names <- c(
	  "id", "measurement", "date.m", "time.m",
	  "setting", "medications", "comments", "operator.1",
	  "ambient.temp", "core.temp", "instatemp",
	  "blank.1", "blank.2",
	  "nasopharyngeal", "unit.number", "notes", "downloaded",
	  "operator.2", "patient", "tbody", "time.date",
	  "core.temp.logged", "data.header", "measurement.id", "unit",
	  "raw.1", "raw.2", "raw.3", "raw.4",
	  "surface.temp.patient",
	  "internal.unit.ambient.temp",
	  "external.unit.ambient.temp",
	  "vcc1", "vcc2",
	  "body.temperature.recorded.in.unit",
	  "blank.3", "sensor.id", "blank.4", "blank.5",
	  "algorithm", "fw", "header"
	)
	colnames(data) <- clean_names
	rm(clean_names)



	# Fix measurement IDs ####

	# Remove every "#" so the measurement number can be stored as an integer.
	data$measurement <- data$measurement %>%
	  gsub("#", "", .) %>%
	  as.integer()

	# View(data[is.na(data$id), ]) # 1 missing id to fix

	# Rows with a missing id are assigned to patient 22.
	no_id <- is.na(data$id)
	data$id[no_id] <- 22
	rm(no_id)



	# Remove rows with all missing data ####

	# View(data[is.na(data$date.m), ]) # 6 obs with no data

	# Rows without a measurement date are dropped.
	has_date <- !is.na(data$date.m)
	data <- data[has_date, ]
	rm(has_date)




	# Tidy time ####

	# There are several errors to correct, and changes to POSIX class. These need
	# to be right because measures are in triplicate, and triplets are identified
	# by shared times.

	# Correction to date: every "2014" in the logged timestamp becomes "2015".
	data$time.date <- gsub("2014", "2015", data$time.date)

	# By inspection, there is an apparent error in these day values
	# Patients 7, 8 and 12: "/06" is rewritten to "/08".
	for (pid in c(7, 8, 12)) {
	  rows <- data$id == pid
	  data$time.date[rows] <- gsub("/06", "/08", data$time.date[rows])
	}

	# Patients 10 and 11: "/08" is moved to "/09" first, so that values produced
	# by the following "/06" -> "/08" step are not shifted a second time.
	for (pid in c(10, 11)) {
	  rows <- data$id == pid
	  data$time.date[rows] <- gsub("/08", "/09", data$time.date[rows])
	  data$time.date[rows] <- gsub("/06", "/08", data$time.date[rows])
	}
	rm(pid, rows)

	# Parse the logged timestamp. The time zone is pinned to GMT so the result
	# does not depend on the machine running the script and is directly
	# comparable with new.time, which is also parsed with tz = "GMT" below.
	data$time.date <- as.POSIXct(data$time.date,
	  format = " %Y/%m/%d %H:%M:%S",
	  tz = "GMT")

	# By inspection, these are way off, and the new.time variable (below) fits in
	# with other times.
	# Blank the logged timestamp for patient 2, measurements 28, 29 and 30. TODO
	data$time.date[data$id == 2 & data$measurement %in% 28:30] <- NA

	# By inspection, two times were swapped in error - swap them to correct obs.
	# The same error was not in the $time.date variable.
	is_66 <- data$id == 21 & data$measurement == 66
	is_67 <- data$id == 21 & data$measurement == 67
	time_66 <- data$time.m[is_66]

	data$time.m[is_66] <- data$time.m[is_67]
	data$time.m[is_67] <- time_66
	rm(is_66, is_67, time_66)

	# Print with two decimals so a trailing "00" is not dropped, then replace
	# "." with ":" (presumably the times are stored as H.MM numbers - confirm).
	hhmm <- gsub("\\.", "\\:", sprintf("%.2f", data$time.m))

	# Add a leading 0 to four-character values, e.g. "9:30" becomes "09:30".
	short <- nchar(hhmm) == 4
	hhmm[short] <- paste0("0", hhmm[short])

	# Append the seconds. Missing times were printed as the text "NA" above and
	# are turned back into real NAs here.
	hhmm <- paste0(hhmm, ":00")
	hhmm[hhmm == "NA:00"] <- NA

	data$time.m <- hhmm
	rm(hhmm, short)


	# By inspection, there is an apparent error in these hour values
	# Patient 21, measurements 49, 50 and 51: "04:" is rewritten to "16:".
	for (m in c(49, 50, 51)) {
	  rows <- data$id == 21 & data$measurement == m
	  data$time.m[rows] <- gsub("04:", "16:", data$time.m[rows])
	}

	# Corrections applied to every row: year "2105" -> "2015", then
	# month "-10-" -> "-01-" (TODO).
	data$date.m <- gsub("-10-", "-01-", gsub("2105", "2015", data$date.m))

	# By inspection, there is an apparent error in these month values
	# Patients 44, 45 and 46: "-01-" is rewritten to "-02-".
	for (pid in c(44, 45, 46)) {
	  rows <- data$id == pid
	  data$date.m[rows] <- gsub("-01-", "-02-", data$date.m[rows])
	}
	rm(m, pid, rows)



	# Put it all together and change to POSIX class.
	stamp <- paste(data$date.m, data$time.m)
	data$new.time <- as.POSIXct(stamp, format = "%Y-%m-%d %H:%M:%S", tz = "GMT")
	rm(stamp)

	# plot(data$new.time, data$time.date) # A few outliers for time TODO



	# Measurements are in triplicate, identified by shared times. Create a new
	# numeric variable reflecting each set of measurements, sequentially, within
	# each patient.

	# Within each patient, number the distinct new.time values in increasing
	# order (1 = earliest). Rows sharing a time get the same set number and a
	# missing time stays NA.
	# ave() writes each group's result back to the rows it came from, so this
	# does not depend on the rows being sorted by id, nor on the ids being the
	# integers 1, 2, 3, ... (the earlier loop over row numbers needed both).
	data$set.number <- ave(
	  as.numeric(data$new.time),
	  data$id,
	  FUN = function(times) as.numeric(factor(times))
	)



	# Arrange by person, time, measure ####

	data <- data %>%
	  arrange(id, set.number, measurement)



	# Split temp variable ####

	# tbody carries both readings in one string: characters 1-5 are taken as the
	# Celsius value and characters 12-17 as the Fahrenheit value.
	data$body.temp.c <- data$tbody %>% substr(1, 5) %>% as.numeric()
	data$body.temp.f <- data$tbody %>% substr(12, 17) %>% as.numeric()

	# plot(((data$body.temp.f - 32) * 5/9), data$body.temp.c) # Correct


	# Correct measurement numbers ####
	# There are a number of errors in the measurement numbers (repeats, omissions,
	# out of order). There are also some missing observations where we expect a
	# triplet of measures, but only see a pair in the dataset.

	# What is the total number of measurements for each person, and are they
	# numbered sequentially, in sets of 3? ####

	# ggplot(data, aes(x = id, y = measurement)) + geom_point(size = 1)
	# Lots of variation in number of measurements, and gaps in ID 2 and 21.

	# What is the total number of measurements for each person, and are they
	# numbered sequentially?

	# group_by(data, id) %>%
	# summarize(length.measure = length(measurement),
	# max.measure = max(measurement)) %>%
	# View() # 2, 8, 11, 12, 13, 21 don't match

	# Patients whose measurement numbers are replaced by a plain 1..n sequence.
	# Findings from View(filter(data, id == <id>)):
	#   id 2  - 106 missing, but they are in sets of 3, so just re-number
	#   id 8  - lots of repeats, but in order and in sets of 3
	#           (also checked with: filter(data, id == 8) %>%
	#              ggplot(aes(x = new.time, y = measurement)) + geom_line())
	#   id 11 - lots of repeats, but in order and in sets of 3
	#   id 13 - lots of repeats, but in order and in sets of 3
	#   id 21 - switch times for 66 and 67 (above); skips 37, 38, 39; otherwise
	#           in sets of 3
	# id 12 is not re-numbered: missing number 21, of set 19, 20, 21 TODO

	# Names are patient ids; each value is the length of the replacement
	# sequence, which has to equal the number of rows that patient has.
	n_rows <- c("2" = 114, "8" = 72, "11" = 21, "13" = 24, "21" = 78)
	for (pid in names(n_rows)) {
	  data$measurement[data$id == as.numeric(pid)] <- seq_len(n_rows[[pid]])
	}
	rm(n_rows, pid)

	# Is everything in groups of 3?

	# group_by(data, id) %>%
	# summarize(length.measure = length(measurement) / 3,
	# max.measure = max(measurement)) %>%
	# View()

	# 47, 12 are not multiples of 3

	# View(filter(data, id == 47)) # Missing measure to go with 7, 8; but otherwise
	# numbered sequentially TODO

	# View(filter(data, id == 12)) # Missing number 21, of set 19, 20, 21 TODO


	# Per-patient summary (number of rows and highest measurement number),
	# attached to every row of that patient by joining back on id. The two
	# summary columns come right after id, ahead of the original columns.
	per_patient <- data %>%
	  group_by(id) %>%
	  summarize(length.measure = length(measurement),
	            max.measure = max(measurement))
	data <- full_join(per_patient, data, by = c("id"))
	rm(per_patient)


	# Do measurement ids and time match linearly?

	# ggplot(data, aes(x = new.time, y = measurement, group = id,
	# color = factor(id))) +
	# geom_point() +
	# geom_line() +
	# ylim(0, 30) +
	# scale_color_discrete(guide = F) +
	# facet_wrap(~id) # ids 44, 21 were off. Went back and corrected above

	# ggplot(filter(data, id == 44), aes(x = new.time, y = measurement)) +
	# geom_point()

	# filter(data, id == 44) %>% select(measurement, new.time, date.m) %>% View()

	# ggplot(filter(data, id == 21), aes(x = new.time, y = measurement)) +
	# geom_point()

	# filter(data, id == 21) %>% select(measurement, new.time, date.m) %>% View()



	# Is there a difference between $ID and $Patient? ####

	# plot(data$id, data$patient) # No, they are redundant




	# Tidy character values ####

	# View(data[, sapply(data, class) == 'character'])

	# lapply(data[, sapply(data, class) == 'character'], table)

	# Medications

	data$medications[!is.na(data$medications) &
	  data$medications == "OxygenMask"] <- "Oxygen Mask"

	# Does NA for medications reflect NO? TODO

	# Comments

	# Old spelling -> harmonised label, applied in the order listed. No new
	# label is itself one of the old spellings, so the recodes do not interact.
	comment_recode <- c(
	  "Start Bypass"         = "Before bypass",
	  "Before Bypass"        = "Before bypass",
	  "bypass"               = "Bypass",
	  "Cooling Blanket"      = "Cooling",
	  "Cooling Cloth"        = "Cooling",
	  "Cooling pad forehead" = "Cooling",
	  "Heatiing blanket"     = "Heating blanket",
	  "Heating Blanket"      = "Heating blanket",
	  "Pre-Op"               = "Pre-op"
	)
	for (old in names(comment_recode)) {
	  rows <- !is.na(data$comments) & data$comments == old
	  data$comments[rows] <- comment_recode[[old]]
	}
	rm(comment_recode, old, rows)

	# Can we combine other comments? TODO

	# Operator.1

	# One operator cell holds a stray note instead of a name; replace it.
	data$operator.1[!is.na(data$operator.1) &
	  data$operator.1 == "Adds Nasopharyngeal Reading+N576"] <-
	  "Birgitta Gleeson"

	# Ambient.temp

	# Decimal commas become decimal points before converting to a number, which
	# is then rounded to 2 decimal places.
	data$ambient.temp <- round(
	  as.numeric(gsub("\\,", "\\.", data$ambient.temp)),
	  2
	)


	# Setting

	# table(data$setting) # TODO fix with Joe
	# levels(factor(data$setting))

	data$setting[!is.na(data$setting) & data$setting == "Theater"] <- "Theatre"
	data$setting[!is.na(data$setting) & data$setting == "Pre-OP"] <- "Pre-op"


	# Measurement id

	# ggplot(data, aes(x = factor(measurement), y = measurement.id)) +
	# geom_point() +
	# facet_wrap(~id)

	# Appears pointless TODO