ctesta01/comment_coding.R

## comment_coding.R
library(readxl)

#' Turn a Directory into a list of Coded Comment Data Frames (unprocessed)
directory_get_coded_comment_sheets <- function(directory) {
  # ask for directory if not provided
  if (missing(directory)) directory <- choose.dir()

  # we only want to look at Excel or CSV files in the given directory
  files_list <- list.files(path=directory, full.names=TRUE)
  files_list <- files_list[lapply(files_list, function(x) grepl("*.xlsx$|*.xls$|*.csv$", x)) == TRUE]
  files_list <- files_list[lapply(files_list, function(x) grepl("^~", basename(x))) == FALSE]

  # create coded comment tables
  coded_appendix_tables <- lapply(files_list, function(x) get_coded_comment_sheet(x))
  return(coded_appendix_tables)
}

#' Turn a Single Coded File into a Data Frame
get_coded_comment_sheet <- function(codedfile){

  # Ask for the Coded File if there isn't one provided
  if (missing(codedfile)) codedfile <- file.choose()

  # Pick out the sheet called "Coded"
  # Error if there isn't one
  sheetindex <- which(tolower(excel_sheets(codedfile))=="coded")
  if (length(sheetindex) == 0) {
    cat(paste0(codedfile, " did not have a Coded tab\n"))
    return(NA)
  }

  # Load the Coded Comments as a Data Frame
  coded_orig <- read_excel(codedfile, sheet=sheetindex)

  # Strip out the Blank Rows
  blank_rows <- which(is.na(coded_orig[1]) | nchar(coded_orig[1])<5)
  if (length(blank_rows) == 0) {coded_use <- coded_orig}
  else if (length(blank_rows)>0) {coded_use <- coded_orig[-blank_rows,]}

  # Make sure the Coded Comments have a Varname column
  index_qname <- which(tolower(names(coded_use))=="varname")
  if (length(index_qname) == 0) {
    cat(paste0(codedfile, " did not have a varname column\n"))
    return(NA)
  }

  # Return the Coded Comments Data Frame (unprocessed)
  return(coded_use)
}

#' Turn the original coded comments sheet into a pair: (Question, Data Frame)
format_coded_comments <- function(coded_comment_sheet) {
  # determine which column to start with
  index_qname <- which(tolower(names(coded_comment_sheet))=="varname")

  # get the varname from the sheet
  varname = as.character(coded_comment_sheet[1,index_qname])

  # get coded comments, and the number of comments for each
  codeList <- names(coded_comment_sheet)[(index_qname+2):ncol(coded_comment_sheet)]
  numComments <- lapply (codeList, function(x) length(which(coded_comment_sheet[x]==1)))

  # construct the table
  coded_table <- as.data.frame(cbind(codeList,numComments,deparse.level=0))
  names(coded_table) <- c("Response", "N")

  # remove zeroes
  coded_table <- coded_table[coded_table['N'] != 0, ]

  # sort by reverse numerically twice
  # sorting the first time gives reverse numerically reverse alphabetically
  # sorting the second time reverses the reverse alphabetic to forward alphabetic,
  # while keeping the descending numerical sort
  coded_table <- coded_table[rev(order(unlist(coded_table[,'N']))),]
  coded_table <- coded_table[rev(order(unlist(coded_table[,'N']))),]


  # add "Total" and the total N to the list of coded comments and Ns
  n_comments <- nrow(unique(coded_comment_sheet[, 1]))
  coded_table <- rbind(coded_table,c("Total", n_comments))


  # we return a pair, the varname and the coded table.
  return(list(varname, coded_table))

}

#' Turn a List of Unprocessed Coded Comment Sheets into a List of Coded Comments Tables
format_coded_comment_sheets <- function(coded_comment_sheets) {
  coded_comments <- list()
  cc_length <- length(coded_comment_sheets)
  for (i in 1:cc_length) {
    coded_comments[[i]] <- format_coded_comments(coded_comment_sheets[[i]])
  }
  return(coded_comments)
}

#' Merge a Splitting Column into an Unprocessed Coded Comment Sheet
#'
#' Run create_merged_response_column() before this to create the splitting column in the responses.
#' Then pass the column name of the column created by create_merged_response_column() as split_column
#' @param split_column is the name of the column to merge in for splitting
merge_split_column_into_comment_sheet <- function(coded_comment_sheet, responses, split_column) {
  # Which column is the split_column
  split_index <- which(colnames(responses) == split_column)
  if (split_index==0) {
    # Error if the split_column isn't present
    stop("No column in responses with name ", split_column)
  }
  # Get the response IDs and the split_column into a 2-column data frame
  relevant_columns <- responses[, c(1, split_index)]
  colnames(relevant_columns)[[1]] <- colnames(responses)[[1]]

  coded_comment_sheet <- merge(x = coded_comment_sheet, y = relevant_columns, by = 1)
  split_index <- which(colnames(coded_comment_sheet) == split_column)
  re_ordering <- c(1, split_index, 2:(split_index-1))
  coded_comment_sheet <- coded_comment_sheet[,re_ordering]
  return(coded_comment_sheet)
}

# Format and Split a list of Unprocessed Coded Comment Sheets
format_and_split_comment_sheets <- function(coded_comment_sheets, responses, split_column) {

  # split_coded_comment_sheets will be a list of coded comment sheets for each respondent group
  levels <- levels(factor(responses[, split_column]))
  split_coded_comment_sheets <- sapply(levels, function(x) NULL)

  # merge split_column in and split each sheet
  for(i in 1:length(coded_comment_sheets)) {
    coded_comment_sheets[[i]] <- merge_split_column_into_comment_sheet(coded_comment_sheets[[i]], responses, split_column)
    coded_comment_sheets[[i]] <- split(coded_comment_sheets[[i]], coded_comment_sheets[[i]][, split_column], drop=TRUE)

    # sort each sheet into the appropriate level and insert into split_coded_comment_sheets
    for (j in 1:length(levels)) {
      matching_split_sheet <- which(sapply(coded_comment_sheets[[i]], function(x) isTRUE(levels[[j]] %in% x[, split_column]) ))
      if (length(matching_split_sheet) != 0) {
        split_coded_comment_sheets[[j]][[length(split_coded_comment_sheets[[j]]) + 1]] <- as.data.frame(coded_comment_sheets[[i]][[matching_split_sheet]])
      }
    }
  }

  # Format each coded comment sheet
  for (i in 1:length(split_coded_comment_sheets)) {
    if (!is.null(split_coded_comment_sheets[[i]])) split_coded_comment_sheets[[i]] <- format_coded_comment_sheets(split_coded_comment_sheets[[i]])
  }

  return(split_coded_comment_sheets)
}

#' Insert Coded Comments into Blocks
insert_coded_comments <- function(blocks, original_first_rows, coded_comments) {

  r_col_dictionary <- create_response_column_dictionary(blocks, original_first_rows[1,])
  questions <- questions_from_blocks(blocks)

  for (i in 1:length(coded_comments)) {
    if (!is.null(coded_comments[[i]])) {
      varname <- coded_comments[[i]][[1]]
      matched_based_on_r_col <- which(r_col_dictionary[,2] == varname)
      if (length(matched_based_on_r_col) == 1) {
        varname <- r_col_dictionary[matched_based_on_r_col, 1]
      }
      question_index <- find_question_index(questions, varname)

      if (length(question_index) == 0) {
        cat(paste0("The appendices indicated for ", varname, " could not be matched to a question\n"))
        next;
      }
      cc_index <- length(questions[[question_index]][['CodedComments']]) + 1
      questions[[question_index]][['CodedComments']][[cc_index]] <- coded_comments[[i]]
    }
  }

  blocks <- questions_into_blocks(questions, blocks)
  return(blocks)
}

#' Split Survey and Insert Split Coded Comments
#'
#' The responses should already include the split column
insert_split_survey_comments <- function(split_blocks, split_coded_comment_sheets, split_column, original_first_rows) {
  # grab the original first rows if not included
  if (missing(original_first_rows)) original_first_rows <- get(x="original_first_rows", envir=globalenv())

  # match split blocks and split coded comments
  for (i in 1:length(split_coded_comment_sheets)) {
    if (!is.null(split_coded_comment_sheets[[i]])) {
      matching_block <- which(sapply(split_blocks, function(x) x[['split_group']] == names(split_coded_comment_sheets)[[i]]))
      split_blocks[[matching_block]] <- insert_coded_comments( split_blocks[[matching_block]], original_first_rows, split_coded_comment_sheets[[i]])
    }
  }
  return(split_blocks)
}
	library(readxl)

	#' Turn a Directory into a list of Coded Comment Data Frames (unprocessed)
	directory_get_coded_comment_sheets <- function(directory) {
	# ask for directory if not provided
	if (missing(directory)) directory <- choose.dir()

	# we only want to look at Excel or CSV files in the given directory
	files_list <- list.files(path=directory, full.names=TRUE)
	files_list <- files_list[lapply(files_list, function(x) grepl(".xlsx$\|.xls$\|*.csv$", x)) == TRUE]
	files_list <- files_list[lapply(files_list, function(x) grepl("^~", basename(x))) == FALSE]

	# create coded comment tables
	coded_appendix_tables <- lapply(files_list, function(x) get_coded_comment_sheet(x))
	return(coded_appendix_tables)
	}

	#' Turn a Single Coded File into a Data Frame
	get_coded_comment_sheet <- function(codedfile){

	# Ask for the Coded File if there isn't one provided
	if (missing(codedfile)) codedfile <- file.choose()

	# Pick out the sheet called "Coded"
	# Error if there isn't one
	sheetindex <- which(tolower(excel_sheets(codedfile))=="coded")
	if (length(sheetindex) == 0) {
	cat(paste0(codedfile, " did not have a Coded tab\n"))
	return(NA)
	}

	# Load the Coded Comments as a Data Frame
	coded_orig <- read_excel(codedfile, sheet=sheetindex)

	# Strip out the Blank Rows
	blank_rows <- which(is.na(coded_orig[1]) \| nchar(coded_orig[1])<5)
	if (length(blank_rows) == 0) {coded_use <- coded_orig}
	else if (length(blank_rows)>0) {coded_use <- coded_orig[-blank_rows,]}

	# Make sure the Coded Comments have a Varname column
	index_qname <- which(tolower(names(coded_use))=="varname")
	if (length(index_qname) == 0) {
	cat(paste0(codedfile, " did not have a varname column\n"))
	return(NA)
	}

	# Return the Coded Comments Data Frame (unprocessed)
	return(coded_use)
	}

	#' Turn the original coded comments sheet into a pair: (Question, Data Frame)
	format_coded_comments <- function(coded_comment_sheet) {
	# determine which column to start with
	index_qname <- which(tolower(names(coded_comment_sheet))=="varname")

	# get the varname from the sheet
	varname = as.character(coded_comment_sheet[1,index_qname])

	# get coded comments, and the number of comments for each
	codeList <- names(coded_comment_sheet)[(index_qname+2):ncol(coded_comment_sheet)]
	numComments <- lapply (codeList, function(x) length(which(coded_comment_sheet[x]==1)))

	# construct the table
	coded_table <- as.data.frame(cbind(codeList,numComments,deparse.level=0))
	names(coded_table) <- c("Response", "N")

	# remove zeroes
	coded_table <- coded_table[coded_table['N'] != 0, ]

	# sort by reverse numerically twice
	# sorting the first time gives reverse numerically reverse alphabetically
	# sorting the second time reverses the reverse alphabetic to forward alphabetic,
	# while keeping the descending numerical sort
	coded_table <- coded_table[rev(order(unlist(coded_table[,'N']))),]
	coded_table <- coded_table[rev(order(unlist(coded_table[,'N']))),]


	# add "Total" and the total N to the list of coded comments and Ns
	n_comments <- nrow(unique(coded_comment_sheet[, 1]))
	coded_table <- rbind(coded_table,c("Total", n_comments))


	# we return a pair, the varname and the coded table.
	return(list(varname, coded_table))

	}

	#' Turn a List of Unprocessed Coded Comment Sheets into a List of Coded Comments Tables
	format_coded_comment_sheets <- function(coded_comment_sheets) {
	coded_comments <- list()
	cc_length <- length(coded_comment_sheets)
	for (i in 1:cc_length) {
	coded_comments[[i]] <- format_coded_comments(coded_comment_sheets[[i]])
	}
	return(coded_comments)
	}

	#' Merge a Splitting Column into an Unprocessed Coded Comment Sheet
	#'
	#' Run create_merged_response_column() before this to create the splitting column in the responses.
	#' Then pass the column name of the column created by create_merged_response_column() as split_column
	#' @param split_column is the name of the column to merge in for splitting
	merge_split_column_into_comment_sheet <- function(coded_comment_sheet, responses, split_column) {
	# Which column is the split_column
	split_index <- which(colnames(responses) == split_column)
	if (split_index==0) {
	# Error if the split_column isn't present
	stop("No column in responses with name ", split_column)
	}
	# Get the response IDs and the split_column into a 2-column data frame
	relevant_columns <- responses[, c(1, split_index)]
	colnames(relevant_columns)[[1]] <- colnames(responses)[[1]]

	coded_comment_sheet <- merge(x = coded_comment_sheet, y = relevant_columns, by = 1)
	split_index <- which(colnames(coded_comment_sheet) == split_column)
	re_ordering <- c(1, split_index, 2:(split_index-1))
	coded_comment_sheet <- coded_comment_sheet[,re_ordering]
	return(coded_comment_sheet)
	}

	# Format and Split a list of Unprocessed Coded Comment Sheets
	format_and_split_comment_sheets <- function(coded_comment_sheets, responses, split_column) {

	# split_coded_comment_sheets will be a list of coded comment sheets for each respondent group
	levels <- levels(factor(responses[, split_column]))
	split_coded_comment_sheets <- sapply(levels, function(x) NULL)

	# merge split_column in and split each sheet
	for(i in 1:length(coded_comment_sheets)) {
	coded_comment_sheets[[i]] <- merge_split_column_into_comment_sheet(coded_comment_sheets[[i]], responses, split_column)
	coded_comment_sheets[[i]] <- split(coded_comment_sheets[[i]], coded_comment_sheets[[i]][, split_column], drop=TRUE)

	# sort each sheet into the appropriate level and insert into split_coded_comment_sheets
	for (j in 1:length(levels)) {
	matching_split_sheet <- which(sapply(coded_comment_sheets[[i]], function(x) isTRUE(levels[[j]] %in% x[, split_column]) ))
	if (length(matching_split_sheet) != 0) {
	split_coded_comment_sheets[[j]][[length(split_coded_comment_sheets[[j]]) + 1]] <- as.data.frame(coded_comment_sheets[[i]][[matching_split_sheet]])
	}
	}
	}

	# Format each coded comment sheet
	for (i in 1:length(split_coded_comment_sheets)) {
	if (!is.null(split_coded_comment_sheets[[i]])) split_coded_comment_sheets[[i]] <- format_coded_comment_sheets(split_coded_comment_sheets[[i]])
	}

	return(split_coded_comment_sheets)
	}

	#' Insert Coded Comments into Blocks
	insert_coded_comments <- function(blocks, original_first_rows, coded_comments) {

	r_col_dictionary <- create_response_column_dictionary(blocks, original_first_rows[1,])
	questions <- questions_from_blocks(blocks)

	for (i in 1:length(coded_comments)) {
	if (!is.null(coded_comments[[i]])) {
	varname <- coded_comments[[i]][[1]]
	matched_based_on_r_col <- which(r_col_dictionary[,2] == varname)
	if (length(matched_based_on_r_col) == 1) {
	varname <- r_col_dictionary[matched_based_on_r_col, 1]
	}
	question_index <- find_question_index(questions, varname)

	if (length(question_index) == 0) {
	cat(paste0("The appendices indicated for ", varname, " could not be matched to a question\n"))
	next;
	}
	cc_index <- length(questions[[question_index]][['CodedComments']]) + 1
	questions[[question_index]][['CodedComments']][[cc_index]] <- coded_comments[[i]]
	}
	}

	blocks <- questions_into_blocks(questions, blocks)
	return(blocks)
	}

	#' Split Survey and Insert Split Coded Comments
	#'
	#' The responses should already include the split column
	insert_split_survey_comments <- function(split_blocks, split_coded_comment_sheets, split_column, original_first_rows) {
	# grab the original first rows if not included
	if (missing(original_first_rows)) original_first_rows <- get(x="original_first_rows", envir=globalenv())

	# match split blocks and split coded comments
	for (i in 1:length(split_coded_comment_sheets)) {
	if (!is.null(split_coded_comment_sheets[[i]])) {
	matching_block <- which(sapply(split_blocks, function(x) x[['split_group']] == names(split_coded_comment_sheets)[[i]]))
	split_blocks[[matching_block]] <- insert_coded_comments( split_blocks[[matching_block]], original_first_rows, split_coded_comment_sheets[[i]])
	}
	}
	return(split_blocks)
	}