mathzero/drop_na_search.R

## drop_na_search.R
# Main function to find the 'optimal' combination of rows and columns to drop in order to
# maximise the number of remaining data points in a data set

# Search space rapidly becomes enormous as data size grows, so this random search solution will
# only ever be a rough approximation of the optimal data set.

optimiseDataset <- function(data, max_iterations = 10000) {

  best_solution <- data
  dims=dim(data[complete.cases(data),])
  orig_dimension <- dims[1]*dims[2]
  best_dimension <- orig_dimension

  # loop
  progress <- txtProgressBar(min = 0, max = max_iterations, style = 3)
  for (i in 1:max_iterations) {
    setTxtProgressBar(progress, i)
    set.seed(i)
    proportions <- missing_data_proportions(data)
    dropped_data <- random_drop(data, proportions$row_missings, proportions$col_missings)


    dims=dim(as.matrix(dropped_data[complete.cases(dropped_data),]))
    complete_case_dimension <- dims[1]*dims[2]


    if (complete_case_dimension > best_dimension) {
      best_solution <- dropped_data
      best_dimension <- complete_case_dimension
    }
  }

  print(paste0("Original data has ",orig_dimension," complete cases"))
  print(paste0("Optimal solution has ",best_dimension," complete cases"))

  # get complete cases version of data set
  complete_cases=best_solution[complete.cases(best_solution),]

  # get plot
  h1=plotKeptDropped(data = data,complete_cases = complete_cases)

  # return results
  return(list(best_dimension=best_dimension,
              best_solution=best_solution,
              complete_cases=complete_cases,
              heatmap=h1))
}


# function to plot the kept/dropped columns using ComplexHeatmap
# Takes iunputs from the optimiseDataset function above
plotKeptDropped <- function(data,complete_cases){

  # create boolean matrix of NAs
  dat_na=is.na(data)

  # reorder data by NA counts in rows and columns
  dat <- data[order(rowSums(dat_na)),order(colSums(dat_na))]

  # get boolean inclusion vectors for rows and columns
  rowvect=rownames(data)%in%rownames(complete_cases)
  colvect=colnames(data)%in%colnames(complete_cases)

  # create new create boolean matrix of NAs in reordered data set
  dat_na_new=is.na(dat)

  # function to colour plot
  col_fun = circlize::colorRamp2(c(0,1), c( "grey40","white"))

  # create plot
  h1 <- ComplexHeatmap::Heatmap(matrix = dat_na_new,
                                col = col_fun,show_heatmap_legend = F,
                                row_split = (ifelse(rowvect,"Row included","Row removed")),
                                column_split = (ifelse(colvect,"Column included","Column removed")),
                                cluster_rows = F,
                                cluster_columns = F,
                                row_names_side = "left",
                                column_names_side = "top")


  return(h1)

}


# Function to compute proportion of missing data for rows and columns
missing_data_proportions <- function(data) {
  row_missings <- apply(data, 1, function(x) mean(is.na(x)))
  col_missings <- apply(data, 2, function(x) mean(is.na(x)))

  # Replace NA in proportions with a small value
  row_missings[is.na(row_missings)] <- 0.01
  col_missings[is.na(col_missings)] <- 0.01

  list(row_missings = row_missings, col_missings = col_missings)
}

# Function to randomly drop rows and columns based on missing data proportions
random_drop <- function(data, row_missings, col_missings) {
  row_dropcount <- sample(1:(nrow(data) - 1), size = 1, replace = F)
  col_dropcount <- sample(1:(ncol(data) - 1), size = 1, replace = F)
  rows_to_drop <- sample(1:nrow(data), size = row_dropcount, prob = row_missings, replace = TRUE)
  cols_to_drop <- sample(1:ncol(data), size = col_dropcount, prob = col_missings, replace = TRUE)
  data[-rows_to_drop, -cols_to_drop, drop = FALSE]
}
	# Main function to find the 'optimal' combination of rows and columns to drop in order to
	# maximise the number of remaining data points in a data set

	# Search space rapidly becomes enormous as data size grows, so this random search solution will
	# only ever be a rough approximation of the optimal data set.

	optimiseDataset <- function(data, max_iterations = 10000) {

	best_solution <- data
	dims=dim(data[complete.cases(data),])
	orig_dimension <- dims[1]*dims[2]
	best_dimension <- orig_dimension

	# loop
	progress <- txtProgressBar(min = 0, max = max_iterations, style = 3)
	for (i in 1:max_iterations) {
	setTxtProgressBar(progress, i)
	set.seed(i)
	proportions <- missing_data_proportions(data)
	dropped_data <- random_drop(data, proportions$row_missings, proportions$col_missings)


	dims=dim(as.matrix(dropped_data[complete.cases(dropped_data),]))
	complete_case_dimension <- dims[1]*dims[2]


	if (complete_case_dimension > best_dimension) {
	best_solution <- dropped_data
	best_dimension <- complete_case_dimension
	}
	}

	print(paste0("Original data has ",orig_dimension," complete cases"))
	print(paste0("Optimal solution has ",best_dimension," complete cases"))

	# get complete cases version of data set
	complete_cases=best_solution[complete.cases(best_solution),]

	# get plot
	h1=plotKeptDropped(data = data,complete_cases = complete_cases)

	# return results
	return(list(best_dimension=best_dimension,
	best_solution=best_solution,
	complete_cases=complete_cases,
	heatmap=h1))
	}




	# function to plot the kept/dropped columns using ComplexHeatmap
	# Takes iunputs from the optimiseDataset function above
	plotKeptDropped <- function(data,complete_cases){

	# create boolean matrix of NAs
	dat_na=is.na(data)

	# reorder data by NA counts in rows and columns
	dat <- data[order(rowSums(dat_na)),order(colSums(dat_na))]

	# get boolean inclusion vectors for rows and columns
	rowvect=rownames(data)%in%rownames(complete_cases)
	colvect=colnames(data)%in%colnames(complete_cases)

	# create new create boolean matrix of NAs in reordered data set
	dat_na_new=is.na(dat)

	# function to colour plot
	col_fun = circlize::colorRamp2(c(0,1), c( "grey40","white"))

	# create plot
	h1 <- ComplexHeatmap::Heatmap(matrix = dat_na_new,
	col = col_fun,show_heatmap_legend = F,
	row_split = (ifelse(rowvect,"Row included","Row removed")),
	column_split = (ifelse(colvect,"Column included","Column removed")),
	cluster_rows = F,
	cluster_columns = F,
	row_names_side = "left",
	column_names_side = "top")


	return(h1)

	}



	# Function to compute proportion of missing data for rows and columns
	missing_data_proportions <- function(data) {
	row_missings <- apply(data, 1, function(x) mean(is.na(x)))
	col_missings <- apply(data, 2, function(x) mean(is.na(x)))

	# Replace NA in proportions with a small value
	row_missings[is.na(row_missings)] <- 0.01
	col_missings[is.na(col_missings)] <- 0.01

	list(row_missings = row_missings, col_missings = col_missings)
	}

	# Function to randomly drop rows and columns based on missing data proportions
	random_drop <- function(data, row_missings, col_missings) {
	row_dropcount <- sample(1:(nrow(data) - 1), size = 1, replace = F)
	col_dropcount <- sample(1:(ncol(data) - 1), size = 1, replace = F)
	rows_to_drop <- sample(1:nrow(data), size = row_dropcount, prob = row_missings, replace = TRUE)
	cols_to_drop <- sample(1:ncol(data), size = col_dropcount, prob = col_missings, replace = TRUE)
	data[-rows_to_drop, -cols_to_drop, drop = FALSE]
	}