Skip to content

Instantly share code, notes, and snippets.

@mathzero
Last active January 7, 2024 17:54
Show Gist options
  • Save mathzero/25b29dff128622820401ca232587cfae to your computer and use it in GitHub Desktop.
Save mathzero/25b29dff128622820401ca232587cfae to your computer and use it in GitHub Desktop.
Search algorithm to drop NAs from data set with minimum data loss
# Main function to find the 'optimal' combination of rows and columns to drop in order to
# maximise the number of remaining data points in a data set
# Search space rapidly becomes enormous as data size grows, so this random search solution will
# only ever be a rough approximation of the optimal data set.
optimiseDataset <- function(data, max_iterations = 10000) {
best_solution <- data
dims=dim(data[complete.cases(data),])
orig_dimension <- dims[1]*dims[2]
best_dimension <- orig_dimension
# loop
progress <- txtProgressBar(min = 0, max = max_iterations, style = 3)
for (i in 1:max_iterations) {
setTxtProgressBar(progress, i)
set.seed(i)
proportions <- missing_data_proportions(data)
dropped_data <- random_drop(data, proportions$row_missings, proportions$col_missings)
dims=dim(as.matrix(dropped_data[complete.cases(dropped_data),]))
complete_case_dimension <- dims[1]*dims[2]
if (complete_case_dimension > best_dimension) {
best_solution <- dropped_data
best_dimension <- complete_case_dimension
}
}
print(paste0("Original data has ",orig_dimension," complete cases"))
print(paste0("Optimal solution has ",best_dimension," complete cases"))
# get complete cases version of data set
complete_cases=best_solution[complete.cases(best_solution),]
# get plot
h1=plotKeptDropped(data = data,complete_cases = complete_cases)
# return results
return(list(best_dimension=best_dimension,
best_solution=best_solution,
complete_cases=complete_cases,
heatmap=h1))
}
# function to plot the kept/dropped columns using ComplexHeatmap
# Takes iunputs from the optimiseDataset function above
plotKeptDropped <- function(data,complete_cases){
# create boolean matrix of NAs
dat_na=is.na(data)
# reorder data by NA counts in rows and columns
dat <- data[order(rowSums(dat_na)),order(colSums(dat_na))]
# get boolean inclusion vectors for rows and columns
rowvect=rownames(data)%in%rownames(complete_cases)
colvect=colnames(data)%in%colnames(complete_cases)
# create new create boolean matrix of NAs in reordered data set
dat_na_new=is.na(dat)
# function to colour plot
col_fun = circlize::colorRamp2(c(0,1), c( "grey40","white"))
# create plot
h1 <- ComplexHeatmap::Heatmap(matrix = dat_na_new,
col = col_fun,show_heatmap_legend = F,
row_split = (ifelse(rowvect,"Row included","Row removed")),
column_split = (ifelse(colvect,"Column included","Column removed")),
cluster_rows = F,
cluster_columns = F,
row_names_side = "left",
column_names_side = "top")
return(h1)
}
# Function to compute proportion of missing data for rows and columns
missing_data_proportions <- function(data) {
row_missings <- apply(data, 1, function(x) mean(is.na(x)))
col_missings <- apply(data, 2, function(x) mean(is.na(x)))
# Replace NA in proportions with a small value
row_missings[is.na(row_missings)] <- 0.01
col_missings[is.na(col_missings)] <- 0.01
list(row_missings = row_missings, col_missings = col_missings)
}
# Function to randomly drop rows and columns based on missing data proportions
random_drop <- function(data, row_missings, col_missings) {
row_dropcount <- sample(1:(nrow(data) - 1), size = 1, replace = F)
col_dropcount <- sample(1:(ncol(data) - 1), size = 1, replace = F)
rows_to_drop <- sample(1:nrow(data), size = row_dropcount, prob = row_missings, replace = TRUE)
cols_to_drop <- sample(1:ncol(data), size = col_dropcount, prob = col_missings, replace = TRUE)
data[-rows_to_drop, -cols_to_drop, drop = FALSE]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment