Skip to content

Instantly share code, notes, and snippets.

@shukwong
Last active June 10, 2025 19:31
Show Gist options
  • Save shukwong/ab9bc5812219db7e33988000cfc044ae to your computer and use it in GitHub Desktop.
Save shukwong/ab9bc5812219db7e33988000cfc044ae to your computer and use it in GitHub Desktop.
plot distribution of TopMed Imputation R2 in various MAF in one chromosome
# TopMed Imputation .info File Processor - Fixed Version
# Fast processing using data.table and vectorized operations
library(data.table)
library(ggplot2)
library(dplyr)
library(stringr)
# Function to process different file formats
process_file <- function(file_path, file_type = "auto") {
cat("Reading file:", file_path, "\n")
# Auto-detect file type if not specified
if (file_type == "auto") {
# Check file extension and structure
if (grepl("\\.(info|info\\.gz|vcf|vcf\\.gz)$", file_path)) {
file_type <- "topmed"
cat("Auto-detected: TopMed imputation format\n")
} else {
file_type <- "tabdelim"
cat("Auto-detected: Tab-delimited format\n")
}
}
if (file_type == "topmed") {
return(process_topmed_info(file_path))
} else {
return(process_tabdelim_file(file_path))
}
}
# Fast function to process TopMed .info file
process_topmed_info <- function(file_path) {
cat("Processing TopMed format file...\n")
# First, count lines starting with ## to skip them
cat("Scanning for header and comment lines...\n")
# Open connection to read line by line
if (grepl("\\.gz$", file_path)) {
con <- gzfile(file_path, "rt")
} else {
con <- file(file_path, "rt")
}
skip_lines <- 0
line_num <- 0
header_found <- FALSE
while (TRUE) {
line <- readLines(con, n = 1)
if (length(line) == 0) break
line_num <- line_num + 1
# Count lines that start with ## (these need to be skipped)
if (grepl("^##", line)) {
skip_lines <- skip_lines + 1
}
# Check if this is the header line (starts with #CHROM)
else if (grepl("^#CHROM", line)) {
header_found <- TRUE
cat("Found", skip_lines, "comment lines (##) to skip\n")
cat("Found header (#CHROM) at line", line_num, "\n")
break
}
# If we hit a line that doesn't start with # at all, we've gone too far
else if (!grepl("^#", line)) {
break
}
# Safety check - don't read too many lines looking for header
if (line_num > 1000) {
close(con)
stop("Header line starting with #CHROM not found in first 1000 lines")
}
}
close(con)
if (!header_found) {
stop("Header line starting with #CHROM not found")
}
cat("Loading data with fread, skipping", skip_lines, "comment lines...\n")
# Now use fread with the correct skip number (skip only the ## lines)
dt <- fread(
file_path,
skip = skip_lines, # Skip only the ## comment lines
header = TRUE, # The #CHROM line will be treated as header
sep = "\t",
stringsAsFactors = FALSE,
showProgress = TRUE
)
cat("Loaded", nrow(dt), "variants\n")
cat("Extracting MAF and R2 values from INFO column...\n")
# Extract MAF and R2 using vectorized string operations
# Much faster than loop-based extraction
# Extract MAF values using regex
maf_extracted <- str_extract(dt$INFO, "MAF=([0-9.e-]+)")
maf_values <- as.numeric(gsub("MAF=", "", maf_extracted))
# Extract R2 values using regex
r2_extracted <- str_extract(dt$INFO, "R2=([0-9.e-]+)")
r2_values <- as.numeric(gsub("R2=", "", r2_extracted))
# Create result data.table with only non-missing values
result_dt <- data.table(
MAF = maf_values,
R2 = r2_values
)
# Remove rows with missing MAF or R2
result_dt <- result_dt[!is.na(MAF) & !is.na(R2)]
cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n")
return(result_dt)
}
# Function to process tab-delimited files with direct MAF and R2 columns
process_tabdelim_file <- function(file_path) {
cat("Processing tab-delimited file...\n")
# Read the file directly with fread
dt <- fread(
file_path,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE,
showProgress = TRUE
)
cat("Loaded", nrow(dt), "rows\n")
cat("Columns found:", paste(names(dt), collapse = ", "), "\n")
# Check for required columns
maf_col <- NULL
r2_col <- NULL
# Look for MAF column (case-insensitive)
maf_candidates <- c("all_maf", "maf", "MAF", "ALL_MAF", "allele_freq", "AF")
for (col in maf_candidates) {
if (col %in% names(dt)) {
maf_col <- col
break
}
}
# Look for R2 column (case-insensitive)
r2_candidates <- c("info", "r2", "R2", "INFO", "rsq", "RSQ", "imputation_quality")
for (col in r2_candidates) {
if (col %in% names(dt)) {
r2_col <- col
break
}
}
if (is.null(maf_col)) {
stop("MAF column not found. Looking for: ", paste(maf_candidates, collapse = ", "))
}
if (is.null(r2_col)) {
stop("R2 column not found. Looking for: ", paste(r2_candidates, collapse = ", "))
}
cat("Using MAF column:", maf_col, "\n")
cat("Using R2 column:", r2_col, "\n")
# Extract the values
result_dt <- data.table(
MAF = as.numeric(dt[[maf_col]]),
R2 = as.numeric(dt[[r2_col]])
)
# Remove rows with missing MAF or R2
result_dt <- result_dt[!is.na(MAF) & !is.na(R2)]
cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n")
return(result_dt)
}
# Function to process multiple files
process_multiple_files <- function(file_list, ancestry_labels = NULL) {
cat("Processing", length(file_list), "files...\n")
all_data <- list()
for (i in seq_along(file_list)) {
file_path <- file_list[i]
cat("\n--- Processing file", i, "of", length(file_list), ":", basename(file_path), "---\n")
if (!file.exists(file_path)) {
cat("Warning: File not found:", file_path, "- skipping\n")
next
}
# Process the file
file_data <- process_file(file_path)
if (nrow(file_data) == 0) {
cat("Warning: No valid data found in", basename(file_path), "- skipping\n")
next
}
# Add file identifier and ancestry label
file_data$File <- basename(file_path)
# Use ancestry label if provided, otherwise use cleaned file name
if (!is.null(ancestry_labels) && i <= length(ancestry_labels)) {
file_data$Ancestry <- ancestry_labels[i]
cat("Assigned ancestry label:", ancestry_labels[i], "to file:", basename(file_path), "\n")
} else {
# Create a cleaner file name (remove extensions)
clean_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_path))
file_data$Ancestry <- clean_name
cat("No ancestry label provided, using cleaned filename:", clean_name, "\n")
}
# Store the data
all_data[[i]] <- file_data
}
# Combine all data
if (length(all_data) == 0) {
stop("No valid data found in any files")
}
combined_data <- rbindlist(all_data, fill = TRUE)
cat("\nCombined data summary:\n")
cat("Total variants:", nrow(combined_data), "\n")
cat("Files processed:", length(unique(combined_data$File)), "\n")
# Check if Ancestry column exists and show its values
if ("Ancestry" %in% names(combined_data)) {
cat("Ancestries found:", paste(unique(combined_data$Ancestry), collapse = ", "), "\n")
} else {
cat("Warning: No Ancestry column found in combined data!\n")
}
# Print per-ancestry summary
if ("Ancestry" %in% names(combined_data)) {
ancestry_summary <- combined_data[, .(Count = .N), by = Ancestry]
print(ancestry_summary)
}
return(combined_data)
}
# Function to create violin plots for multiple files
plot_violin_by_maf_multifile <- function(data, exclude_perfect_r2 = FALSE) {
cat("Creating MAF categories and violin plots...\n")
# Convert to data.table if not already
if (!is.data.table(data)) {
data <- as.data.table(data)
}
# Debug: Check what columns we have
cat("Available columns in data:", paste(names(data), collapse = ", "), "\n")
if ("Ancestry" %in% names(data)) {
cat("Ancestry values found:", paste(unique(data$Ancestry), collapse = ", "), "\n")
}
# Filter out R2 == 1 if requested
if (exclude_perfect_r2) {
original_count <- nrow(data)
data <- data[R2 != 1.0]
filtered_count <- nrow(data)
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
cat("Remaining variants:", filtered_count, "\n")
}
# Create MAF categories using data.table syntax (very fast)
data[, MAF_Category := fcase(
MAF > 0.05, "MAF > 0.05",
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
MAF < 0.005, "MAF < 0.005",
default = "Other"
)]
# Set factor levels
data[, MAF_Category := factor(MAF_Category,
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]
# FIXED: Always use Ancestry column if it exists, with proper factor ordering
if ("Ancestry" %in% names(data)) {
display_label <- "Ancestry"
# Convert to factor to control ordering in plots
data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))]
cat("Using Ancestry labels for plotting\n")
} else {
display_label <- "File"
# Create a cleaner file name for plotting (remove extensions and shorten if needed)
data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)]
# If file names are very long, keep only the most distinctive part
if (any(nchar(unique(data$Display_Label)) > 15)) {
data[, Display_Label := substr(Display_Label, 1, 15)]
}
# Convert to factor
data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))]
cat("Using File names for plotting\n")
}
cat("Display labels:", paste(levels(data$Display_Label), collapse = ", "), "\n")
# Fast summary statistics using data.table
summary_stats <- data[!is.na(MAF_Category), .(
Count = .N,
Mean_R2 = mean(R2, na.rm = TRUE),
Median_R2 = median(R2, na.rm = TRUE),
Min_R2 = min(R2, na.rm = TRUE),
Max_R2 = max(R2, na.rm = TRUE),
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
), by = .(Display_Label, MAF_Category)]
cat("\nSummary by", display_label, "and MAF category:\n")
print(summary_stats)
# Create plot title with filtering info
plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups")
if (exclude_perfect_r2) {
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
}
# Check how many files we have to determine the best layout
n_groups <- length(unique(data$Display_Label))
# For many files, use different colors for each file instead of MAF categories
if (n_groups > 6) {
# Use RColorBrewer or rainbow colors for many files
colors <- rainbow(n_groups)
names(colors) <- levels(data$Display_Label)
# Create violin plot with files as fill aesthetic
p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = Display_Label)) +
geom_violin(scale = "width", trim = FALSE, alpha = 0.7) +
geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5, fill = "white") +
facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) +
scale_fill_manual(values = colors) +
labs(
title = "Distribution of Imputation Quality (R²) by MAF Categories",
subtitle = plot_subtitle,
x = display_label,
y = "R² (Imputation Quality)",
fill = display_label
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 11, face = "bold"),
legend.position = "none", # Too many files for legend
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
) +
ylim(0, 1)
} else {
# For fewer files, keep MAF categories as colors
p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = MAF_Category)) +
geom_violin(scale = "width", trim = FALSE, alpha = 0.7) +
geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5) +
facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) +
scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57",
"MAF [0.005, 0.05]" = "#4682B4",
"MAF < 0.005" = "#CD5C5C")) +
labs(
title = "Distribution of Imputation Quality (R²) by MAF Categories",
subtitle = plot_subtitle,
x = display_label,
y = "R² (Imputation Quality)",
fill = "MAF Category"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 11, face = "bold"),
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
) +
ylim(0, 1)
}
return(list(plot = p, summary = summary_stats, data = data))
}
# Alternative plotting function for side-by-side comparison
plot_violin_sidebyside_multifile <- function(data, exclude_perfect_r2 = FALSE) {
cat("Creating side-by-side violin plots by MAF categories...\n")
# Convert to data.table if not already
if (!is.data.table(data)) {
data <- as.data.table(data)
}
# Filter out R2 == 1 if requested
if (exclude_perfect_r2) {
original_count <- nrow(data)
data <- data[R2 != 1.0]
filtered_count <- nrow(data)
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
cat("Remaining variants:", filtered_count, "\n")
}
# Create MAF categories using data.table syntax (very fast)
data[, MAF_Category := fcase(
MAF > 0.05, "MAF > 0.05",
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
MAF < 0.005, "MAF < 0.005",
default = "Other"
)]
# Set factor levels
data[, MAF_Category := factor(MAF_Category,
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]
# FIXED: Always use Ancestry column if it exists, with proper factor ordering
if ("Ancestry" %in% names(data)) {
display_label <- "Ancestry"
# Convert to factor to control ordering in plots
data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))]
cat("Using Ancestry labels for plotting\n")
} else {
display_label <- "File"
# Create a cleaner file name for plotting (remove extensions and shorten if needed)
data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)]
# If file names are very long, keep only the most distinctive part
if (any(nchar(unique(data$Display_Label)) > 15)) {
data[, Display_Label := substr(Display_Label, 1, 15)]
}
# Convert to factor
data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))]
}
# Fast summary statistics using data.table
summary_stats <- data[!is.na(MAF_Category), .(
Count = .N,
Mean_R2 = mean(R2, na.rm = TRUE),
Median_R2 = median(R2, na.rm = TRUE),
Min_R2 = min(R2, na.rm = TRUE),
Max_R2 = max(R2, na.rm = TRUE),
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
), by = .(Display_Label, MAF_Category)]
cat("\nSummary by", display_label, "and MAF category:\n")
print(summary_stats)
# Create plot title with filtering info
plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups")
if (exclude_perfect_r2) {
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
}
# Check how many groups we have
n_groups <- length(unique(data$Display_Label))
# Generate colors for groups using the levels from the factor
if (n_groups <= 10) {
# Use Set3 palette for up to 10 files (more color-blind friendly)
library(RColorBrewer)
group_colors <- brewer.pal(min(n_groups, 12), "Set3")[1:n_groups]
} else {
# Use rainbow for many files
group_colors <- rainbow(n_groups)
}
names(group_colors) <- levels(data$Display_Label)
# Create side-by-side violin plot (MAF categories as x-axis, groups as colors)
p <- ggplot(data[!is.na(MAF_Category)], aes(x = MAF_Category, y = R2, fill = Display_Label)) +
geom_violin(scale = "width", trim = FALSE, alpha = 0.7, position = position_dodge(width = 0.8)) +
geom_boxplot(width = 0.1, alpha = 0.8, position = position_dodge(width = 0.8),
outlier.size = 0.3, color = "black") +
scale_fill_manual(values = group_colors) +
labs(
title = "Distribution of Imputation Quality (R²) by MAF Categories",
subtitle = plot_subtitle,
x = "MAF Category",
y = "R² (Imputation Quality)",
fill = display_label
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
legend.position = if(n_groups <= 8) "bottom" else "right",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 10),
axis.text.y = element_text(size = 10)
) +
ylim(0, 1)
# If too many groups, split legend into multiple columns
if (n_groups > 8) {
p <- p + guides(fill = guide_legend(ncol = ceiling(n_groups/10)))
}
return(list(plot = p, summary = summary_stats, data = data))
}
# Keep the original single-file plotting function for backward compatibility
plot_r2_by_maf <- function(data, exclude_perfect_r2 = FALSE) {
cat("Creating MAF categories and calculating statistics...\n")
# Convert to data.table if not already
if (!is.data.table(data)) {
data <- as.data.table(data)
}
# Filter out R2 == 1 if requested
if (exclude_perfect_r2) {
original_count <- nrow(data)
data <- data[R2 != 1.0]
filtered_count <- nrow(data)
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
cat("Remaining variants:", filtered_count, "\n")
}
# Create MAF categories using data.table syntax (very fast)
data[, MAF_Category := fcase(
MAF > 0.05, "MAF > 0.05",
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
MAF < 0.005, "MAF < 0.005",
default = "Other"
)]
# Set factor levels
data[, MAF_Category := factor(MAF_Category,
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]
# Fast summary statistics using data.table
summary_stats <- data[!is.na(MAF_Category), .(
Count = .N,
Mean_R2 = mean(R2, na.rm = TRUE),
Median_R2 = median(R2, na.rm = TRUE),
Min_R2 = min(R2, na.rm = TRUE),
Max_R2 = max(R2, na.rm = TRUE),
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
), by = MAF_Category]
cat("\nSummary by MAF category:\n")
print(summary_stats)
# Create plot title with filtering info
plot_subtitle <- "Imputation Results"
if (exclude_perfect_r2) {
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
}
# Create the plot using data.table (ggplot2 works well with data.table)
p <- ggplot(data[!is.na(MAF_Category)], aes(x = R2, fill = MAF_Category)) +
geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
facet_wrap(~MAF_Category, scales = "free_y", ncol = 1) +
scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57",
"MAF [0.005, 0.05]" = "#4682B4",
"MAF < 0.005" = "#CD5C5C")) +
labs(
title = "Distribution of Imputation Quality (R²) by MAF Categories",
subtitle = plot_subtitle,
x = "R² (Imputation Quality)",
y = "Count",
fill = "MAF Category"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 12),
strip.text = element_text(size = 11, face = "bold"),
legend.position = "bottom"
) +
xlim(0, 1)
return(list(plot = p, summary = summary_stats, data = data))
}
# Main function that takes command line arguments
main <- function() {
# Get command line arguments
args <- commandArgs(trailingOnly = TRUE)
# Check if file path is provided
if (length(args) == 0) {
cat("Usage: Rscript topmed_processor.R <file_or_list> [options]\n")
cat("Examples:\n")
cat(" # Single file\n")
cat(" Rscript topmed_processor.R chr20.info.gz\n")
cat(" \n")
cat(" # Multiple files with ancestry labels\n")
cat(" Rscript topmed_processor.R chr20.info.gz chr21.info.gz --ancestry='EUR,AFR' --sidebyside\n")
cat(" \n")
cat(" # Multiple files from a list file\n")
cat(" Rscript topmed_processor.R --filelist=file_list.txt\n")
cat(" \n")
cat(" # With options\n")
cat(" Rscript topmed_processor.R chr*.info.gz --exclude-perfect --violin\n")
cat("\nOptions:\n")
cat(" --filelist=<file> Read file paths from a text file (one per line)\n")
cat(" --ancestry=<list> Comma-separated ancestry labels (e.g., 'EUR,AFR,EAS')\n")
cat(" --violin Create violin plots (automatically enabled for multiple files)\n")
cat(" --sidebyside Create side-by-side violin plots (MAF categories on x-axis)\n")
cat(" --histogram Force histogram plots even for multiple files\n")
cat(" --exclude-perfect Filter out variants with R² = 1.0 before plotting\n")
cat(" --type=<format> Force file format (auto, topmed, tabdelim)\n")
cat("\nFile types:\n")
cat(" auto (default) - Auto-detect based on file extension\n")
cat(" topmed - TopMed .info format with INFO column containing MAF= and R2=\n")
cat(" tabdelim - Tab-delimited with separate MAF and R2 columns\n")
cat("\nFor tab-delimited files, script looks for columns named:\n")
cat(" MAF: all_maf, maf, MAF, ALL_MAF, allele_freq, AF\n")
cat(" R2: info, r2, R2, INFO, rsq, RSQ, imputation_quality\n")
stop("Please provide file path(s) as arguments")
}
# Parse arguments
file_list_arg <- grep("^--filelist=", args, value = TRUE)
ancestry_arg <- grep("^--ancestry=", args, value = TRUE)
exclude_perfect_r2 <- "--exclude-perfect" %in% args
force_violin <- "--violin" %in% args
force_sidebyside <- "--sidebyside" %in% args
force_histogram <- "--histogram" %in% args
type_arg <- grep("^--type=", args, value = TRUE)
# Get file type if specified
file_type <- "auto"
if (length(type_arg) > 0) {
file_type <- gsub("^--type=", "", type_arg[1])
}
# FIXED: Get ancestry labels if specified - handle both quoted and unquoted strings
ancestry_labels <- NULL
if (length(ancestry_arg) > 0) {
ancestry_string <- gsub("^--ancestry=", "", ancestry_arg[1])
# Remove quotes if present
ancestry_string <- gsub("^['\"]|['\"]$", "", ancestry_string)
ancestry_labels <- trimws(strsplit(ancestry_string, ",")[[1]])
cat("Parsed ancestry labels:", paste(ancestry_labels, collapse = ", "), "\n")
}
# Remove option arguments to get file paths
file_args <- args[!grepl("^--", args)]
# Determine file list
if (length(file_list_arg) > 0) {
# Read from file list
list_file <- gsub("^--filelist=", "", file_list_arg[1])
if (!file.exists(list_file)) {
stop(paste("File list not found:", list_file))
}
file_paths <- readLines(list_file)
file_paths <- file_paths[file_paths != "" & !grepl("^#", file_paths)] # Remove empty lines and comments
cat("Read", length(file_paths), "file paths from", list_file, "\n")
} else {
# Use command line arguments
file_paths <- file_args
}
if (length(file_paths) == 0) {
stop("No file paths provided")
}
cat("File paths to process:\n")
for (i in seq_along(file_paths)) {
cat(" ", i, ":", file_paths[i], "\n")
}
# Check files exist
missing_files <- file_paths[!file.exists(file_paths)]
if (length(missing_files) > 0) {
cat("Warning: The following files were not found:\n")
for (f in missing_files) {
cat(" ", f, "\n")
}
file_paths <- file_paths[file.exists(file_paths)]
}
if (length(file_paths) == 0) {
stop("No valid files found")
}
# FIXED: Check ancestry labels if provided with better error messaging
if (!is.null(ancestry_labels)) {
if (length(ancestry_labels) != length(file_paths)) {
cat("ERROR: Number of ancestry labels (", length(ancestry_labels),
") must match number of files (", length(file_paths), ")\n", sep="")
cat("Ancestry labels provided: ", paste(ancestry_labels, collapse = ", "), "\n")
cat("Files found: ", length(file_paths), "\n")
for (i in seq_along(file_paths)) {
cat(" File", i, ":", basename(file_paths[i]), "\n")
}
stop("Mismatch between number of ancestry labels and files")
}
cat("Ancestry labels provided:\n")
for (i in seq_along(file_paths)) {
cat(" ", basename(file_paths[i]), " -> ", ancestry_labels[i], "\n")
}
}
cat("Processing", length(file_paths), "file(s):\n")
for (f in file_paths) {
cat(" ", f, "\n")
}
if (exclude_perfect_r2) {
cat("Will exclude variants with R² = 1.0 from plots\n")
}
# Start timing
start_time <- Sys.time()
# Process files
if (length(file_paths) == 1 && !force_violin) {
# Single file processing (original functionality)
cat("\n=== Single file processing ===\n")
imputation_data <- process_file(file_paths[1], file_type = file_type)
# Use original plotting function for single files (unless violin forced)
results <- plot_r2_by_maf(imputation_data, exclude_perfect_r2 = exclude_perfect_r2)
# Create output filenames
base_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_paths[1]))
} else {
# Multiple file processing
cat("\n=== Multiple file processing ===\n")
# Process all files
combined_data <- process_multiple_files(file_paths, ancestry_labels)
# FIXED: Debug output to verify ancestry labels are correctly applied
if ("Ancestry" %in% names(combined_data)) {
cat("\nFinal ancestry assignments in combined data:\n")
ancestry_check <- combined_data[, .(Count = .N), by = .(File = basename(File), Ancestry)]
print(ancestry_check)
}
# Create plots based on options
if (force_histogram) {
results <- plot_r2_by_maf(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
plot_type <- "histogram"
} else if (force_sidebyside) {
results <- plot_violin_sidebyside_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
plot_type <- "violin_sidebyside"
} else {
# Default: stacked violin plots (original layout)
results <- plot_violin_by_maf_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
plot_type <- "violin"
}
# Create output filename based on ancestry labels or number of files
if (!is.null(ancestry_labels)) {
base_name <- paste0("combined_", paste(ancestry_labels, collapse = "_"))
} else {
base_name <- paste0("combined_", length(file_paths), "_files")
}
}
# End timing
end_time <- Sys.time()
processing_time <- end_time - start_time
cat("\nTotal processing time:", round(as.numeric(processing_time), 2),
attr(processing_time, "units"), "\n")
# Display the plot
print(results$plot)
# Add suffix if filtering perfect R2
suffix <- ""
if (exclude_perfect_r2) {
suffix <- "_no_perfect_r2"
}
# Add plot type suffix
if (length(file_paths) > 1 && !force_histogram) {
plot_type_suffix <- if (force_sidebyside) "violin_sidebyside" else "violin"
} else {
plot_type_suffix <- "histogram"
}
plot_filename <- paste0(base_name, "_r2_", plot_type_suffix, "_by_maf", suffix, ".png")
data_filename <- paste0(base_name, "_processed_data", suffix, ".csv")
# Save the plot
plot_width <- if (length(file_paths) > 3) max(10, length(file_paths) * 2) else 10
ggsave(plot_filename,
plot = results$plot,
width = plot_width, height = 8,
dpi = 300)
cat("Plot saved as:", plot_filename, "\n")
# Save processed data using fwrite (much faster than write.csv)
fwrite(results$data, data_filename)
cat("Processed data saved as:", data_filename, "\n")
# Print final statistics
total_variants <- nrow(results$data)
cat("\nFinal Statistics:\n")
cat("Total variants processed:", total_variants, "\n")
if (length(file_paths) > 1) {
cat("Files processed:", length(file_paths), "\n")
cat("Average variants per file:", round(total_variants / length(file_paths), 0), "\n")
if ("Ancestry" %in% names(results$data)) {
cat("Ancestry groups:", paste(unique(results$data$Ancestry), collapse = ", "), "\n")
}
}
cat("Processing speed:", round(total_variants / as.numeric(processing_time), 0),
"variants per", attr(processing_time, "units"), "\n")
}
# Run main function
if (!interactive()) {
main()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment