Last active
June 10, 2025 19:31
-
-
Save shukwong/ab9bc5812219db7e33988000cfc044ae to your computer and use it in GitHub Desktop.
plot distribution of TopMed Imputation R2 in various MAF in one chromosome
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TopMed Imputation .info File Processor - Fixed Version | |
# Fast processing using data.table and vectorized operations | |
library(data.table) | |
library(ggplot2) | |
library(dplyr) | |
library(stringr) | |
# Function to process different file formats | |
process_file <- function(file_path, file_type = "auto") { | |
cat("Reading file:", file_path, "\n") | |
# Auto-detect file type if not specified | |
if (file_type == "auto") { | |
# Check file extension and structure | |
if (grepl("\\.(info|info\\.gz|vcf|vcf\\.gz)$", file_path)) { | |
file_type <- "topmed" | |
cat("Auto-detected: TopMed imputation format\n") | |
} else { | |
file_type <- "tabdelim" | |
cat("Auto-detected: Tab-delimited format\n") | |
} | |
} | |
if (file_type == "topmed") { | |
return(process_topmed_info(file_path)) | |
} else { | |
return(process_tabdelim_file(file_path)) | |
} | |
} | |
# Fast function to process TopMed .info file | |
process_topmed_info <- function(file_path) { | |
cat("Processing TopMed format file...\n") | |
# First, count lines starting with ## to skip them | |
cat("Scanning for header and comment lines...\n") | |
# Open connection to read line by line | |
if (grepl("\\.gz$", file_path)) { | |
con <- gzfile(file_path, "rt") | |
} else { | |
con <- file(file_path, "rt") | |
} | |
skip_lines <- 0 | |
line_num <- 0 | |
header_found <- FALSE | |
while (TRUE) { | |
line <- readLines(con, n = 1) | |
if (length(line) == 0) break | |
line_num <- line_num + 1 | |
# Count lines that start with ## (these need to be skipped) | |
if (grepl("^##", line)) { | |
skip_lines <- skip_lines + 1 | |
} | |
# Check if this is the header line (starts with #CHROM) | |
else if (grepl("^#CHROM", line)) { | |
header_found <- TRUE | |
cat("Found", skip_lines, "comment lines (##) to skip\n") | |
cat("Found header (#CHROM) at line", line_num, "\n") | |
break | |
} | |
# If we hit a line that doesn't start with # at all, we've gone too far | |
else if (!grepl("^#", line)) { | |
break | |
} | |
# Safety check - don't read too many lines looking for header | |
if (line_num > 1000) { | |
close(con) | |
stop("Header line starting with #CHROM not found in first 1000 lines") | |
} | |
} | |
close(con) | |
if (!header_found) { | |
stop("Header line starting with #CHROM not found") | |
} | |
cat("Loading data with fread, skipping", skip_lines, "comment lines...\n") | |
# Now use fread with the correct skip number (skip only the ## lines) | |
dt <- fread( | |
file_path, | |
skip = skip_lines, # Skip only the ## comment lines | |
header = TRUE, # The #CHROM line will be treated as header | |
sep = "\t", | |
stringsAsFactors = FALSE, | |
showProgress = TRUE | |
) | |
cat("Loaded", nrow(dt), "variants\n") | |
cat("Extracting MAF and R2 values from INFO column...\n") | |
# Extract MAF and R2 using vectorized string operations | |
# Much faster than loop-based extraction | |
# Extract MAF values using regex | |
maf_extracted <- str_extract(dt$INFO, "MAF=([0-9.e-]+)") | |
maf_values <- as.numeric(gsub("MAF=", "", maf_extracted)) | |
# Extract R2 values using regex | |
r2_extracted <- str_extract(dt$INFO, "R2=([0-9.e-]+)") | |
r2_values <- as.numeric(gsub("R2=", "", r2_extracted)) | |
# Create result data.table with only non-missing values | |
result_dt <- data.table( | |
MAF = maf_values, | |
R2 = r2_values | |
) | |
# Remove rows with missing MAF or R2 | |
result_dt <- result_dt[!is.na(MAF) & !is.na(R2)] | |
cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n") | |
return(result_dt) | |
} | |
# Function to process tab-delimited files with direct MAF and R2 columns | |
process_tabdelim_file <- function(file_path) { | |
cat("Processing tab-delimited file...\n") | |
# Read the file directly with fread | |
dt <- fread( | |
file_path, | |
header = TRUE, | |
sep = "\t", | |
stringsAsFactors = FALSE, | |
showProgress = TRUE | |
) | |
cat("Loaded", nrow(dt), "rows\n") | |
cat("Columns found:", paste(names(dt), collapse = ", "), "\n") | |
# Check for required columns | |
maf_col <- NULL | |
r2_col <- NULL | |
# Look for MAF column (case-insensitive) | |
maf_candidates <- c("all_maf", "maf", "MAF", "ALL_MAF", "allele_freq", "AF") | |
for (col in maf_candidates) { | |
if (col %in% names(dt)) { | |
maf_col <- col | |
break | |
} | |
} | |
# Look for R2 column (case-insensitive) | |
r2_candidates <- c("info", "r2", "R2", "INFO", "rsq", "RSQ", "imputation_quality") | |
for (col in r2_candidates) { | |
if (col %in% names(dt)) { | |
r2_col <- col | |
break | |
} | |
} | |
if (is.null(maf_col)) { | |
stop("MAF column not found. Looking for: ", paste(maf_candidates, collapse = ", ")) | |
} | |
if (is.null(r2_col)) { | |
stop("R2 column not found. Looking for: ", paste(r2_candidates, collapse = ", ")) | |
} | |
cat("Using MAF column:", maf_col, "\n") | |
cat("Using R2 column:", r2_col, "\n") | |
# Extract the values | |
result_dt <- data.table( | |
MAF = as.numeric(dt[[maf_col]]), | |
R2 = as.numeric(dt[[r2_col]]) | |
) | |
# Remove rows with missing MAF or R2 | |
result_dt <- result_dt[!is.na(MAF) & !is.na(R2)] | |
cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n") | |
return(result_dt) | |
} | |
# Function to process multiple files | |
process_multiple_files <- function(file_list, ancestry_labels = NULL) { | |
cat("Processing", length(file_list), "files...\n") | |
all_data <- list() | |
for (i in seq_along(file_list)) { | |
file_path <- file_list[i] | |
cat("\n--- Processing file", i, "of", length(file_list), ":", basename(file_path), "---\n") | |
if (!file.exists(file_path)) { | |
cat("Warning: File not found:", file_path, "- skipping\n") | |
next | |
} | |
# Process the file | |
file_data <- process_file(file_path) | |
if (nrow(file_data) == 0) { | |
cat("Warning: No valid data found in", basename(file_path), "- skipping\n") | |
next | |
} | |
# Add file identifier and ancestry label | |
file_data$File <- basename(file_path) | |
# Use ancestry label if provided, otherwise use cleaned file name | |
if (!is.null(ancestry_labels) && i <= length(ancestry_labels)) { | |
file_data$Ancestry <- ancestry_labels[i] | |
cat("Assigned ancestry label:", ancestry_labels[i], "to file:", basename(file_path), "\n") | |
} else { | |
# Create a cleaner file name (remove extensions) | |
clean_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_path)) | |
file_data$Ancestry <- clean_name | |
cat("No ancestry label provided, using cleaned filename:", clean_name, "\n") | |
} | |
# Store the data | |
all_data[[i]] <- file_data | |
} | |
# Combine all data | |
if (length(all_data) == 0) { | |
stop("No valid data found in any files") | |
} | |
combined_data <- rbindlist(all_data, fill = TRUE) | |
cat("\nCombined data summary:\n") | |
cat("Total variants:", nrow(combined_data), "\n") | |
cat("Files processed:", length(unique(combined_data$File)), "\n") | |
# Check if Ancestry column exists and show its values | |
if ("Ancestry" %in% names(combined_data)) { | |
cat("Ancestries found:", paste(unique(combined_data$Ancestry), collapse = ", "), "\n") | |
} else { | |
cat("Warning: No Ancestry column found in combined data!\n") | |
} | |
# Print per-ancestry summary | |
if ("Ancestry" %in% names(combined_data)) { | |
ancestry_summary <- combined_data[, .(Count = .N), by = Ancestry] | |
print(ancestry_summary) | |
} | |
return(combined_data) | |
} | |
# Function to create violin plots for multiple files | |
plot_violin_by_maf_multifile <- function(data, exclude_perfect_r2 = FALSE) { | |
cat("Creating MAF categories and violin plots...\n") | |
# Convert to data.table if not already | |
if (!is.data.table(data)) { | |
data <- as.data.table(data) | |
} | |
# Debug: Check what columns we have | |
cat("Available columns in data:", paste(names(data), collapse = ", "), "\n") | |
if ("Ancestry" %in% names(data)) { | |
cat("Ancestry values found:", paste(unique(data$Ancestry), collapse = ", "), "\n") | |
} | |
# Filter out R2 == 1 if requested | |
if (exclude_perfect_r2) { | |
original_count <- nrow(data) | |
data <- data[R2 != 1.0] | |
filtered_count <- nrow(data) | |
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n") | |
cat("Remaining variants:", filtered_count, "\n") | |
} | |
# Create MAF categories using data.table syntax (very fast) | |
data[, MAF_Category := fcase( | |
MAF > 0.05, "MAF > 0.05", | |
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]", | |
MAF < 0.005, "MAF < 0.005", | |
default = "Other" | |
)] | |
# Set factor levels | |
data[, MAF_Category := factor(MAF_Category, | |
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))] | |
# FIXED: Always use Ancestry column if it exists, with proper factor ordering | |
if ("Ancestry" %in% names(data)) { | |
display_label <- "Ancestry" | |
# Convert to factor to control ordering in plots | |
data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))] | |
cat("Using Ancestry labels for plotting\n") | |
} else { | |
display_label <- "File" | |
# Create a cleaner file name for plotting (remove extensions and shorten if needed) | |
data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)] | |
# If file names are very long, keep only the most distinctive part | |
if (any(nchar(unique(data$Display_Label)) > 15)) { | |
data[, Display_Label := substr(Display_Label, 1, 15)] | |
} | |
# Convert to factor | |
data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))] | |
cat("Using File names for plotting\n") | |
} | |
cat("Display labels:", paste(levels(data$Display_Label), collapse = ", "), "\n") | |
# Fast summary statistics using data.table | |
summary_stats <- data[!is.na(MAF_Category), .( | |
Count = .N, | |
Mean_R2 = mean(R2, na.rm = TRUE), | |
Median_R2 = median(R2, na.rm = TRUE), | |
Min_R2 = min(R2, na.rm = TRUE), | |
Max_R2 = max(R2, na.rm = TRUE), | |
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE), | |
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE) | |
), by = .(Display_Label, MAF_Category)] | |
cat("\nSummary by", display_label, "and MAF category:\n") | |
print(summary_stats) | |
# Create plot title with filtering info | |
plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups") | |
if (exclude_perfect_r2) { | |
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)") | |
} | |
# Check how many files we have to determine the best layout | |
n_groups <- length(unique(data$Display_Label)) | |
# For many files, use different colors for each file instead of MAF categories | |
if (n_groups > 6) { | |
# Use RColorBrewer or rainbow colors for many files | |
colors <- rainbow(n_groups) | |
names(colors) <- levels(data$Display_Label) | |
# Create violin plot with files as fill aesthetic | |
p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = Display_Label)) + | |
geom_violin(scale = "width", trim = FALSE, alpha = 0.7) + | |
geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5, fill = "white") + | |
facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) + | |
scale_fill_manual(values = colors) + | |
labs( | |
title = "Distribution of Imputation Quality (R²) by MAF Categories", | |
subtitle = plot_subtitle, | |
x = display_label, | |
y = "R² (Imputation Quality)", | |
fill = display_label | |
) + | |
theme_minimal() + | |
theme( | |
plot.title = element_text(size = 14, face = "bold"), | |
plot.subtitle = element_text(size = 12), | |
strip.text = element_text(size = 11, face = "bold"), | |
legend.position = "none", # Too many files for legend | |
axis.text.x = element_text(angle = 45, hjust = 1, size = 8) | |
) + | |
ylim(0, 1) | |
} else { | |
# For fewer files, keep MAF categories as colors | |
p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = MAF_Category)) + | |
geom_violin(scale = "width", trim = FALSE, alpha = 0.7) + | |
geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5) + | |
facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) + | |
scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57", | |
"MAF [0.005, 0.05]" = "#4682B4", | |
"MAF < 0.005" = "#CD5C5C")) + | |
labs( | |
title = "Distribution of Imputation Quality (R²) by MAF Categories", | |
subtitle = plot_subtitle, | |
x = display_label, | |
y = "R² (Imputation Quality)", | |
fill = "MAF Category" | |
) + | |
theme_minimal() + | |
theme( | |
plot.title = element_text(size = 14, face = "bold"), | |
plot.subtitle = element_text(size = 12), | |
strip.text = element_text(size = 11, face = "bold"), | |
legend.position = "bottom", | |
axis.text.x = element_text(angle = 45, hjust = 1, size = 8) | |
) + | |
ylim(0, 1) | |
} | |
return(list(plot = p, summary = summary_stats, data = data)) | |
} | |
# Alternative plotting function for side-by-side comparison | |
plot_violin_sidebyside_multifile <- function(data, exclude_perfect_r2 = FALSE) { | |
cat("Creating side-by-side violin plots by MAF categories...\n") | |
# Convert to data.table if not already | |
if (!is.data.table(data)) { | |
data <- as.data.table(data) | |
} | |
# Filter out R2 == 1 if requested | |
if (exclude_perfect_r2) { | |
original_count <- nrow(data) | |
data <- data[R2 != 1.0] | |
filtered_count <- nrow(data) | |
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n") | |
cat("Remaining variants:", filtered_count, "\n") | |
} | |
# Create MAF categories using data.table syntax (very fast) | |
data[, MAF_Category := fcase( | |
MAF > 0.05, "MAF > 0.05", | |
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]", | |
MAF < 0.005, "MAF < 0.005", | |
default = "Other" | |
)] | |
# Set factor levels | |
data[, MAF_Category := factor(MAF_Category, | |
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))] | |
# FIXED: Always use Ancestry column if it exists, with proper factor ordering | |
if ("Ancestry" %in% names(data)) { | |
display_label <- "Ancestry" | |
# Convert to factor to control ordering in plots | |
data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))] | |
cat("Using Ancestry labels for plotting\n") | |
} else { | |
display_label <- "File" | |
# Create a cleaner file name for plotting (remove extensions and shorten if needed) | |
data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)] | |
# If file names are very long, keep only the most distinctive part | |
if (any(nchar(unique(data$Display_Label)) > 15)) { | |
data[, Display_Label := substr(Display_Label, 1, 15)] | |
} | |
# Convert to factor | |
data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))] | |
} | |
# Fast summary statistics using data.table | |
summary_stats <- data[!is.na(MAF_Category), .( | |
Count = .N, | |
Mean_R2 = mean(R2, na.rm = TRUE), | |
Median_R2 = median(R2, na.rm = TRUE), | |
Min_R2 = min(R2, na.rm = TRUE), | |
Max_R2 = max(R2, na.rm = TRUE), | |
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE), | |
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE) | |
), by = .(Display_Label, MAF_Category)] | |
cat("\nSummary by", display_label, "and MAF category:\n") | |
print(summary_stats) | |
# Create plot title with filtering info | |
plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups") | |
if (exclude_perfect_r2) { | |
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)") | |
} | |
# Check how many groups we have | |
n_groups <- length(unique(data$Display_Label)) | |
# Generate colors for groups using the levels from the factor | |
if (n_groups <= 10) { | |
# Use Set3 palette for up to 10 files (more color-blind friendly) | |
library(RColorBrewer) | |
group_colors <- brewer.pal(min(n_groups, 12), "Set3")[1:n_groups] | |
} else { | |
# Use rainbow for many files | |
group_colors <- rainbow(n_groups) | |
} | |
names(group_colors) <- levels(data$Display_Label) | |
# Create side-by-side violin plot (MAF categories as x-axis, groups as colors) | |
p <- ggplot(data[!is.na(MAF_Category)], aes(x = MAF_Category, y = R2, fill = Display_Label)) + | |
geom_violin(scale = "width", trim = FALSE, alpha = 0.7, position = position_dodge(width = 0.8)) + | |
geom_boxplot(width = 0.1, alpha = 0.8, position = position_dodge(width = 0.8), | |
outlier.size = 0.3, color = "black") + | |
scale_fill_manual(values = group_colors) + | |
labs( | |
title = "Distribution of Imputation Quality (R²) by MAF Categories", | |
subtitle = plot_subtitle, | |
x = "MAF Category", | |
y = "R² (Imputation Quality)", | |
fill = display_label | |
) + | |
theme_minimal() + | |
theme( | |
plot.title = element_text(size = 14, face = "bold"), | |
plot.subtitle = element_text(size = 12), | |
legend.position = if(n_groups <= 8) "bottom" else "right", | |
legend.title = element_text(size = 10), | |
legend.text = element_text(size = 8), | |
axis.text.x = element_text(angle = 0, hjust = 0.5, size = 10), | |
axis.text.y = element_text(size = 10) | |
) + | |
ylim(0, 1) | |
# If too many groups, split legend into multiple columns | |
if (n_groups > 8) { | |
p <- p + guides(fill = guide_legend(ncol = ceiling(n_groups/10))) | |
} | |
return(list(plot = p, summary = summary_stats, data = data)) | |
} | |
# Keep the original single-file plotting function for backward compatibility | |
plot_r2_by_maf <- function(data, exclude_perfect_r2 = FALSE) { | |
cat("Creating MAF categories and calculating statistics...\n") | |
# Convert to data.table if not already | |
if (!is.data.table(data)) { | |
data <- as.data.table(data) | |
} | |
# Filter out R2 == 1 if requested | |
if (exclude_perfect_r2) { | |
original_count <- nrow(data) | |
data <- data[R2 != 1.0] | |
filtered_count <- nrow(data) | |
cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n") | |
cat("Remaining variants:", filtered_count, "\n") | |
} | |
# Create MAF categories using data.table syntax (very fast) | |
data[, MAF_Category := fcase( | |
MAF > 0.05, "MAF > 0.05", | |
MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]", | |
MAF < 0.005, "MAF < 0.005", | |
default = "Other" | |
)] | |
# Set factor levels | |
data[, MAF_Category := factor(MAF_Category, | |
levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))] | |
# Fast summary statistics using data.table | |
summary_stats <- data[!is.na(MAF_Category), .( | |
Count = .N, | |
Mean_R2 = mean(R2, na.rm = TRUE), | |
Median_R2 = median(R2, na.rm = TRUE), | |
Min_R2 = min(R2, na.rm = TRUE), | |
Max_R2 = max(R2, na.rm = TRUE), | |
Q25_R2 = quantile(R2, 0.25, na.rm = TRUE), | |
Q75_R2 = quantile(R2, 0.75, na.rm = TRUE) | |
), by = MAF_Category] | |
cat("\nSummary by MAF category:\n") | |
print(summary_stats) | |
# Create plot title with filtering info | |
plot_subtitle <- "Imputation Results" | |
if (exclude_perfect_r2) { | |
plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)") | |
} | |
# Create the plot using data.table (ggplot2 works well with data.table) | |
p <- ggplot(data[!is.na(MAF_Category)], aes(x = R2, fill = MAF_Category)) + | |
geom_histogram(bins = 50, alpha = 0.7, position = "identity") + | |
facet_wrap(~MAF_Category, scales = "free_y", ncol = 1) + | |
scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57", | |
"MAF [0.005, 0.05]" = "#4682B4", | |
"MAF < 0.005" = "#CD5C5C")) + | |
labs( | |
title = "Distribution of Imputation Quality (R²) by MAF Categories", | |
subtitle = plot_subtitle, | |
x = "R² (Imputation Quality)", | |
y = "Count", | |
fill = "MAF Category" | |
) + | |
theme_minimal() + | |
theme( | |
plot.title = element_text(size = 14, face = "bold"), | |
plot.subtitle = element_text(size = 12), | |
strip.text = element_text(size = 11, face = "bold"), | |
legend.position = "bottom" | |
) + | |
xlim(0, 1) | |
return(list(plot = p, summary = summary_stats, data = data)) | |
} | |
# Main function that takes command line arguments | |
main <- function() { | |
# Get command line arguments | |
args <- commandArgs(trailingOnly = TRUE) | |
# Check if file path is provided | |
if (length(args) == 0) { | |
cat("Usage: Rscript topmed_processor.R <file_or_list> [options]\n") | |
cat("Examples:\n") | |
cat(" # Single file\n") | |
cat(" Rscript topmed_processor.R chr20.info.gz\n") | |
cat(" \n") | |
cat(" # Multiple files with ancestry labels\n") | |
cat(" Rscript topmed_processor.R chr20.info.gz chr21.info.gz --ancestry='EUR,AFR' --sidebyside\n") | |
cat(" \n") | |
cat(" # Multiple files from a list file\n") | |
cat(" Rscript topmed_processor.R --filelist=file_list.txt\n") | |
cat(" \n") | |
cat(" # With options\n") | |
cat(" Rscript topmed_processor.R chr*.info.gz --exclude-perfect --violin\n") | |
cat("\nOptions:\n") | |
cat(" --filelist=<file> Read file paths from a text file (one per line)\n") | |
cat(" --ancestry=<list> Comma-separated ancestry labels (e.g., 'EUR,AFR,EAS')\n") | |
cat(" --violin Create violin plots (automatically enabled for multiple files)\n") | |
cat(" --sidebyside Create side-by-side violin plots (MAF categories on x-axis)\n") | |
cat(" --histogram Force histogram plots even for multiple files\n") | |
cat(" --exclude-perfect Filter out variants with R² = 1.0 before plotting\n") | |
cat(" --type=<format> Force file format (auto, topmed, tabdelim)\n") | |
cat("\nFile types:\n") | |
cat(" auto (default) - Auto-detect based on file extension\n") | |
cat(" topmed - TopMed .info format with INFO column containing MAF= and R2=\n") | |
cat(" tabdelim - Tab-delimited with separate MAF and R2 columns\n") | |
cat("\nFor tab-delimited files, script looks for columns named:\n") | |
cat(" MAF: all_maf, maf, MAF, ALL_MAF, allele_freq, AF\n") | |
cat(" R2: info, r2, R2, INFO, rsq, RSQ, imputation_quality\n") | |
stop("Please provide file path(s) as arguments") | |
} | |
# Parse arguments | |
file_list_arg <- grep("^--filelist=", args, value = TRUE) | |
ancestry_arg <- grep("^--ancestry=", args, value = TRUE) | |
exclude_perfect_r2 <- "--exclude-perfect" %in% args | |
force_violin <- "--violin" %in% args | |
force_sidebyside <- "--sidebyside" %in% args | |
force_histogram <- "--histogram" %in% args | |
type_arg <- grep("^--type=", args, value = TRUE) | |
# Get file type if specified | |
file_type <- "auto" | |
if (length(type_arg) > 0) { | |
file_type <- gsub("^--type=", "", type_arg[1]) | |
} | |
# FIXED: Get ancestry labels if specified - handle both quoted and unquoted strings | |
ancestry_labels <- NULL | |
if (length(ancestry_arg) > 0) { | |
ancestry_string <- gsub("^--ancestry=", "", ancestry_arg[1]) | |
# Remove quotes if present | |
ancestry_string <- gsub("^['\"]|['\"]$", "", ancestry_string) | |
ancestry_labels <- trimws(strsplit(ancestry_string, ",")[[1]]) | |
cat("Parsed ancestry labels:", paste(ancestry_labels, collapse = ", "), "\n") | |
} | |
# Remove option arguments to get file paths | |
file_args <- args[!grepl("^--", args)] | |
# Determine file list | |
if (length(file_list_arg) > 0) { | |
# Read from file list | |
list_file <- gsub("^--filelist=", "", file_list_arg[1]) | |
if (!file.exists(list_file)) { | |
stop(paste("File list not found:", list_file)) | |
} | |
file_paths <- readLines(list_file) | |
file_paths <- file_paths[file_paths != "" & !grepl("^#", file_paths)] # Remove empty lines and comments | |
cat("Read", length(file_paths), "file paths from", list_file, "\n") | |
} else { | |
# Use command line arguments | |
file_paths <- file_args | |
} | |
if (length(file_paths) == 0) { | |
stop("No file paths provided") | |
} | |
cat("File paths to process:\n") | |
for (i in seq_along(file_paths)) { | |
cat(" ", i, ":", file_paths[i], "\n") | |
} | |
# Check files exist | |
missing_files <- file_paths[!file.exists(file_paths)] | |
if (length(missing_files) > 0) { | |
cat("Warning: The following files were not found:\n") | |
for (f in missing_files) { | |
cat(" ", f, "\n") | |
} | |
file_paths <- file_paths[file.exists(file_paths)] | |
} | |
if (length(file_paths) == 0) { | |
stop("No valid files found") | |
} | |
# FIXED: Check ancestry labels if provided with better error messaging | |
if (!is.null(ancestry_labels)) { | |
if (length(ancestry_labels) != length(file_paths)) { | |
cat("ERROR: Number of ancestry labels (", length(ancestry_labels), | |
") must match number of files (", length(file_paths), ")\n", sep="") | |
cat("Ancestry labels provided: ", paste(ancestry_labels, collapse = ", "), "\n") | |
cat("Files found: ", length(file_paths), "\n") | |
for (i in seq_along(file_paths)) { | |
cat(" File", i, ":", basename(file_paths[i]), "\n") | |
} | |
stop("Mismatch between number of ancestry labels and files") | |
} | |
cat("Ancestry labels provided:\n") | |
for (i in seq_along(file_paths)) { | |
cat(" ", basename(file_paths[i]), " -> ", ancestry_labels[i], "\n") | |
} | |
} | |
cat("Processing", length(file_paths), "file(s):\n") | |
for (f in file_paths) { | |
cat(" ", f, "\n") | |
} | |
if (exclude_perfect_r2) { | |
cat("Will exclude variants with R² = 1.0 from plots\n") | |
} | |
# Start timing | |
start_time <- Sys.time() | |
# Process files | |
if (length(file_paths) == 1 && !force_violin) { | |
# Single file processing (original functionality) | |
cat("\n=== Single file processing ===\n") | |
imputation_data <- process_file(file_paths[1], file_type = file_type) | |
# Use original plotting function for single files (unless violin forced) | |
results <- plot_r2_by_maf(imputation_data, exclude_perfect_r2 = exclude_perfect_r2) | |
# Create output filenames | |
base_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_paths[1])) | |
} else { | |
# Multiple file processing | |
cat("\n=== Multiple file processing ===\n") | |
# Process all files | |
combined_data <- process_multiple_files(file_paths, ancestry_labels) | |
# FIXED: Debug output to verify ancestry labels are correctly applied | |
if ("Ancestry" %in% names(combined_data)) { | |
cat("\nFinal ancestry assignments in combined data:\n") | |
ancestry_check <- combined_data[, .(Count = .N), by = .(File = basename(File), Ancestry)] | |
print(ancestry_check) | |
} | |
# Create plots based on options | |
if (force_histogram) { | |
results <- plot_r2_by_maf(combined_data, exclude_perfect_r2 = exclude_perfect_r2) | |
plot_type <- "histogram" | |
} else if (force_sidebyside) { | |
results <- plot_violin_sidebyside_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2) | |
plot_type <- "violin_sidebyside" | |
} else { | |
# Default: stacked violin plots (original layout) | |
results <- plot_violin_by_maf_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2) | |
plot_type <- "violin" | |
} | |
# Create output filename based on ancestry labels or number of files | |
if (!is.null(ancestry_labels)) { | |
base_name <- paste0("combined_", paste(ancestry_labels, collapse = "_")) | |
} else { | |
base_name <- paste0("combined_", length(file_paths), "_files") | |
} | |
} | |
# End timing | |
end_time <- Sys.time() | |
processing_time <- end_time - start_time | |
cat("\nTotal processing time:", round(as.numeric(processing_time), 2), | |
attr(processing_time, "units"), "\n") | |
# Display the plot | |
print(results$plot) | |
# Add suffix if filtering perfect R2 | |
suffix <- "" | |
if (exclude_perfect_r2) { | |
suffix <- "_no_perfect_r2" | |
} | |
# Add plot type suffix | |
if (length(file_paths) > 1 && !force_histogram) { | |
plot_type_suffix <- if (force_sidebyside) "violin_sidebyside" else "violin" | |
} else { | |
plot_type_suffix <- "histogram" | |
} | |
plot_filename <- paste0(base_name, "_r2_", plot_type_suffix, "_by_maf", suffix, ".png") | |
data_filename <- paste0(base_name, "_processed_data", suffix, ".csv") | |
# Save the plot | |
plot_width <- if (length(file_paths) > 3) max(10, length(file_paths) * 2) else 10 | |
ggsave(plot_filename, | |
plot = results$plot, | |
width = plot_width, height = 8, | |
dpi = 300) | |
cat("Plot saved as:", plot_filename, "\n") | |
# Save processed data using fwrite (much faster than write.csv) | |
fwrite(results$data, data_filename) | |
cat("Processed data saved as:", data_filename, "\n") | |
# Print final statistics | |
total_variants <- nrow(results$data) | |
cat("\nFinal Statistics:\n") | |
cat("Total variants processed:", total_variants, "\n") | |
if (length(file_paths) > 1) { | |
cat("Files processed:", length(file_paths), "\n") | |
cat("Average variants per file:", round(total_variants / length(file_paths), 0), "\n") | |
if ("Ancestry" %in% names(results$data)) { | |
cat("Ancestry groups:", paste(unique(results$data$Ancestry), collapse = ", "), "\n") | |
} | |
} | |
cat("Processing speed:", round(total_variants / as.numeric(processing_time), 0), | |
"variants per", attr(processing_time, "units"), "\n") | |
} | |
# Run main function | |
if (!interactive()) { | |
main() | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment