shukwong/plot_R2.R

## plot_R2.R
# TopMed Imputation .info File Processor - Fixed Version
# Fast processing using data.table and vectorized operations

library(data.table)
library(ggplot2)
library(dplyr)
library(stringr)

# Function to process different file formats
process_file <- function(file_path, file_type = "auto") {

  cat("Reading file:", file_path, "\n")

  # Auto-detect file type if not specified
  if (file_type == "auto") {
    # Check file extension and structure
    if (grepl("\\.(info|info\\.gz|vcf|vcf\\.gz)$", file_path)) {
      file_type <- "topmed"
      cat("Auto-detected: TopMed imputation format\n")
    } else {
      file_type <- "tabdelim"
      cat("Auto-detected: Tab-delimited format\n")
    }
  }

  if (file_type == "topmed") {
    return(process_topmed_info(file_path))
  } else {
    return(process_tabdelim_file(file_path))
  }
}

# Fast function to process TopMed .info file
process_topmed_info <- function(file_path) {

  cat("Processing TopMed format file...\n")

  # First, count lines starting with ## to skip them
  cat("Scanning for header and comment lines...\n")

  # Open connection to read line by line
  if (grepl("\\.gz$", file_path)) {
    con <- gzfile(file_path, "rt")
  } else {
    con <- file(file_path, "rt")
  }

  skip_lines <- 0
  line_num <- 0
  header_found <- FALSE

  while (TRUE) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) break

    line_num <- line_num + 1

    # Count lines that start with ## (these need to be skipped)
    if (grepl("^##", line)) {
      skip_lines <- skip_lines + 1
    }
    # Check if this is the header line (starts with #CHROM)
    else if (grepl("^#CHROM", line)) {
      header_found <- TRUE
      cat("Found", skip_lines, "comment lines (##) to skip\n")
      cat("Found header (#CHROM) at line", line_num, "\n")
      break
    }
    # If we hit a line that doesn't start with # at all, we've gone too far
    else if (!grepl("^#", line)) {
      break
    }

    # Safety check - don't read too many lines looking for header
    if (line_num > 1000) {
      close(con)
      stop("Header line starting with #CHROM not found in first 1000 lines")
    }
  }

  close(con)

  if (!header_found) {
    stop("Header line starting with #CHROM not found")
  }

  cat("Loading data with fread, skipping", skip_lines, "comment lines...\n")

  # Now use fread with the correct skip number (skip only the ## lines)
  dt <- fread(
    file_path,
    skip = skip_lines,  # Skip only the ## comment lines
    header = TRUE,      # The #CHROM line will be treated as header
    sep = "\t",
    stringsAsFactors = FALSE,
    showProgress = TRUE
  )

  cat("Loaded", nrow(dt), "variants\n")
  cat("Extracting MAF and R2 values from INFO column...\n")

  # Extract MAF and R2 using vectorized string operations
  # Much faster than loop-based extraction

  # Extract MAF values using regex
  maf_extracted <- str_extract(dt$INFO, "MAF=([0-9.e-]+)")
  maf_values <- as.numeric(gsub("MAF=", "", maf_extracted))

  # Extract R2 values using regex
  r2_extracted <- str_extract(dt$INFO, "R2=([0-9.e-]+)")
  r2_values <- as.numeric(gsub("R2=", "", r2_extracted))

  # Create result data.table with only non-missing values
  result_dt <- data.table(
    MAF = maf_values,
    R2 = r2_values
  )

  # Remove rows with missing MAF or R2
  result_dt <- result_dt[!is.na(MAF) & !is.na(R2)]

  cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n")

  return(result_dt)
}

# Function to process tab-delimited files with direct MAF and R2 columns
process_tabdelim_file <- function(file_path) {

  cat("Processing tab-delimited file...\n")

  # Read the file directly with fread
  dt <- fread(
    file_path,
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE,
    showProgress = TRUE
  )

  cat("Loaded", nrow(dt), "rows\n")
  cat("Columns found:", paste(names(dt), collapse = ", "), "\n")

  # Check for required columns
  maf_col <- NULL
  r2_col <- NULL

  # Look for MAF column (case-insensitive)
  maf_candidates <- c("all_maf", "maf", "MAF", "ALL_MAF", "allele_freq", "AF")
  for (col in maf_candidates) {
    if (col %in% names(dt)) {
      maf_col <- col
      break
    }
  }

  # Look for R2 column (case-insensitive)
  r2_candidates <- c("info", "r2", "R2", "INFO", "rsq", "RSQ", "imputation_quality")
  for (col in r2_candidates) {
    if (col %in% names(dt)) {
      r2_col <- col
      break
    }
  }

  if (is.null(maf_col)) {
    stop("MAF column not found. Looking for: ", paste(maf_candidates, collapse = ", "))
  }

  if (is.null(r2_col)) {
    stop("R2 column not found. Looking for: ", paste(r2_candidates, collapse = ", "))
  }

  cat("Using MAF column:", maf_col, "\n")
  cat("Using R2 column:", r2_col, "\n")

  # Extract the values
  result_dt <- data.table(
    MAF = as.numeric(dt[[maf_col]]),
    R2 = as.numeric(dt[[r2_col]])
  )

  # Remove rows with missing MAF or R2
  result_dt <- result_dt[!is.na(MAF) & !is.na(R2)]

  cat("Extracted", nrow(result_dt), "variants with valid MAF and R2 values\n")

  return(result_dt)
}

# Function to process multiple files
process_multiple_files <- function(file_list, ancestry_labels = NULL) {

  cat("Processing", length(file_list), "files...\n")

  all_data <- list()

  for (i in seq_along(file_list)) {
    file_path <- file_list[i]
    cat("\n--- Processing file", i, "of", length(file_list), ":", basename(file_path), "---\n")

    if (!file.exists(file_path)) {
      cat("Warning: File not found:", file_path, "- skipping\n")
      next
    }

    # Process the file
    file_data <- process_file(file_path)

    if (nrow(file_data) == 0) {
      cat("Warning: No valid data found in", basename(file_path), "- skipping\n")
      next
    }

    # Add file identifier and ancestry label
    file_data$File <- basename(file_path)

    # Use ancestry label if provided, otherwise use cleaned file name
    if (!is.null(ancestry_labels) && i <= length(ancestry_labels)) {
      file_data$Ancestry <- ancestry_labels[i]
      cat("Assigned ancestry label:", ancestry_labels[i], "to file:", basename(file_path), "\n")
    } else {
      # Create a cleaner file name (remove extensions)
      clean_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_path))
      file_data$Ancestry <- clean_name
      cat("No ancestry label provided, using cleaned filename:", clean_name, "\n")
    }

    # Store the data
    all_data[[i]] <- file_data
  }

  # Combine all data
  if (length(all_data) == 0) {
    stop("No valid data found in any files")
  }

  combined_data <- rbindlist(all_data, fill = TRUE)

  cat("\nCombined data summary:\n")
  cat("Total variants:", nrow(combined_data), "\n")
  cat("Files processed:", length(unique(combined_data$File)), "\n")

  # Check if Ancestry column exists and show its values
  if ("Ancestry" %in% names(combined_data)) {
    cat("Ancestries found:", paste(unique(combined_data$Ancestry), collapse = ", "), "\n")
  } else {
    cat("Warning: No Ancestry column found in combined data!\n")
  }

  # Print per-ancestry summary
  if ("Ancestry" %in% names(combined_data)) {
    ancestry_summary <- combined_data[, .(Count = .N), by = Ancestry]
    print(ancestry_summary)
  }

  return(combined_data)
}

# Function to create violin plots for multiple files
plot_violin_by_maf_multifile <- function(data, exclude_perfect_r2 = FALSE) {

  cat("Creating MAF categories and violin plots...\n")

  # Convert to data.table if not already
  if (!is.data.table(data)) {
    data <- as.data.table(data)
  }

  # Debug: Check what columns we have
  cat("Available columns in data:", paste(names(data), collapse = ", "), "\n")
  if ("Ancestry" %in% names(data)) {
    cat("Ancestry values found:", paste(unique(data$Ancestry), collapse = ", "), "\n")
  }

  # Filter out R2 == 1 if requested
  if (exclude_perfect_r2) {
    original_count <- nrow(data)
    data <- data[R2 != 1.0]
    filtered_count <- nrow(data)
    cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
    cat("Remaining variants:", filtered_count, "\n")
  }

  # Create MAF categories using data.table syntax (very fast)
  data[, MAF_Category := fcase(
    MAF > 0.05, "MAF > 0.05",
    MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
    MAF < 0.005, "MAF < 0.005",
    default = "Other"
  )]

  # Set factor levels
  data[, MAF_Category := factor(MAF_Category,
                               levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]

  # FIXED: Always use Ancestry column if it exists, with proper factor ordering
  if ("Ancestry" %in% names(data)) {
    display_label <- "Ancestry"
    # Convert to factor to control ordering in plots
    data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))]
    cat("Using Ancestry labels for plotting\n")
  } else {
    display_label <- "File"
    # Create a cleaner file name for plotting (remove extensions and shorten if needed)
    data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)]

    # If file names are very long, keep only the most distinctive part
    if (any(nchar(unique(data$Display_Label)) > 15)) {
      data[, Display_Label := substr(Display_Label, 1, 15)]
    }
    # Convert to factor
    data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))]
    cat("Using File names for plotting\n")
  }

  cat("Display labels:", paste(levels(data$Display_Label), collapse = ", "), "\n")

  # Fast summary statistics using data.table
  summary_stats <- data[!is.na(MAF_Category), .(
    Count = .N,
    Mean_R2 = mean(R2, na.rm = TRUE),
    Median_R2 = median(R2, na.rm = TRUE),
    Min_R2 = min(R2, na.rm = TRUE),
    Max_R2 = max(R2, na.rm = TRUE),
    Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
    Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
  ), by = .(Display_Label, MAF_Category)]

  cat("\nSummary by", display_label, "and MAF category:\n")
  print(summary_stats)

  # Create plot title with filtering info
  plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups")
  if (exclude_perfect_r2) {
    plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
  }

  # Check how many files we have to determine the best layout
  n_groups <- length(unique(data$Display_Label))

  # For many files, use different colors for each file instead of MAF categories
  if (n_groups > 6) {
    # Use RColorBrewer or rainbow colors for many files
    colors <- rainbow(n_groups)
    names(colors) <- levels(data$Display_Label)

    # Create violin plot with files as fill aesthetic
    p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = Display_Label)) +
      geom_violin(scale = "width", trim = FALSE, alpha = 0.7) +
      geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5, fill = "white") +
      facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) +
      scale_fill_manual(values = colors) +
      labs(
        title = "Distribution of Imputation Quality (R²) by MAF Categories",
        subtitle = plot_subtitle,
        x = display_label,
        y = "R² (Imputation Quality)",
        fill = display_label
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 11, face = "bold"),
        legend.position = "none",  # Too many files for legend
        axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
      ) +
      ylim(0, 1)

  } else {
    # For fewer files, keep MAF categories as colors
    p <- ggplot(data[!is.na(MAF_Category)], aes(x = Display_Label, y = R2, fill = MAF_Category)) +
      geom_violin(scale = "width", trim = FALSE, alpha = 0.7) +
      geom_boxplot(width = 0.1, alpha = 0.5, outlier.size = 0.5) +
      facet_wrap(~MAF_Category, scales = "free_x", ncol = 1) +
      scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57",
                                  "MAF [0.005, 0.05]" = "#4682B4",
                                  "MAF < 0.005" = "#CD5C5C")) +
      labs(
        title = "Distribution of Imputation Quality (R²) by MAF Categories",
        subtitle = plot_subtitle,
        x = display_label,
        y = "R² (Imputation Quality)",
        fill = "MAF Category"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(size = 14, face = "bold"),
        plot.subtitle = element_text(size = 12),
        strip.text = element_text(size = 11, face = "bold"),
        legend.position = "bottom",
        axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
      ) +
      ylim(0, 1)
  }

  return(list(plot = p, summary = summary_stats, data = data))
}

# Alternative plotting function for side-by-side comparison
plot_violin_sidebyside_multifile <- function(data, exclude_perfect_r2 = FALSE) {

  cat("Creating side-by-side violin plots by MAF categories...\n")

  # Convert to data.table if not already
  if (!is.data.table(data)) {
    data <- as.data.table(data)
  }

  # Filter out R2 == 1 if requested
  if (exclude_perfect_r2) {
    original_count <- nrow(data)
    data <- data[R2 != 1.0]
    filtered_count <- nrow(data)
    cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
    cat("Remaining variants:", filtered_count, "\n")
  }

  # Create MAF categories using data.table syntax (very fast)
  data[, MAF_Category := fcase(
    MAF > 0.05, "MAF > 0.05",
    MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
    MAF < 0.005, "MAF < 0.005",
    default = "Other"
  )]

  # Set factor levels
  data[, MAF_Category := factor(MAF_Category,
                               levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]

  # FIXED: Always use Ancestry column if it exists, with proper factor ordering
  if ("Ancestry" %in% names(data)) {
    display_label <- "Ancestry"
    # Convert to factor to control ordering in plots
    data[, Display_Label := factor(Ancestry, levels = unique(Ancestry))]
    cat("Using Ancestry labels for plotting\n")
  } else {
    display_label <- "File"
    # Create a cleaner file name for plotting (remove extensions and shorten if needed)
    data[, Display_Label := gsub("\\.(info|info\\.gz|txt|tsv)$", "", File)]

    # If file names are very long, keep only the most distinctive part
    if (any(nchar(unique(data$Display_Label)) > 15)) {
      data[, Display_Label := substr(Display_Label, 1, 15)]
    }
    # Convert to factor
    data[, Display_Label := factor(Display_Label, levels = unique(Display_Label))]
  }

  # Fast summary statistics using data.table
  summary_stats <- data[!is.na(MAF_Category), .(
    Count = .N,
    Mean_R2 = mean(R2, na.rm = TRUE),
    Median_R2 = median(R2, na.rm = TRUE),
    Min_R2 = min(R2, na.rm = TRUE),
    Max_R2 = max(R2, na.rm = TRUE),
    Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
    Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
  ), by = .(Display_Label, MAF_Category)]

  cat("\nSummary by", display_label, "and MAF category:\n")
  print(summary_stats)

  # Create plot title with filtering info
  plot_subtitle <- paste("Comparison across", length(unique(data$Display_Label)), display_label, "groups")
  if (exclude_perfect_r2) {
    plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
  }

  # Check how many groups we have
  n_groups <- length(unique(data$Display_Label))

  # Generate colors for groups using the levels from the factor
  if (n_groups <= 10) {
    # Use Set3 palette for up to 10 files (more color-blind friendly)
    library(RColorBrewer)
    group_colors <- brewer.pal(min(n_groups, 12), "Set3")[1:n_groups]
  } else {
    # Use rainbow for many files
    group_colors <- rainbow(n_groups)
  }
  names(group_colors) <- levels(data$Display_Label)

  # Create side-by-side violin plot (MAF categories as x-axis, groups as colors)
  p <- ggplot(data[!is.na(MAF_Category)], aes(x = MAF_Category, y = R2, fill = Display_Label)) +
    geom_violin(scale = "width", trim = FALSE, alpha = 0.7, position = position_dodge(width = 0.8)) +
    geom_boxplot(width = 0.1, alpha = 0.8, position = position_dodge(width = 0.8),
                 outlier.size = 0.3, color = "black") +
    scale_fill_manual(values = group_colors) +
    labs(
      title = "Distribution of Imputation Quality (R²) by MAF Categories",
      subtitle = plot_subtitle,
      x = "MAF Category",
      y = "R² (Imputation Quality)",
      fill = display_label
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      plot.subtitle = element_text(size = 12),
      legend.position = if(n_groups <= 8) "bottom" else "right",
      legend.title = element_text(size = 10),
      legend.text = element_text(size = 8),
      axis.text.x = element_text(angle = 0, hjust = 0.5, size = 10),
      axis.text.y = element_text(size = 10)
    ) +
    ylim(0, 1)

  # If too many groups, split legend into multiple columns
  if (n_groups > 8) {
    p <- p + guides(fill = guide_legend(ncol = ceiling(n_groups/10)))
  }

  return(list(plot = p, summary = summary_stats, data = data))
}

# Keep the original single-file plotting function for backward compatibility
plot_r2_by_maf <- function(data, exclude_perfect_r2 = FALSE) {

  cat("Creating MAF categories and calculating statistics...\n")

  # Convert to data.table if not already
  if (!is.data.table(data)) {
    data <- as.data.table(data)
  }

  # Filter out R2 == 1 if requested
  if (exclude_perfect_r2) {
    original_count <- nrow(data)
    data <- data[R2 != 1.0]
    filtered_count <- nrow(data)
    cat("Filtered out", (original_count - filtered_count), "variants with R2 = 1.0\n")
    cat("Remaining variants:", filtered_count, "\n")
  }

  # Create MAF categories using data.table syntax (very fast)
  data[, MAF_Category := fcase(
    MAF > 0.05, "MAF > 0.05",
    MAF >= 0.005 & MAF <= 0.05, "MAF [0.005, 0.05]",
    MAF < 0.005, "MAF < 0.005",
    default = "Other"
  )]

  # Set factor levels
  data[, MAF_Category := factor(MAF_Category,
                               levels = c("MAF > 0.05", "MAF [0.005, 0.05]", "MAF < 0.005"))]

  # Fast summary statistics using data.table
  summary_stats <- data[!is.na(MAF_Category), .(
    Count = .N,
    Mean_R2 = mean(R2, na.rm = TRUE),
    Median_R2 = median(R2, na.rm = TRUE),
    Min_R2 = min(R2, na.rm = TRUE),
    Max_R2 = max(R2, na.rm = TRUE),
    Q25_R2 = quantile(R2, 0.25, na.rm = TRUE),
    Q75_R2 = quantile(R2, 0.75, na.rm = TRUE)
  ), by = MAF_Category]

  cat("\nSummary by MAF category:\n")
  print(summary_stats)

  # Create plot title with filtering info
  plot_subtitle <- "Imputation Results"
  if (exclude_perfect_r2) {
    plot_subtitle <- paste(plot_subtitle, "(excluding R² = 1.0)")
  }

  # Create the plot using data.table (ggplot2 works well with data.table)
  p <- ggplot(data[!is.na(MAF_Category)], aes(x = R2, fill = MAF_Category)) +
    geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
    facet_wrap(~MAF_Category, scales = "free_y", ncol = 1) +
    scale_fill_manual(values = c("MAF > 0.05" = "#2E8B57",
                                "MAF [0.005, 0.05]" = "#4682B4",
                                "MAF < 0.005" = "#CD5C5C")) +
    labs(
      title = "Distribution of Imputation Quality (R²) by MAF Categories",
      subtitle = plot_subtitle,
      x = "R² (Imputation Quality)",
      y = "Count",
      fill = "MAF Category"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      plot.subtitle = element_text(size = 12),
      strip.text = element_text(size = 11, face = "bold"),
      legend.position = "bottom"
    ) +
    xlim(0, 1)

  return(list(plot = p, summary = summary_stats, data = data))
}

# Main function that takes command line arguments
main <- function() {
  # Get command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Check if file path is provided
  if (length(args) == 0) {
    cat("Usage: Rscript topmed_processor.R <file_or_list> [options]\n")
    cat("Examples:\n")
    cat("  # Single file\n")
    cat("  Rscript topmed_processor.R chr20.info.gz\n")
    cat("  \n")
    cat("  # Multiple files with ancestry labels\n")
    cat("  Rscript topmed_processor.R chr20.info.gz chr21.info.gz --ancestry='EUR,AFR' --sidebyside\n")
    cat("  \n")
    cat("  # Multiple files from a list file\n")
    cat("  Rscript topmed_processor.R --filelist=file_list.txt\n")
    cat("  \n")
    cat("  # With options\n")
    cat("  Rscript topmed_processor.R chr*.info.gz --exclude-perfect --violin\n")
    cat("\nOptions:\n")
    cat("  --filelist=<file>  Read file paths from a text file (one per line)\n")
    cat("  --ancestry=<list>  Comma-separated ancestry labels (e.g., 'EUR,AFR,EAS')\n")
    cat("  --violin           Create violin plots (automatically enabled for multiple files)\n")
    cat("  --sidebyside       Create side-by-side violin plots (MAF categories on x-axis)\n")
    cat("  --histogram        Force histogram plots even for multiple files\n")
    cat("  --exclude-perfect  Filter out variants with R² = 1.0 before plotting\n")
    cat("  --type=<format>    Force file format (auto, topmed, tabdelim)\n")
    cat("\nFile types:\n")
    cat("  auto (default) - Auto-detect based on file extension\n")
    cat("  topmed - TopMed .info format with INFO column containing MAF= and R2=\n")
    cat("  tabdelim - Tab-delimited with separate MAF and R2 columns\n")
    cat("\nFor tab-delimited files, script looks for columns named:\n")
    cat("  MAF: all_maf, maf, MAF, ALL_MAF, allele_freq, AF\n")
    cat("  R2: info, r2, R2, INFO, rsq, RSQ, imputation_quality\n")
    stop("Please provide file path(s) as arguments")
  }

  # Parse arguments
  file_list_arg <- grep("^--filelist=", args, value = TRUE)
  ancestry_arg <- grep("^--ancestry=", args, value = TRUE)
  exclude_perfect_r2 <- "--exclude-perfect" %in% args
  force_violin <- "--violin" %in% args
  force_sidebyside <- "--sidebyside" %in% args
  force_histogram <- "--histogram" %in% args
  type_arg <- grep("^--type=", args, value = TRUE)

  # Get file type if specified
  file_type <- "auto"
  if (length(type_arg) > 0) {
    file_type <- gsub("^--type=", "", type_arg[1])
  }

  # FIXED: Get ancestry labels if specified - handle both quoted and unquoted strings
  ancestry_labels <- NULL
  if (length(ancestry_arg) > 0) {
    ancestry_string <- gsub("^--ancestry=", "", ancestry_arg[1])
    # Remove quotes if present
    ancestry_string <- gsub("^['\"]|['\"]$", "", ancestry_string)
    ancestry_labels <- trimws(strsplit(ancestry_string, ",")[[1]])
    cat("Parsed ancestry labels:", paste(ancestry_labels, collapse = ", "), "\n")
  }

  # Remove option arguments to get file paths
  file_args <- args[!grepl("^--", args)]

  # Determine file list
  if (length(file_list_arg) > 0) {
    # Read from file list
    list_file <- gsub("^--filelist=", "", file_list_arg[1])
    if (!file.exists(list_file)) {
      stop(paste("File list not found:", list_file))
    }
    file_paths <- readLines(list_file)
    file_paths <- file_paths[file_paths != "" & !grepl("^#", file_paths)]  # Remove empty lines and comments
    cat("Read", length(file_paths), "file paths from", list_file, "\n")
  } else {
    # Use command line arguments
    file_paths <- file_args
  }

  if (length(file_paths) == 0) {
    stop("No file paths provided")
  }

  cat("File paths to process:\n")
  for (i in seq_along(file_paths)) {
    cat("  ", i, ":", file_paths[i], "\n")
  }

  # Check files exist
  missing_files <- file_paths[!file.exists(file_paths)]
  if (length(missing_files) > 0) {
    cat("Warning: The following files were not found:\n")
    for (f in missing_files) {
      cat("  ", f, "\n")
    }
    file_paths <- file_paths[file.exists(file_paths)]
  }

  if (length(file_paths) == 0) {
    stop("No valid files found")
  }

  # FIXED: Check ancestry labels if provided with better error messaging
  if (!is.null(ancestry_labels)) {
    if (length(ancestry_labels) != length(file_paths)) {
      cat("ERROR: Number of ancestry labels (", length(ancestry_labels),
          ") must match number of files (", length(file_paths), ")\n", sep="")
      cat("Ancestry labels provided: ", paste(ancestry_labels, collapse = ", "), "\n")
      cat("Files found: ", length(file_paths), "\n")
      for (i in seq_along(file_paths)) {
        cat("  File", i, ":", basename(file_paths[i]), "\n")
      }
      stop("Mismatch between number of ancestry labels and files")
    }
    cat("Ancestry labels provided:\n")
    for (i in seq_along(file_paths)) {
      cat("  ", basename(file_paths[i]), " -> ", ancestry_labels[i], "\n")
    }
  }

  cat("Processing", length(file_paths), "file(s):\n")
  for (f in file_paths) {
    cat("  ", f, "\n")
  }

  if (exclude_perfect_r2) {
    cat("Will exclude variants with R² = 1.0 from plots\n")
  }

  # Start timing
  start_time <- Sys.time()

  # Process files
  if (length(file_paths) == 1 && !force_violin) {
    # Single file processing (original functionality)
    cat("\n=== Single file processing ===\n")

    imputation_data <- process_file(file_paths[1], file_type = file_type)

    # Use original plotting function for single files (unless violin forced)
    results <- plot_r2_by_maf(imputation_data, exclude_perfect_r2 = exclude_perfect_r2)

    # Create output filenames
    base_name <- gsub("\\.(info|info\\.gz|txt|tsv)$", "", basename(file_paths[1]))

  } else {
    # Multiple file processing
    cat("\n=== Multiple file processing ===\n")

    # Process all files
    combined_data <- process_multiple_files(file_paths, ancestry_labels)

    # FIXED: Debug output to verify ancestry labels are correctly applied
    if ("Ancestry" %in% names(combined_data)) {
      cat("\nFinal ancestry assignments in combined data:\n")
      ancestry_check <- combined_data[, .(Count = .N), by = .(File = basename(File), Ancestry)]
      print(ancestry_check)
    }

    # Create plots based on options
    if (force_histogram) {
      results <- plot_r2_by_maf(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
      plot_type <- "histogram"
    } else if (force_sidebyside) {
      results <- plot_violin_sidebyside_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
      plot_type <- "violin_sidebyside"
    } else {
      # Default: stacked violin plots (original layout)
      results <- plot_violin_by_maf_multifile(combined_data, exclude_perfect_r2 = exclude_perfect_r2)
      plot_type <- "violin"
    }

    # Create output filename based on ancestry labels or number of files
    if (!is.null(ancestry_labels)) {
      base_name <- paste0("combined_", paste(ancestry_labels, collapse = "_"))
    } else {
      base_name <- paste0("combined_", length(file_paths), "_files")
    }
  }

  # End timing
  end_time <- Sys.time()
  processing_time <- end_time - start_time
  cat("\nTotal processing time:", round(as.numeric(processing_time), 2),
      attr(processing_time, "units"), "\n")

  # Display the plot
  print(results$plot)

  # Add suffix if filtering perfect R2
  suffix <- ""
  if (exclude_perfect_r2) {
    suffix <- "_no_perfect_r2"
  }

  # Add plot type suffix
  if (length(file_paths) > 1 && !force_histogram) {
    plot_type_suffix <- if (force_sidebyside) "violin_sidebyside" else "violin"
  } else {
    plot_type_suffix <- "histogram"
  }

  plot_filename <- paste0(base_name, "_r2_", plot_type_suffix, "_by_maf", suffix, ".png")
  data_filename <- paste0(base_name, "_processed_data", suffix, ".csv")

  # Save the plot
  plot_width <- if (length(file_paths) > 3) max(10, length(file_paths) * 2) else 10
  ggsave(plot_filename,
         plot = results$plot,
         width = plot_width, height = 8,
         dpi = 300)

  cat("Plot saved as:", plot_filename, "\n")

  # Save processed data using fwrite (much faster than write.csv)
  fwrite(results$data, data_filename)
  cat("Processed data saved as:", data_filename, "\n")

  # Print final statistics
  total_variants <- nrow(results$data)
  cat("\nFinal Statistics:\n")
  cat("Total variants processed:", total_variants, "\n")
  if (length(file_paths) > 1) {
    cat("Files processed:", length(file_paths), "\n")
    cat("Average variants per file:", round(total_variants / length(file_paths), 0), "\n")
    if ("Ancestry" %in% names(results$data)) {
      cat("Ancestry groups:", paste(unique(results$data$Ancestry), collapse = ", "), "\n")
    }
  }
  cat("Processing speed:", round(total_variants / as.numeric(processing_time), 0),
      "variants per", attr(processing_time, "units"), "\n")
}

# Run main function
if (!interactive()) {
  main()
}