Created
August 2, 2023 13:34
-
-
Save kumeS/a3c6de1b645c4d5624a3d0e36deeaa62 to your computer and use it in GitHub Desktop.
This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Tile Reads Counts Analysis | |
#' | |
#' This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information. | |
#' | |
#' @title Tile Reads Counts Analysis | |
#' @description Analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. The function can also filter the reads based on a specific index1 value. | |
#' @param rfq A ShortReadQ object containing the reads from the FASTQ file. | |
#' @param N An integer representing the number of reads to sample. Default is 100000. | |
#' @param GGG A logical value indicating whether to filter the reads based on the index1 value "GGGGGGGGGG". Default is FALSE. | |
#' @importFrom assertthat is.count noNA | |
#' @importFrom ShortRead readFastq sread id | |
#' @importFrom magrittr %>% | |
#' @importFrom progress progress_bar | |
#' @importFrom ggplot2 ggplot aes geom_hex scale_fill_gradient2 | |
#' @importFrom reshape2 melt | |
#' @return A data frame containing the extracted information, including surface, swaths, and tiles. | |
#' @export tile_Reads_Counts | |
#' @author Satoshi Kume | |
#' @examples | |
#' \dontrun{ | |
#' Filepath <- "./Undetermined_S0_R1_001.fastq.gz" | |
#' rfq <- readFastq(Filepath) | |
#' res <- tile_Reads_Counts(rfq) | |
#' | |
#' #Hex-plot | |
#' ggplot(res, aes(x=XR, y = -YR)) + | |
#' geom_hex(bins = 35, aes(fill = stat(scale(count)))) + | |
#' scale_fill_gradient2(low="white", mid = "blue", high = "red") | |
#' } | |
tile_Reads_Counts <- function(rfq, N=100000, GGG=FALSE){ | |
assertthat::is.count(N) | |
assertthat::noNA(rfq) | |
# Extract and print sequence reads and IDs | |
print(sread(rfq)) | |
print(id(rfq)) | |
# Initialize data frame for storing information | |
info <- data.frame(matrix(NA, ncol=7, nrow=N)) | |
colnames(info) <- c("No", "Lane", "Tile", "X", "Y", "index1", "index2") | |
pb <- progress_bar$new(total = N) | |
k <- sample(1:length(id(rfq)), N, replace = F) | |
# Extract information from each read | |
for(n in 1:N){ | |
pb$tick() | |
a <- as.character(id(rfq)[k[n]]) | |
a1 <- strsplit(x = a, " ") | |
b <- unlist(strsplit(x = a1[[1]][1], "[:]")) | |
d <- unlist(strsplit(x = a1[[1]][2], "[:]|[+]")) | |
info[n,] <- c(k[n], b[4:7], d[4:5]) | |
} | |
# Filter based on index1 if GGG is TRUE | |
if(GGG){ | |
info <- info[info$index1 == "GGGGGGGGGG",] | |
} | |
# Convert columns to numeric | |
info$Lane <- as.numeric(info$Lane) | |
info$Tile <- as.numeric(info$Tile) | |
info$X <- as.numeric(info$X) | |
info$Y <- as.numeric(info$Y) | |
res <- info | |
# Extract surface, swaths, and tiles | |
res$Surface <- as.numeric(substr(res$Tile, 1, 1)) | |
res$Swaths <- as.numeric(substr(res$Tile, 2, 2)) | |
res$Tiles <- as.numeric(substr(res$Tile, 3, 4)) | |
# Normalize X and Y coordinates | |
res$X <- res$X - min(res$X) | |
res$X <- res$X/max(res$X) | |
res$Y <- res$Y - min(res$Y) | |
res$Y <- res$Y/max(res$Y) | |
# Calculate XR and YR | |
res$XR <- res$X + ifelse(res$Surface == 1, 0, 6) + res$Swaths | |
res$YR <- res$Y + res$Tiles | |
return(res) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment