Skip to content

Instantly share code, notes, and snippets.

@kumeS
Created August 2, 2023 13:34
Show Gist options
  • Save kumeS/a3c6de1b645c4d5624a3d0e36deeaa62 to your computer and use it in GitHub Desktop.
Save kumeS/a3c6de1b645c4d5624a3d0e36deeaa62 to your computer and use it in GitHub Desktop.
This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information.
#' Tile Reads Counts Analysis
#'
#' This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information.
#'
#' @title Tile Reads Counts Analysis
#' @description Analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. The function can also filter the reads based on a specific index1 value.
#' @param rfq A ShortReadQ object containing the reads from the FASTQ file.
#' @param N An integer representing the number of reads to sample. Default is 100000.
#' @param GGG A logical value indicating whether to filter the reads based on the index1 value "GGGGGGGGGG". Default is FALSE.
#' @importFrom assertthat is.count noNA
#' @importFrom ShortRead readFastq sread id
#' @importFrom magrittr %>%
#' @importFrom progress progress_bar
#' @importFrom ggplot2 ggplot aes geom_hex scale_fill_gradient2
#' @importFrom reshape2 melt
#' @return A data frame containing the extracted information, including surface, swaths, and tiles.
#' @export tile_Reads_Counts
#' @author Satoshi Kume
#' @examples
#' \dontrun{
#' Filepath <- "./Undetermined_S0_R1_001.fastq.gz"
#' rfq <- readFastq(Filepath)
#' res <- tile_Reads_Counts(rfq)
#'
#' #Hex-plot
#' ggplot(res, aes(x=XR, y = -YR)) +
#' geom_hex(bins = 35, aes(fill = stat(scale(count)))) +
#' scale_fill_gradient2(low="white", mid = "blue", high = "red")
#' }
tile_Reads_Counts <- function(rfq, N=100000, GGG=FALSE){
assertthat::is.count(N)
assertthat::noNA(rfq)
# Extract and print sequence reads and IDs
print(sread(rfq))
print(id(rfq))
# Initialize data frame for storing information
info <- data.frame(matrix(NA, ncol=7, nrow=N))
colnames(info) <- c("No", "Lane", "Tile", "X", "Y", "index1", "index2")
pb <- progress_bar$new(total = N)
k <- sample(1:length(id(rfq)), N, replace = F)
# Extract information from each read
for(n in 1:N){
pb$tick()
a <- as.character(id(rfq)[k[n]])
a1 <- strsplit(x = a, " ")
b <- unlist(strsplit(x = a1[[1]][1], "[:]"))
d <- unlist(strsplit(x = a1[[1]][2], "[:]|[+]"))
info[n,] <- c(k[n], b[4:7], d[4:5])
}
# Filter based on index1 if GGG is TRUE
if(GGG){
info <- info[info$index1 == "GGGGGGGGGG",]
}
# Convert columns to numeric
info$Lane <- as.numeric(info$Lane)
info$Tile <- as.numeric(info$Tile)
info$X <- as.numeric(info$X)
info$Y <- as.numeric(info$Y)
res <- info
# Extract surface, swaths, and tiles
res$Surface <- as.numeric(substr(res$Tile, 1, 1))
res$Swaths <- as.numeric(substr(res$Tile, 2, 2))
res$Tiles <- as.numeric(substr(res$Tile, 3, 4))
# Normalize X and Y coordinates
res$X <- res$X - min(res$X)
res$X <- res$X/max(res$X)
res$Y <- res$Y - min(res$Y)
res$Y <- res$Y/max(res$Y)
# Calculate XR and YR
res$XR <- res$X + ifelse(res$Surface == 1, 0, 6) + res$Swaths
res$YR <- res$Y + res$Tiles
return(res)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment