kumeS/tile_Reads_Counts.R

## tile_Reads_Counts.R
#' Tile Reads Counts Analysis
#'
#' This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information.
#'
#' @title Tile Reads Counts Analysis
#' @description Analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. The function can also filter the reads based on a specific index1 value.
#' @param rfq A ShortReadQ object containing the reads from the FASTQ file.
#' @param N An integer representing the number of reads to sample. Default is 100000.
#' @param GGG A logical value indicating whether to filter the reads based on the index1 value "GGGGGGGGGG". Default is FALSE.
#' @importFrom assertthat is.count noNA
#' @importFrom ShortRead readFastq sread id
#' @importFrom magrittr %>%
#' @importFrom progress progress_bar
#' @importFrom ggplot2 ggplot aes geom_hex scale_fill_gradient2
#' @importFrom reshape2 melt
#' @return A data frame containing the extracted information, including surface, swaths, and tiles.
#' @export tile_Reads_Counts
#' @author Satoshi Kume
#' @examples
#' \dontrun{
#' Filepath <- "./Undetermined_S0_R1_001.fastq.gz"
#' rfq <- readFastq(Filepath)
#' res <- tile_Reads_Counts(rfq)
#'
#' #Hex-plot
#' ggplot(res, aes(x=XR, y = -YR)) +
#'   geom_hex(bins = 35, aes(fill = stat(scale(count)))) +
#'   scale_fill_gradient2(low="white", mid = "blue", high = "red")
#' }

tile_Reads_Counts <- function(rfq, N=100000, GGG=FALSE){
  assertthat::is.count(N)
  assertthat::noNA(rfq)

  # Extract and print sequence reads and IDs
  print(sread(rfq))
  print(id(rfq))

  # Initialize data frame for storing information
  info <- data.frame(matrix(NA, ncol=7, nrow=N))
  colnames(info) <- c("No", "Lane", "Tile", "X", "Y", "index1", "index2")
  pb <- progress_bar$new(total = N)
  k <- sample(1:length(id(rfq)), N, replace = F)

  # Extract information from each read
  for(n in 1:N){
    pb$tick()
    a <- as.character(id(rfq)[k[n]])
    a1 <- strsplit(x = a, " ")
    b <- unlist(strsplit(x = a1[[1]][1], "[:]"))
    d <- unlist(strsplit(x = a1[[1]][2], "[:]|[+]"))
    info[n,] <- c(k[n], b[4:7], d[4:5])
  }

  # Filter based on index1 if GGG is TRUE
  if(GGG){
    info <- info[info$index1 == "GGGGGGGGGG",]
  }

  # Convert columns to numeric
  info$Lane <- as.numeric(info$Lane)
  info$Tile <- as.numeric(info$Tile)
  info$X <- as.numeric(info$X)
  info$Y <- as.numeric(info$Y)

  res <- info

  # Extract surface, swaths, and tiles
  res$Surface <- as.numeric(substr(res$Tile, 1, 1))
  res$Swaths <- as.numeric(substr(res$Tile, 2, 2))
  res$Tiles <- as.numeric(substr(res$Tile, 3, 4))

  # Normalize X and Y coordinates
  res$X <- res$X - min(res$X)
  res$X <- res$X/max(res$X)
  res$Y <- res$Y - min(res$Y)
  res$Y <- res$Y/max(res$Y)

  # Calculate XR and YR
  res$XR <- res$X + ifelse(res$Surface == 1, 0, 6) + res$Swaths
  res$YR <- res$Y + res$Tiles

  return(res)
}
	#' Tile Reads Counts Analysis
	#'
	#' This function analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. It returns a data frame containing this information.
	#'
	#' @title Tile Reads Counts Analysis
	#' @description Analyzes the reads from a FASTQ file and extracts information such as tile, lane, X coordinate, Y coordinate, index1, and index2. The function can also filter the reads based on a specific index1 value.
	#' @param rfq A ShortReadQ object containing the reads from the FASTQ file.
	#' @param N An integer representing the number of reads to sample. Default is 100000.
	#' @param GGG A logical value indicating whether to filter the reads based on the index1 value "GGGGGGGGGG". Default is FALSE.
	#' @importFrom assertthat is.count noNA
	#' @importFrom ShortRead readFastq sread id
	#' @importFrom magrittr %>%
	#' @importFrom progress progress_bar
	#' @importFrom ggplot2 ggplot aes geom_hex scale_fill_gradient2
	#' @importFrom reshape2 melt
	#' @return A data frame containing the extracted information, including surface, swaths, and tiles.
	#' @export tile_Reads_Counts
	#' @author Satoshi Kume
	#' @examples
	#' \dontrun{
	#' Filepath <- "./Undetermined_S0_R1_001.fastq.gz"
	#' rfq <- readFastq(Filepath)
	#' res <- tile_Reads_Counts(rfq)
	#'
	#' #Hex-plot
	#' ggplot(res, aes(x=XR, y = -YR)) +
	#' geom_hex(bins = 35, aes(fill = stat(scale(count)))) +
	#' scale_fill_gradient2(low="white", mid = "blue", high = "red")
	#' }

	tile_Reads_Counts <- function(rfq, N=100000, GGG=FALSE){
	assertthat::is.count(N)
	assertthat::noNA(rfq)

	# Extract and print sequence reads and IDs
	print(sread(rfq))
	print(id(rfq))

	# Initialize data frame for storing information
	info <- data.frame(matrix(NA, ncol=7, nrow=N))
	colnames(info) <- c("No", "Lane", "Tile", "X", "Y", "index1", "index2")
	pb <- progress_bar$new(total = N)
	k <- sample(1:length(id(rfq)), N, replace = F)

	# Extract information from each read
	for(n in 1:N){
	pb$tick()
	a <- as.character(id(rfq)[k[n]])
	a1 <- strsplit(x = a, " ")
	b <- unlist(strsplit(x = a1[[1]][1], "[:]"))
	d <- unlist(strsplit(x = a1[[1]][2], "[:]\|[+]"))
	info[n,] <- c(k[n], b[4:7], d[4:5])
	}

	# Filter based on index1 if GGG is TRUE
	if(GGG){
	info <- info[info$index1 == "GGGGGGGGGG",]
	}

	# Convert columns to numeric
	info$Lane <- as.numeric(info$Lane)
	info$Tile <- as.numeric(info$Tile)
	info$X <- as.numeric(info$X)
	info$Y <- as.numeric(info$Y)

	res <- info

	# Extract surface, swaths, and tiles
	res$Surface <- as.numeric(substr(res$Tile, 1, 1))
	res$Swaths <- as.numeric(substr(res$Tile, 2, 2))
	res$Tiles <- as.numeric(substr(res$Tile, 3, 4))

	# Normalize X and Y coordinates
	res$X <- res$X - min(res$X)
	res$X <- res$X/max(res$X)
	res$Y <- res$Y - min(res$Y)
	res$Y <- res$Y/max(res$Y)

	# Calculate XR and YR
	res$XR <- res$X + ifelse(res$Surface == 1, 0, 6) + res$Swaths
	res$YR <- res$Y + res$Tiles

	return(res)
	}