Skip to content

Instantly share code, notes, and snippets.

@kumeS
Created August 2, 2023 13:25
Show Gist options
  • Save kumeS/505682a73066cbd28f20ba5a60c583c8 to your computer and use it in GitHub Desktop.
Save kumeS/505682a73066cbd28f20ba5a60c583c8 to your computer and use it in GitHub Desktop.
This function analyzes the samples and top barcodes that match the given index and generates a sequence logo.
#' Index Mismatch Sequence Analysis
#'
#' This function analyzes the samples and top barcodes that match the given index and generates a sequence logo.
#'
#' @title Index Mismatch Sequence Analysis
#' @description Analyzes the samples and top barcodes that match the given index, and generates a sequence logo. The function takes two CSV files and an index as input, and optionally saves the sequence logo as a PNG file.
#' @param x1 A data frame containing the demultiplex stats, read from "Demultiplex_Stats.csv".
#' @param x2 A data frame containing the top unknown barcodes, read from "Top_Unknown_Barcodes.csv".
#' @param index A string representing the index to be matched.
#' @param AllData A logical value indicating whether to include all data in the result. Default is TRUE.
#' @param SavePNG A logical value indicating whether to save the sequence logo as a PNG file. Default is TRUE.
#' @importFrom assertthat is.data.frame is.string is.count noNA
#' @importFrom seqLogo makePWM seqLogo
#' @return A PWM object representing the sequence logo.
#' @export indexMismatchSeq
#' @author Satoshi Kume
#' @examples
#' \dontrun{
#' x1 <- Demultiplex_Stats_proc(read.csv("./Demultiplex_Stats.csv", header = TRUE, check.names = FALSE)[1:12,])
#' x2 <- read.csv("./Top_Unknown_Barcodes.csv", header = TRUE, check.names = FALSE)
#' index <- "GGGGGGGGGG"
#' result <- indexMismatchSeq(x1, x2, index)
#' }
indexMismatchSeq <- function(x1, x2, index, AllData = TRUE, SavePNG = TRUE){
assertthat::is.data.frame(x1)
assertthat::is.data.frame(x2)
assertthat::is.string(index)
assertthat::is.count(length(index))
assertthat::noNA(index)
# Sample: Extract relevant columns and rename
x1a <- x1[,c("Index", "index1", "index2")]
colnames(x1a) <- c("x0", "X1", "X2")
x1a$X3 <- x1$`# Reads`
res1 <- base::agrepl(pattern=index, x=x1a$x0, max.distance = 0.1)
if(sum(res1) != 1){ stop() }
x1b <- x1a[res1,]
r <- rownames(x1b)
result1 <- data.frame(matrix(unlist(strsplit(paste0(x1b$X1, x1b$X2), "")), ncol=20, byrow = T))
result1$X21 <- x1b$X3
#Top
x2$Index <- paste0(x2$index, x2$index2)
res2 <- base::agrepl(pattern=index, x=x2$Index, max.distance = 0.25)
b <- x2[res2,]
#head(b)
#dim(b)
A <- stringdist::stringdistmatrix(a=index, b=b$Index, method="osa", nthread=2, useNames="strings")
A1 <- colnames(A)[A < 5]
b <- b[b$Index %in% A1,]
#dim(b)
result2 <- data.frame(matrix(unlist(strsplit(paste0(b$index, b$index2), "")), ncol=20, byrow = T))
#head(result2)
#dim(result2)
result2$X21 <- b$`# Reads`
#head(result2)
if(AllData){
result <- rbind(result1, result2)
}else{
result <- result2
}
dd <- matrix(0, 5, nchar(index))
rownames(dd) <- c("A", "C", "G", "N", "T")
colnames(dd) <- paste0("X", formatC(1:nchar(index), width = 2, flag = 0))
for(n in 1:(ncol(result)-1)){
#n <- 1
d <- tapply(result$X21, factor(result[,n]), sum)/sum(result$X21)
if(!all(diff(order(names(d))) == 1)){warning("Failed!!")}
dd[rownames(dd) %in% names(d),n] <- as.numeric(d)
}
ddd <- dd[-4,]
#d1 <- apply(ddd, 2, sum)
#dddd <- ddd/d1
#apply(dddd, 2, sum) == 1
p <- makePWM(ddd)
#consensus(p)
if(SavePNG){
png(file = paste0(gsub(" ", "0", formatC(r, width = 2)), "_",index1, ".png"), res = 150, width = 1200, height = 600)
seqLogo(p)
dev.off()
}
seqLogo(p)
return(p)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment