sklarz-bgu/primer_search_wrapper.R

## primer_search_wrapper.R

if(!(all(c("plyr","optparse","tools","magrittr") %in% installed.packages()))) {
    cat("'plyr','optparse','tools','magrittr' are not installed.\nYou must install them for this script to work!\nInstall by running the following commands:\ninstall.packages('plyr')\ninstall.packages('optparse')\ninstall.packages('tools')\ninstall.packages('magrittr')")
}

library(plyr)
library(optparse)
library(tools)
library(magrittr)

# AUTHOR: Menachem Sklarz


# This script

# paste("Rscript",
#       "primers.R",
# "--input", "Staph_aureus_example.nucl.merge.fasta",
# "--output", "Staph_aureus_example.nucl.merge.primersearch",
# "--primers", "SCC_primers",
# "--primersearch_path", "/path/to/emboss/primersearch",
# "--mismatch_percent","10") %>% system

# paste("Rscript","primers.R","-h") %>% system


args    = commandArgs(trailingOnly = F)
# Get dir of script. Not always useful.
filepos = args %>% grep("--file=",x = .)
curfile = sub(pattern     = "--file=",
              replacement = "",
              x           = args[filepos]) %>% file_path_as_absolute
print(curfile)
args = commandArgs(trailingOnly=TRUE)

option_list = list(
    make_option(c("-i", "--input"),      type="character", default=NULL,
                help="Path to fasta file of contigs",   metavar="character"),
    make_option(c("-o", "--output"),        type="character", default=NULL,
                help="Path to output file (default: <input>.primersearch)",             metavar="character"),
    make_option(c("-p", "--primers"),      type="character", default=NULL,
                help="Path to file with primers (EMBOSS/primersearch format)",   metavar="character"),
    make_option(c("-P", "--primersearch_path"),      type="character", default="primersearch",
                help="Path to primersearch executable. (default: primersearch)", metavar="character"),
    make_option(c("-m", "--mismatch_percent"),      type="integer", default=0,
                help="Permitted mismatches (default: 0)", metavar="character"),
    make_option(c("-e", "--extract_fasta"),      type="logical", action = "store_true",
                help="Should the fasta fragment be extracted? (Assumes cutseq is installed in same location as primersearch) (default: FALSE)", metavar="logical")
);


opt_parser = optparse::OptionParser(usage       = "
                                    A script for running EMBOSS/primersearch and parsing it's results as a table of amplicons.

                                    usage: %prog [options]",
                                    option_list = option_list,
                                    epilogue    = "\n\nAuthor: Menachem Sklarz");
opt = optparse::parse_args(opt_parser);

# If output not set, set it now:
if (is.null(opt$output)) {
    opt$output = sprintf("%s.primersearch", opt$input)
    cat(sprintf("Setting output to %s\n",opt$output))
}

CMD = sprintf("%s -seqall %s -infile %s -mismatchpercent %s -stdout -auto",
              opt$primersearch_path, opt$input, opt$primers, opt$mismatch_percent)

cat(sprintf("Running command:\n%s\n",CMD))
cat("Waiting for primerseach to complete:\n")

tryCatch(expr = {
        prim_find <- system(command = CMD,intern = T)
    },
    error= function(x) stop(sprintf("Error executing primersearch: %s", x))
    )
print("This is the output:")
cat(sprintf("%s\n",prim_find))

# Define REs to extract: primer name, amplimer number, length, forward and reverse positions and mismatches
primNum     <- regexpr(pattern = "Primer name ([[:graph:]]+)",text = prim_find, perl=T)
amplimer    <- regexpr(pattern = "Amplimer ([[:digit:]]+)",text = prim_find, perl=T)
sequence    <- regexpr(pattern = "\tSequence: ([[:graph:]]+) ",text = prim_find, perl=T)
amplen      <- regexpr(pattern = "\tAmplimer length: ([[:digit:]]+)",text = prim_find, perl=T)
forward_pos <- regexpr(pattern = "hits forward strand at ([[:digit:]]+) with [[:digit:]]+ mismatches",text = prim_find, perl=T)
forward_MM  <- regexpr(pattern = "hits forward strand at [[:digit:]]+ with ([[:digit:]]+) mismatches",text = prim_find, perl=T)
reverse_pos <- regexpr(pattern = "hits reverse strand at \\[([[:digit:]]+)\\] with [[:digit:]]+ mismatches",text = prim_find, perl=T)
reverse_MM  <- regexpr(pattern = "hits reverse strand at \\[[[:digit:]]+\\] with ([[:digit:]]+) mismatches",text = prim_find, perl=T)

# Define function for extracting text info based on text and REhit:
extractREhit <- function(mytext,rehit) {

    mytext <- substring(text = mytext,
                        first = attributes(rehit)$capture.start %>% as.integer,
                        last = (attributes(rehit)$capture.start %>% as.integer) + (attributes(rehit)$capture.length %>% as.integer) - 1)
    mytext[mytext==""] <- NA
    return(mytext)
}

# Works on a vector. "Pull down" values on all subsequent NA elements
# e.g. pulldown(c(NA,2,NA,NA,"nuc",NA,NA)) will return (NA,2,2,2,"nuc","nuc","nuc")
pulldown <- function(x) {
    for(i in 2:length(x)) {x[i] <- ifelse(is.na(x[i]),x[i-1],x[i]) };
    return(x)
}

# Create tabular summary of all primers:
final<-data.frame(org         = prim_find,
                  primer      = extractREhit(mytext = prim_find, rehit = primNum) %>% pulldown,
                  amplimer    = extractREhit(mytext = prim_find, rehit = amplimer) %>% pulldown,
                  sequence    = extractREhit(mytext = prim_find, rehit = sequence),
                  amplen      = extractREhit(mytext = prim_find, rehit = amplen),
                  forward_pos = extractREhit(mytext = prim_find, rehit = forward_pos),
                  forward_MM  = extractREhit(mytext = prim_find, rehit = forward_MM),
                  reverse_pos = extractREhit(mytext = prim_find, rehit = reverse_pos),
                  reverse_MM  = extractREhit(mytext = prim_find, rehit = reverse_MM),
                  stringsAsFactors = FALSE)
# Keep only lines with information about the amplimer:
final <- final[-which(with(final,
                           is.na(amplen) &
                               is.na(sequence) &
                               is.na(forward_pos) &
                               is.na(forward_MM) &
                               is.na(reverse_pos) &
                               is.na(reverse_MM))),]

# Convert to table: For each primer/amplimer, return one line with specific info (if no amplimers, return all NAs):
results_table <- ddply(.data = final,.variables = .(primer,amplimer),.fun = function(x) {
    if(dim(x)[1] > 2) {
        retval <- c(sequence = x$sequence[!is.na(x$sequence)] %>% as.character,
                    amplen = x$amplen[!is.na(x$amplen)] %>% as.numeric,
                    forward_pos = x$forward_pos[!is.na(x$forward_pos)] %>% as.numeric,
                    forward_MM = x$forward_MM[!is.na(x$forward_MM)] %>% as.numeric,
                    reverse_pos = x$reverse_pos[!is.na(x$reverse_pos)] %>% as.numeric,
                    reverse_MM = x$reverse_MM[!is.na(x$reverse_MM)] %>% as.numeric)
    } else {
        retval <- c(sequence = NA,
                    amplen = NA,
                    forward_pos = NA,
                    forward_MM = NA,
                    reverse_pos = NA,
                    reverse_MM = NA)
    }
    # print(retval)
    return(retval)
})

results_table$amplen <- as.numeric(results_table$amplen)
results_table$forward_pos <- as.numeric(results_table$forward_pos)
results_table$reverse_pos <- as.numeric(results_table$reverse_pos)

write.table(x = results_table,
            file = opt$output,
            append = F,
            quote = F,
            sep = "\t",
            row.names = F,
            col.names = T)


# If user requested extraction of sequences:
if (exists(x = "extract_fasta",where = opt)) {
    # stop("This option is not implemented yet. Sorry...")
    # Load Biostrings library:
    if(not("Biostrings") %in% installed.packages()) {
        cat("'Biostrings' is not installed.\nYou must install it when selecting extract_fasta option. \nInstall by running the following commands:\ninstall.packages('Biostrings')")
        break
    }

    library(Biostrings)

    # Create fasta file index
    fastaindex <- fasta.index(filepath =  opt$input)


    # For each blast result:
    for(i in 1:dim(results_table)[1]) {
        # If it is in reverse: (see documentation below for non-reverse)
        newseq <- readBStringSet(fastaindex[grep(pattern = results_table[i,"sequence"],
                                                 x       = fastaindex$desc),]   )

        newseq <- subseq(newseq,
                         start = results_table$forward_pos[i],
                         end = results_table$forward_pos[i] + results_table$amplen[i])
        names(newseq) <- paste(names(newseq),
                               results_table$forward_pos[i],
                               results_table$forward_pos[i] + results_table$amplen[i],
                               sep="_")
        # Add coordinates to the seq name:
        # Write the sequence to file
        writeXStringSet(newseq,
                        filepath = sprintf("%s.fasta",opt$output),
                        append = TRUE,
                        compress=FALSE,
                        format="fasta")


    }


}

	if(!(all(c("plyr","optparse","tools","magrittr") %in% installed.packages()))) {
	cat("'plyr','optparse','tools','magrittr' are not installed.\nYou must install them for this script to work!\nInstall by running the following commands:\ninstall.packages('plyr')\ninstall.packages('optparse')\ninstall.packages('tools')\ninstall.packages('magrittr')")
	}

	library(plyr)
	library(optparse)
	library(tools)
	library(magrittr)

	# AUTHOR: Menachem Sklarz



	# This script

	# paste("Rscript",
	# "primers.R",
	# "--input", "Staph_aureus_example.nucl.merge.fasta",
	# "--output", "Staph_aureus_example.nucl.merge.primersearch",
	# "--primers", "SCC_primers",
	# "--primersearch_path", "/path/to/emboss/primersearch",
	# "--mismatch_percent","10") %>% system

	# paste("Rscript","primers.R","-h") %>% system


	args = commandArgs(trailingOnly = F)
	# Get dir of script. Not always useful.
	filepos = args %>% grep("--file=",x = .)
	curfile = sub(pattern = "--file=",
	replacement = "",
	x = args[filepos]) %>% file_path_as_absolute
	print(curfile)
	args = commandArgs(trailingOnly=TRUE)

	option_list = list(
	make_option(c("-i", "--input"), type="character", default=NULL,
	help="Path to fasta file of contigs", metavar="character"),
	make_option(c("-o", "--output"), type="character", default=NULL,
	help="Path to output file (default: <input>.primersearch)", metavar="character"),
	make_option(c("-p", "--primers"), type="character", default=NULL,
	help="Path to file with primers (EMBOSS/primersearch format)", metavar="character"),
	make_option(c("-P", "--primersearch_path"), type="character", default="primersearch",
	help="Path to primersearch executable. (default: primersearch)", metavar="character"),
	make_option(c("-m", "--mismatch_percent"), type="integer", default=0,
	help="Permitted mismatches (default: 0)", metavar="character"),
	make_option(c("-e", "--extract_fasta"), type="logical", action = "store_true",
	help="Should the fasta fragment be extracted? (Assumes cutseq is installed in same location as primersearch) (default: FALSE)", metavar="logical")
	);


	opt_parser = optparse::OptionParser(usage = "
	A script for running EMBOSS/primersearch and parsing it's results as a table of amplicons.

	usage: %prog [options]",
	option_list = option_list,
	epilogue = "\n\nAuthor: Menachem Sklarz");
	opt = optparse::parse_args(opt_parser);

	# If output not set, set it now:
	if (is.null(opt$output)) {
	opt$output = sprintf("%s.primersearch", opt$input)
	cat(sprintf("Setting output to %s\n",opt$output))
	}

	CMD = sprintf("%s -seqall %s -infile %s -mismatchpercent %s -stdout -auto",
	opt$primersearch_path, opt$input, opt$primers, opt$mismatch_percent)

	cat(sprintf("Running command:\n%s\n",CMD))
	cat("Waiting for primerseach to complete:\n")

	tryCatch(expr = {
	prim_find <- system(command = CMD,intern = T)
	},
	error= function(x) stop(sprintf("Error executing primersearch: %s", x))
	)
	print("This is the output:")
	cat(sprintf("%s\n",prim_find))

	# Define REs to extract: primer name, amplimer number, length, forward and reverse positions and mismatches
	primNum <- regexpr(pattern = "Primer name ([[:graph:]]+)",text = prim_find, perl=T)
	amplimer <- regexpr(pattern = "Amplimer ([[:digit:]]+)",text = prim_find, perl=T)
	sequence <- regexpr(pattern = "\tSequence: ([[:graph:]]+) ",text = prim_find, perl=T)
	amplen <- regexpr(pattern = "\tAmplimer length: ([[:digit:]]+)",text = prim_find, perl=T)
	forward_pos <- regexpr(pattern = "hits forward strand at ([[:digit:]]+) with [[:digit:]]+ mismatches",text = prim_find, perl=T)
	forward_MM <- regexpr(pattern = "hits forward strand at [[:digit:]]+ with ([[:digit:]]+) mismatches",text = prim_find, perl=T)
	reverse_pos <- regexpr(pattern = "hits reverse strand at \\[([[:digit:]]+)\\] with [[:digit:]]+ mismatches",text = prim_find, perl=T)
	reverse_MM <- regexpr(pattern = "hits reverse strand at \\[[[:digit:]]+\\] with ([[:digit:]]+) mismatches",text = prim_find, perl=T)

	# Define function for extracting text info based on text and REhit:
	extractREhit <- function(mytext,rehit) {

	mytext <- substring(text = mytext,
	first = attributes(rehit)$capture.start %>% as.integer,
	last = (attributes(rehit)$capture.start %>% as.integer) + (attributes(rehit)$capture.length %>% as.integer) - 1)
	mytext[mytext==""] <- NA
	return(mytext)
	}

	# Works on a vector. "Pull down" values on all subsequent NA elements
	# e.g. pulldown(c(NA,2,NA,NA,"nuc",NA,NA)) will return (NA,2,2,2,"nuc","nuc","nuc")
	pulldown <- function(x) {
	for(i in 2:length(x)) {x[i] <- ifelse(is.na(x[i]),x[i-1],x[i]) };
	return(x)
	}

	# Create tabular summary of all primers:
	final<-data.frame(org = prim_find,
	primer = extractREhit(mytext = prim_find, rehit = primNum) %>% pulldown,
	amplimer = extractREhit(mytext = prim_find, rehit = amplimer) %>% pulldown,
	sequence = extractREhit(mytext = prim_find, rehit = sequence),
	amplen = extractREhit(mytext = prim_find, rehit = amplen),
	forward_pos = extractREhit(mytext = prim_find, rehit = forward_pos),
	forward_MM = extractREhit(mytext = prim_find, rehit = forward_MM),
	reverse_pos = extractREhit(mytext = prim_find, rehit = reverse_pos),
	reverse_MM = extractREhit(mytext = prim_find, rehit = reverse_MM),
	stringsAsFactors = FALSE)
	# Keep only lines with information about the amplimer:
	final <- final[-which(with(final,
	is.na(amplen) &
	is.na(sequence) &
	is.na(forward_pos) &
	is.na(forward_MM) &
	is.na(reverse_pos) &
	is.na(reverse_MM))),]

	# Convert to table: For each primer/amplimer, return one line with specific info (if no amplimers, return all NAs):
	results_table <- ddply(.data = final,.variables = .(primer,amplimer),.fun = function(x) {
	if(dim(x)[1] > 2) {
	retval <- c(sequence = x$sequence[!is.na(x$sequence)] %>% as.character,
	amplen = x$amplen[!is.na(x$amplen)] %>% as.numeric,
	forward_pos = x$forward_pos[!is.na(x$forward_pos)] %>% as.numeric,
	forward_MM = x$forward_MM[!is.na(x$forward_MM)] %>% as.numeric,
	reverse_pos = x$reverse_pos[!is.na(x$reverse_pos)] %>% as.numeric,
	reverse_MM = x$reverse_MM[!is.na(x$reverse_MM)] %>% as.numeric)
	} else {
	retval <- c(sequence = NA,
	amplen = NA,
	forward_pos = NA,
	forward_MM = NA,
	reverse_pos = NA,
	reverse_MM = NA)
	}
	# print(retval)
	return(retval)
	})

	results_table$amplen <- as.numeric(results_table$amplen)
	results_table$forward_pos <- as.numeric(results_table$forward_pos)
	results_table$reverse_pos <- as.numeric(results_table$reverse_pos)

	write.table(x = results_table,
	file = opt$output,
	append = F,
	quote = F,
	sep = "\t",
	row.names = F,
	col.names = T)



	# If user requested extraction of sequences:
	if (exists(x = "extract_fasta",where = opt)) {
	# stop("This option is not implemented yet. Sorry...")
	# Load Biostrings library:
	if(not("Biostrings") %in% installed.packages()) {
	cat("'Biostrings' is not installed.\nYou must install it when selecting extract_fasta option. \nInstall by running the following commands:\ninstall.packages('Biostrings')")
	break
	}

	library(Biostrings)

	# Create fasta file index
	fastaindex <- fasta.index(filepath = opt$input)


	# For each blast result:
	for(i in 1:dim(results_table)[1]) {
	# If it is in reverse: (see documentation below for non-reverse)
	newseq <- readBStringSet(fastaindex[grep(pattern = results_table[i,"sequence"],
	x = fastaindex$desc),] )

	newseq <- subseq(newseq,
	start = results_table$forward_pos[i],
	end = results_table$forward_pos[i] + results_table$amplen[i])
	names(newseq) <- paste(names(newseq),
	results_table$forward_pos[i],
	results_table$forward_pos[i] + results_table$amplen[i],
	sep="_")
	# Add coordinates to the seq name:
	# Write the sequence to file
	writeXStringSet(newseq,
	filepath = sprintf("%s.fasta",opt$output),
	append = TRUE,
	compress=FALSE,
	format="fasta")


	}


	}