Created
May 3, 2017 07:06
-
-
Save sklarz-bgu/d4cbe33a2e60f5ef5c2d5b10da2e8eed to your computer and use it in GitHub Desktop.
Wrapper for primer_search from EMBOSS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if(!(all(c("plyr","optparse","tools","magrittr") %in% installed.packages()))) { | |
cat("'plyr','optparse','tools','magrittr' are not installed.\nYou must install them for this script to work!\nInstall by running the following commands:\ninstall.packages('plyr')\ninstall.packages('optparse')\ninstall.packages('tools')\ninstall.packages('magrittr')") | |
} | |
library(plyr) | |
library(optparse) | |
library(tools) | |
library(magrittr) | |
# AUTHOR: Menachem Sklarz | |
# This script | |
# paste("Rscript", | |
# "primers.R", | |
# "--input", "Staph_aureus_example.nucl.merge.fasta", | |
# "--output", "Staph_aureus_example.nucl.merge.primersearch", | |
# "--primers", "SCC_primers", | |
# "--primersearch_path", "/path/to/emboss/primersearch", | |
# "--mismatch_percent","10") %>% system | |
# paste("Rscript","primers.R","-h") %>% system | |
args = commandArgs(trailingOnly = F) | |
# Get dir of script. Not always useful. | |
filepos = args %>% grep("--file=",x = .) | |
curfile = sub(pattern = "--file=", | |
replacement = "", | |
x = args[filepos]) %>% file_path_as_absolute | |
print(curfile) | |
args = commandArgs(trailingOnly=TRUE) | |
option_list = list( | |
make_option(c("-i", "--input"), type="character", default=NULL, | |
help="Path to fasta file of contigs", metavar="character"), | |
make_option(c("-o", "--output"), type="character", default=NULL, | |
help="Path to output file (default: <input>.primersearch)", metavar="character"), | |
make_option(c("-p", "--primers"), type="character", default=NULL, | |
help="Path to file with primers (EMBOSS/primersearch format)", metavar="character"), | |
make_option(c("-P", "--primersearch_path"), type="character", default="primersearch", | |
help="Path to primersearch executable. (default: primersearch)", metavar="character"), | |
make_option(c("-m", "--mismatch_percent"), type="integer", default=0, | |
help="Permitted mismatches (default: 0)", metavar="character"), | |
make_option(c("-e", "--extract_fasta"), type="logical", action = "store_true", | |
help="Should the fasta fragment be extracted? (Assumes cutseq is installed in same location as primersearch) (default: FALSE)", metavar="logical") | |
); | |
opt_parser = optparse::OptionParser(usage = " | |
A script for running EMBOSS/primersearch and parsing it's results as a table of amplicons. | |
usage: %prog [options]", | |
option_list = option_list, | |
epilogue = "\n\nAuthor: Menachem Sklarz"); | |
opt = optparse::parse_args(opt_parser); | |
# If output not set, set it now: | |
if (is.null(opt$output)) { | |
opt$output = sprintf("%s.primersearch", opt$input) | |
cat(sprintf("Setting output to %s\n",opt$output)) | |
} | |
CMD = sprintf("%s -seqall %s -infile %s -mismatchpercent %s -stdout -auto", | |
opt$primersearch_path, opt$input, opt$primers, opt$mismatch_percent) | |
cat(sprintf("Running command:\n%s\n",CMD)) | |
cat("Waiting for primerseach to complete:\n") | |
tryCatch(expr = { | |
prim_find <- system(command = CMD,intern = T) | |
}, | |
error= function(x) stop(sprintf("Error executing primersearch: %s", x)) | |
) | |
print("This is the output:") | |
cat(sprintf("%s\n",prim_find)) | |
# Define REs to extract: primer name, amplimer number, length, forward and reverse positions and mismatches | |
primNum <- regexpr(pattern = "Primer name ([[:graph:]]+)",text = prim_find, perl=T) | |
amplimer <- regexpr(pattern = "Amplimer ([[:digit:]]+)",text = prim_find, perl=T) | |
sequence <- regexpr(pattern = "\tSequence: ([[:graph:]]+) ",text = prim_find, perl=T) | |
amplen <- regexpr(pattern = "\tAmplimer length: ([[:digit:]]+)",text = prim_find, perl=T) | |
forward_pos <- regexpr(pattern = "hits forward strand at ([[:digit:]]+) with [[:digit:]]+ mismatches",text = prim_find, perl=T) | |
forward_MM <- regexpr(pattern = "hits forward strand at [[:digit:]]+ with ([[:digit:]]+) mismatches",text = prim_find, perl=T) | |
reverse_pos <- regexpr(pattern = "hits reverse strand at \\[([[:digit:]]+)\\] with [[:digit:]]+ mismatches",text = prim_find, perl=T) | |
reverse_MM <- regexpr(pattern = "hits reverse strand at \\[[[:digit:]]+\\] with ([[:digit:]]+) mismatches",text = prim_find, perl=T) | |
# Define function for extracting text info based on text and REhit: | |
extractREhit <- function(mytext,rehit) { | |
mytext <- substring(text = mytext, | |
first = attributes(rehit)$capture.start %>% as.integer, | |
last = (attributes(rehit)$capture.start %>% as.integer) + (attributes(rehit)$capture.length %>% as.integer) - 1) | |
mytext[mytext==""] <- NA | |
return(mytext) | |
} | |
# Works on a vector. "Pull down" values on all subsequent NA elements | |
# e.g. pulldown(c(NA,2,NA,NA,"nuc",NA,NA)) will return (NA,2,2,2,"nuc","nuc","nuc") | |
pulldown <- function(x) { | |
for(i in 2:length(x)) {x[i] <- ifelse(is.na(x[i]),x[i-1],x[i]) }; | |
return(x) | |
} | |
# Create tabular summary of all primers: | |
final<-data.frame(org = prim_find, | |
primer = extractREhit(mytext = prim_find, rehit = primNum) %>% pulldown, | |
amplimer = extractREhit(mytext = prim_find, rehit = amplimer) %>% pulldown, | |
sequence = extractREhit(mytext = prim_find, rehit = sequence), | |
amplen = extractREhit(mytext = prim_find, rehit = amplen), | |
forward_pos = extractREhit(mytext = prim_find, rehit = forward_pos), | |
forward_MM = extractREhit(mytext = prim_find, rehit = forward_MM), | |
reverse_pos = extractREhit(mytext = prim_find, rehit = reverse_pos), | |
reverse_MM = extractREhit(mytext = prim_find, rehit = reverse_MM), | |
stringsAsFactors = FALSE) | |
# Keep only lines with information about the amplimer: | |
final <- final[-which(with(final, | |
is.na(amplen) & | |
is.na(sequence) & | |
is.na(forward_pos) & | |
is.na(forward_MM) & | |
is.na(reverse_pos) & | |
is.na(reverse_MM))),] | |
# Convert to table: For each primer/amplimer, return one line with specific info (if no amplimers, return all NAs): | |
results_table <- ddply(.data = final,.variables = .(primer,amplimer),.fun = function(x) { | |
if(dim(x)[1] > 2) { | |
retval <- c(sequence = x$sequence[!is.na(x$sequence)] %>% as.character, | |
amplen = x$amplen[!is.na(x$amplen)] %>% as.numeric, | |
forward_pos = x$forward_pos[!is.na(x$forward_pos)] %>% as.numeric, | |
forward_MM = x$forward_MM[!is.na(x$forward_MM)] %>% as.numeric, | |
reverse_pos = x$reverse_pos[!is.na(x$reverse_pos)] %>% as.numeric, | |
reverse_MM = x$reverse_MM[!is.na(x$reverse_MM)] %>% as.numeric) | |
} else { | |
retval <- c(sequence = NA, | |
amplen = NA, | |
forward_pos = NA, | |
forward_MM = NA, | |
reverse_pos = NA, | |
reverse_MM = NA) | |
} | |
# print(retval) | |
return(retval) | |
}) | |
results_table$amplen <- as.numeric(results_table$amplen) | |
results_table$forward_pos <- as.numeric(results_table$forward_pos) | |
results_table$reverse_pos <- as.numeric(results_table$reverse_pos) | |
write.table(x = results_table, | |
file = opt$output, | |
append = F, | |
quote = F, | |
sep = "\t", | |
row.names = F, | |
col.names = T) | |
# If user requested extraction of sequences: | |
if (exists(x = "extract_fasta",where = opt)) { | |
# stop("This option is not implemented yet. Sorry...") | |
# Load Biostrings library: | |
if(not("Biostrings") %in% installed.packages()) { | |
cat("'Biostrings' is not installed.\nYou must install it when selecting extract_fasta option. \nInstall by running the following commands:\ninstall.packages('Biostrings')") | |
break | |
} | |
library(Biostrings) | |
# Create fasta file index | |
fastaindex <- fasta.index(filepath = opt$input) | |
# For each blast result: | |
for(i in 1:dim(results_table)[1]) { | |
# If it is in reverse: (see documentation below for non-reverse) | |
newseq <- readBStringSet(fastaindex[grep(pattern = results_table[i,"sequence"], | |
x = fastaindex$desc),] ) | |
newseq <- subseq(newseq, | |
start = results_table$forward_pos[i], | |
end = results_table$forward_pos[i] + results_table$amplen[i]) | |
names(newseq) <- paste(names(newseq), | |
results_table$forward_pos[i], | |
results_table$forward_pos[i] + results_table$amplen[i], | |
sep="_") | |
# Add coordinates to the seq name: | |
# Write the sequence to file | |
writeXStringSet(newseq, | |
filepath = sprintf("%s.fasta",opt$output), | |
append = TRUE, | |
compress=FALSE, | |
format="fasta") | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment