Skip to content

Instantly share code, notes, and snippets.

@sklarz-bgu
Created May 3, 2017 07:06
Show Gist options
  • Save sklarz-bgu/d4cbe33a2e60f5ef5c2d5b10da2e8eed to your computer and use it in GitHub Desktop.
Save sklarz-bgu/d4cbe33a2e60f5ef5c2d5b10da2e8eed to your computer and use it in GitHub Desktop.
Wrapper for primer_search from EMBOSS
if(!(all(c("plyr","optparse","tools","magrittr") %in% installed.packages()))) {
cat("'plyr','optparse','tools','magrittr' are not installed.\nYou must install them for this script to work!\nInstall by running the following commands:\ninstall.packages('plyr')\ninstall.packages('optparse')\ninstall.packages('tools')\ninstall.packages('magrittr')")
}
library(plyr)
library(optparse)
library(tools)
library(magrittr)
# AUTHOR: Menachem Sklarz
# This script
# paste("Rscript",
# "primers.R",
# "--input", "Staph_aureus_example.nucl.merge.fasta",
# "--output", "Staph_aureus_example.nucl.merge.primersearch",
# "--primers", "SCC_primers",
# "--primersearch_path", "/path/to/emboss/primersearch",
# "--mismatch_percent","10") %>% system
# paste("Rscript","primers.R","-h") %>% system
args = commandArgs(trailingOnly = F)
# Get dir of script. Not always useful.
filepos = args %>% grep("--file=",x = .)
curfile = sub(pattern = "--file=",
replacement = "",
x = args[filepos]) %>% file_path_as_absolute
print(curfile)
args = commandArgs(trailingOnly=TRUE)
option_list = list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Path to fasta file of contigs", metavar="character"),
make_option(c("-o", "--output"), type="character", default=NULL,
help="Path to output file (default: <input>.primersearch)", metavar="character"),
make_option(c("-p", "--primers"), type="character", default=NULL,
help="Path to file with primers (EMBOSS/primersearch format)", metavar="character"),
make_option(c("-P", "--primersearch_path"), type="character", default="primersearch",
help="Path to primersearch executable. (default: primersearch)", metavar="character"),
make_option(c("-m", "--mismatch_percent"), type="integer", default=0,
help="Permitted mismatches (default: 0)", metavar="character"),
make_option(c("-e", "--extract_fasta"), type="logical", action = "store_true",
help="Should the fasta fragment be extracted? (Assumes cutseq is installed in same location as primersearch) (default: FALSE)", metavar="logical")
);
opt_parser = optparse::OptionParser(usage = "
A script for running EMBOSS/primersearch and parsing it's results as a table of amplicons.
usage: %prog [options]",
option_list = option_list,
epilogue = "\n\nAuthor: Menachem Sklarz");
opt = optparse::parse_args(opt_parser);
# If output not set, set it now:
if (is.null(opt$output)) {
opt$output = sprintf("%s.primersearch", opt$input)
cat(sprintf("Setting output to %s\n",opt$output))
}
CMD = sprintf("%s -seqall %s -infile %s -mismatchpercent %s -stdout -auto",
opt$primersearch_path, opt$input, opt$primers, opt$mismatch_percent)
cat(sprintf("Running command:\n%s\n",CMD))
cat("Waiting for primerseach to complete:\n")
tryCatch(expr = {
prim_find <- system(command = CMD,intern = T)
},
error= function(x) stop(sprintf("Error executing primersearch: %s", x))
)
print("This is the output:")
cat(sprintf("%s\n",prim_find))
# Define REs to extract: primer name, amplimer number, length, forward and reverse positions and mismatches
primNum <- regexpr(pattern = "Primer name ([[:graph:]]+)",text = prim_find, perl=T)
amplimer <- regexpr(pattern = "Amplimer ([[:digit:]]+)",text = prim_find, perl=T)
sequence <- regexpr(pattern = "\tSequence: ([[:graph:]]+) ",text = prim_find, perl=T)
amplen <- regexpr(pattern = "\tAmplimer length: ([[:digit:]]+)",text = prim_find, perl=T)
forward_pos <- regexpr(pattern = "hits forward strand at ([[:digit:]]+) with [[:digit:]]+ mismatches",text = prim_find, perl=T)
forward_MM <- regexpr(pattern = "hits forward strand at [[:digit:]]+ with ([[:digit:]]+) mismatches",text = prim_find, perl=T)
reverse_pos <- regexpr(pattern = "hits reverse strand at \\[([[:digit:]]+)\\] with [[:digit:]]+ mismatches",text = prim_find, perl=T)
reverse_MM <- regexpr(pattern = "hits reverse strand at \\[[[:digit:]]+\\] with ([[:digit:]]+) mismatches",text = prim_find, perl=T)
# Define function for extracting text info based on text and REhit:
extractREhit <- function(mytext,rehit) {
mytext <- substring(text = mytext,
first = attributes(rehit)$capture.start %>% as.integer,
last = (attributes(rehit)$capture.start %>% as.integer) + (attributes(rehit)$capture.length %>% as.integer) - 1)
mytext[mytext==""] <- NA
return(mytext)
}
# Works on a vector. "Pull down" values on all subsequent NA elements
# e.g. pulldown(c(NA,2,NA,NA,"nuc",NA,NA)) will return (NA,2,2,2,"nuc","nuc","nuc")
pulldown <- function(x) {
for(i in 2:length(x)) {x[i] <- ifelse(is.na(x[i]),x[i-1],x[i]) };
return(x)
}
# Create tabular summary of all primers:
final<-data.frame(org = prim_find,
primer = extractREhit(mytext = prim_find, rehit = primNum) %>% pulldown,
amplimer = extractREhit(mytext = prim_find, rehit = amplimer) %>% pulldown,
sequence = extractREhit(mytext = prim_find, rehit = sequence),
amplen = extractREhit(mytext = prim_find, rehit = amplen),
forward_pos = extractREhit(mytext = prim_find, rehit = forward_pos),
forward_MM = extractREhit(mytext = prim_find, rehit = forward_MM),
reverse_pos = extractREhit(mytext = prim_find, rehit = reverse_pos),
reverse_MM = extractREhit(mytext = prim_find, rehit = reverse_MM),
stringsAsFactors = FALSE)
# Keep only lines with information about the amplimer:
final <- final[-which(with(final,
is.na(amplen) &
is.na(sequence) &
is.na(forward_pos) &
is.na(forward_MM) &
is.na(reverse_pos) &
is.na(reverse_MM))),]
# Convert to table: For each primer/amplimer, return one line with specific info (if no amplimers, return all NAs):
results_table <- ddply(.data = final,.variables = .(primer,amplimer),.fun = function(x) {
if(dim(x)[1] > 2) {
retval <- c(sequence = x$sequence[!is.na(x$sequence)] %>% as.character,
amplen = x$amplen[!is.na(x$amplen)] %>% as.numeric,
forward_pos = x$forward_pos[!is.na(x$forward_pos)] %>% as.numeric,
forward_MM = x$forward_MM[!is.na(x$forward_MM)] %>% as.numeric,
reverse_pos = x$reverse_pos[!is.na(x$reverse_pos)] %>% as.numeric,
reverse_MM = x$reverse_MM[!is.na(x$reverse_MM)] %>% as.numeric)
} else {
retval <- c(sequence = NA,
amplen = NA,
forward_pos = NA,
forward_MM = NA,
reverse_pos = NA,
reverse_MM = NA)
}
# print(retval)
return(retval)
})
results_table$amplen <- as.numeric(results_table$amplen)
results_table$forward_pos <- as.numeric(results_table$forward_pos)
results_table$reverse_pos <- as.numeric(results_table$reverse_pos)
write.table(x = results_table,
file = opt$output,
append = F,
quote = F,
sep = "\t",
row.names = F,
col.names = T)
# If user requested extraction of sequences:
if (exists(x = "extract_fasta",where = opt)) {
# stop("This option is not implemented yet. Sorry...")
# Load Biostrings library:
if(not("Biostrings") %in% installed.packages()) {
cat("'Biostrings' is not installed.\nYou must install it when selecting extract_fasta option. \nInstall by running the following commands:\ninstall.packages('Biostrings')")
break
}
library(Biostrings)
# Create fasta file index
fastaindex <- fasta.index(filepath = opt$input)
# For each blast result:
for(i in 1:dim(results_table)[1]) {
# If it is in reverse: (see documentation below for non-reverse)
newseq <- readBStringSet(fastaindex[grep(pattern = results_table[i,"sequence"],
x = fastaindex$desc),] )
newseq <- subseq(newseq,
start = results_table$forward_pos[i],
end = results_table$forward_pos[i] + results_table$amplen[i])
names(newseq) <- paste(names(newseq),
results_table$forward_pos[i],
results_table$forward_pos[i] + results_table$amplen[i],
sep="_")
# Add coordinates to the seq name:
# Write the sequence to file
writeXStringSet(newseq,
filepath = sprintf("%s.fasta",opt$output),
append = TRUE,
compress=FALSE,
format="fasta")
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment