Created
June 5, 2018 13:33
-
-
Save DomBennett/bfad277b42c949b188e8965df1b9a84d to your computer and use it in GitHub Desktop.
Get consensus from forward and reverse sequences in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Installed | |
already_installed <- installed.packages() | |
# CRAN packages | |
cran_deps <- c("ape", "reshape2", "phangorn", "stringi", "stringr") | |
cran_deps <- cran_deps[!cran_deps %in% already_installed] | |
for (dep in cran_deps) { | |
install.packages(dep) | |
} | |
# Bioconductor packages | |
source("https://bioconductor.org/biocLite.R") | |
bioc_deps <- c("DECIPHER", "Biostrings", "sangerseqR") | |
bioc_deps <- bioc_deps[!bioc_deps %in% already_installed] | |
for (dep in bioc_deps) { | |
biocLite(dep) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# LIBS ---- | |
library(sangeranalyseR) | |
source('stubs.R') | |
# VARS ---- | |
fwd_fldr <- '2853893' | |
rev_fldr <- '2853895' | |
# PARSE FOLDERS ---- | |
# list all .ab1 files | |
fwd_files <- list.files(path = fwd_fldr, pattern = '.ab1') | |
rev_files <- list.files(path = rev_fldr, pattern = '.ab1') | |
# name the vectors by the IDs | |
names(fwd_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1', | |
replacement = '', x = fwd_files) | |
names(rev_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1', | |
replacement = '', x = rev_files) | |
if (!all(names(rev_files) %in% names(fwd_files)) & | |
!all(names(fwd_files) %in% names(rev_files))) { | |
warning('Not all names are shared between fwd and rev') | |
} | |
# LOOP ---- | |
# see: https://github.com/roblanf/sangeranalyseR | |
conseqs <- vector(mode = 'list', length = length(fwd_files)) | |
names(conseqs) <- names(fwd_files) | |
for (nm in names(fwd_files)) { | |
cat('.... working on [', nm, ']\n', sep = '') | |
fwd_file <- fwd_files[[nm]] | |
rev_file <- rev_files[[nm]] | |
fwd <- readsangerseq(filename = file.path(fwd_fldr, fwd_file)) | |
rev <- readsangerseq(filename = file.path(rev_fldr, rev_file)) | |
fwd <- primarySeq(fwd) | |
rev <- primarySeq(rev) | |
# get reverse complement | |
rev <- reverseComplement(rev) | |
# get unaligned set of the reads we wish to merge | |
reads <- DNAStringSet(c(as.character(fwd), as.character(rev))) | |
names(reads) <- c('fwd', 'rev') | |
merged_reads <- merge.reads(reads) | |
# add to store | |
conseqs[[nm]] <- merged_reads[['consensus']] | |
} | |
# WRITE ---- | |
conseqs <- DNAStringSet(conseqs) | |
writeXStringSet(x = conseqs, filepath = 'consensus.fas') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# minor correction: https://github.com/roblanf/sangeranalyseR/issues/25 | |
merge.reads <- function(readset, ref.aa.seq = NULL, minInformation = 0.5, threshold = 0.5, | |
processors = NULL, genetic.code = GENETIC_CODE, accept.stop.codons = TRUE, | |
reading.frame = 1) { | |
processors = sangeranalyseR:::get.processors(processors) | |
if (length(readset) < 2) { | |
return(NULL) | |
} | |
if (!is.null(ref.aa.seq)) { | |
print("Correcting frameshifts in reads using amino acid reference sequence") | |
corrected = CorrectFrameshifts(myXStringSet = readset, | |
myAAStringSet = AAStringSet(ref.aa.seq), geneticCode = genetic.code, | |
type = "both", processors = processors, verbose = FALSE) | |
readset = corrected$sequences | |
indels = sangeranalyseR:::get.indel.df(corrected$indels) | |
stops = as.numeric(unlist(mclapply(readset, count.stop.codons, | |
reading.frame, genetic.code, mc.cores = processors))) | |
stops.df = data.frame(read = names(readset), stop.codons = stops) | |
readset.lengths = unlist(lapply(readset, function(x) length(x))) | |
readset = readset[which(readset.lengths > 0)] | |
} else { | |
indels = NULL | |
stops.df = NULL | |
} | |
if (length(readset) < 2) { | |
return(NULL) | |
} | |
if (accept.stop.codons == FALSE) { | |
print("Removing reads with stop codons") | |
if (is.null(ref.aa.seq)) { | |
stops = as.numeric(unlist(mclapply(readset, count.stop.codons, | |
reading.frame, genetic.code, mc.cores = processors))) | |
stops.df = data.frame(read = names(readset), stop.codons = stops) | |
} | |
old_length = length(readset) | |
readset = readset[which(stops == 0)] | |
print(sprintf("%d reads with stop codons removed", old_length - | |
length(readset))) | |
} | |
if (length(readset) < 2) { | |
return(NULL) | |
} | |
print("Aligning reads") | |
if (!is.null(ref.aa.seq)) { | |
aln = AlignTranslation(readset, geneticCode = genetic.code, | |
processors = processors, verbose = FALSE) | |
} else { | |
aln = AlignSeqs(readset, processors = processors, verbose = FALSE) | |
} | |
if (is.null(names(aln))) { | |
names(aln) = paste("read", 1:length(aln), sep = "_") | |
} | |
print("Calling consensus sequence") | |
consensus = ConsensusSequence(aln, minInformation = minInformation, | |
includeTerminalGaps = TRUE, ignoreNonBases = TRUE, threshold = threshold, | |
noConsensusChar = "-", ambiguity = TRUE)[[1]] | |
print("Calculating differences between reads and consensus") | |
diffs = mclapply(aln, sangeranalyseR:::n.pairwise.diffs, subject = consensus, | |
mc.cores = processors) | |
diffs = do.call(rbind, diffs) | |
diffs.df = data.frame(name = names(aln), pairwise.diffs.to.consensus = diffs[, | |
1], unused.chars = diffs[, 2]) | |
rownames(diffs.df) = NULL | |
dist = DistanceMatrix(aln, correction = "Jukes-Cantor", penalizeGapLetterMatches = FALSE, | |
processors = processors, verbose = FALSE) | |
dend = IdClusters(dist, type = "dendrogram", processors = processors, | |
verbose = FALSE) | |
aln2 = c(aln, DNAStringSet(consensus)) | |
names(aln2)[length(aln2)] = "consensus" | |
consensus.gapfree = DNAString(paste(as.matrix(del.gaps(consensus)), collapse = "")) | |
sp.df = sangeranalyseR:::count.coincident.sp(aln, processors = processors) | |
merged.read = list(consensus = consensus.gapfree, alignment = aln2, | |
differences = diffs.df, distance.matrix = dist, dendrogram = dend, | |
indels = indels, stop.codons = stops.df, secondary.peak.columns = sp.df) | |
class(merged.read) = "merged.read" | |
return(merged.read) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment