Skip to content

Instantly share code, notes, and snippets.

@DomBennett
Created June 5, 2018 13:33
Show Gist options
  • Save DomBennett/bfad277b42c949b188e8965df1b9a84d to your computer and use it in GitHub Desktop.
Save DomBennett/bfad277b42c949b188e8965df1b9a84d to your computer and use it in GitHub Desktop.
Get consensus from forward and reverse sequences in R
# Installed
already_installed <- installed.packages()
# CRAN packages
cran_deps <- c("ape", "reshape2", "phangorn", "stringi", "stringr")
cran_deps <- cran_deps[!cran_deps %in% already_installed]
for (dep in cran_deps) {
install.packages(dep)
}
# Bioconductor packages
source("https://bioconductor.org/biocLite.R")
bioc_deps <- c("DECIPHER", "Biostrings", "sangerseqR")
bioc_deps <- bioc_deps[!bioc_deps %in% already_installed]
for (dep in bioc_deps) {
biocLite(dep)
}
# LIBS ----
library(sangeranalyseR)
source('stubs.R')
# VARS ----
fwd_fldr <- '2853893'
rev_fldr <- '2853895'
# PARSE FOLDERS ----
# list all .ab1 files
fwd_files <- list.files(path = fwd_fldr, pattern = '.ab1')
rev_files <- list.files(path = rev_fldr, pattern = '.ab1')
# name the vectors by the IDs
names(fwd_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1',
replacement = '', x = fwd_files)
names(rev_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1',
replacement = '', x = rev_files)
if (!all(names(rev_files) %in% names(fwd_files)) &
!all(names(fwd_files) %in% names(rev_files))) {
warning('Not all names are shared between fwd and rev')
}
# LOOP ----
# see: https://github.com/roblanf/sangeranalyseR
conseqs <- vector(mode = 'list', length = length(fwd_files))
names(conseqs) <- names(fwd_files)
for (nm in names(fwd_files)) {
cat('.... working on [', nm, ']\n', sep = '')
fwd_file <- fwd_files[[nm]]
rev_file <- rev_files[[nm]]
fwd <- readsangerseq(filename = file.path(fwd_fldr, fwd_file))
rev <- readsangerseq(filename = file.path(rev_fldr, rev_file))
fwd <- primarySeq(fwd)
rev <- primarySeq(rev)
# get reverse complement
rev <- reverseComplement(rev)
# get unaligned set of the reads we wish to merge
reads <- DNAStringSet(c(as.character(fwd), as.character(rev)))
names(reads) <- c('fwd', 'rev')
merged_reads <- merge.reads(reads)
# add to store
conseqs[[nm]] <- merged_reads[['consensus']]
}
# WRITE ----
conseqs <- DNAStringSet(conseqs)
writeXStringSet(x = conseqs, filepath = 'consensus.fas')
# minor correction: https://github.com/roblanf/sangeranalyseR/issues/25
merge.reads <- function(readset, ref.aa.seq = NULL, minInformation = 0.5, threshold = 0.5,
processors = NULL, genetic.code = GENETIC_CODE, accept.stop.codons = TRUE,
reading.frame = 1) {
processors = sangeranalyseR:::get.processors(processors)
if (length(readset) < 2) {
return(NULL)
}
if (!is.null(ref.aa.seq)) {
print("Correcting frameshifts in reads using amino acid reference sequence")
corrected = CorrectFrameshifts(myXStringSet = readset,
myAAStringSet = AAStringSet(ref.aa.seq), geneticCode = genetic.code,
type = "both", processors = processors, verbose = FALSE)
readset = corrected$sequences
indels = sangeranalyseR:::get.indel.df(corrected$indels)
stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
reading.frame, genetic.code, mc.cores = processors)))
stops.df = data.frame(read = names(readset), stop.codons = stops)
readset.lengths = unlist(lapply(readset, function(x) length(x)))
readset = readset[which(readset.lengths > 0)]
} else {
indels = NULL
stops.df = NULL
}
if (length(readset) < 2) {
return(NULL)
}
if (accept.stop.codons == FALSE) {
print("Removing reads with stop codons")
if (is.null(ref.aa.seq)) {
stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
reading.frame, genetic.code, mc.cores = processors)))
stops.df = data.frame(read = names(readset), stop.codons = stops)
}
old_length = length(readset)
readset = readset[which(stops == 0)]
print(sprintf("%d reads with stop codons removed", old_length -
length(readset)))
}
if (length(readset) < 2) {
return(NULL)
}
print("Aligning reads")
if (!is.null(ref.aa.seq)) {
aln = AlignTranslation(readset, geneticCode = genetic.code,
processors = processors, verbose = FALSE)
} else {
aln = AlignSeqs(readset, processors = processors, verbose = FALSE)
}
if (is.null(names(aln))) {
names(aln) = paste("read", 1:length(aln), sep = "_")
}
print("Calling consensus sequence")
consensus = ConsensusSequence(aln, minInformation = minInformation,
includeTerminalGaps = TRUE, ignoreNonBases = TRUE, threshold = threshold,
noConsensusChar = "-", ambiguity = TRUE)[[1]]
print("Calculating differences between reads and consensus")
diffs = mclapply(aln, sangeranalyseR:::n.pairwise.diffs, subject = consensus,
mc.cores = processors)
diffs = do.call(rbind, diffs)
diffs.df = data.frame(name = names(aln), pairwise.diffs.to.consensus = diffs[,
1], unused.chars = diffs[, 2])
rownames(diffs.df) = NULL
dist = DistanceMatrix(aln, correction = "Jukes-Cantor", penalizeGapLetterMatches = FALSE,
processors = processors, verbose = FALSE)
dend = IdClusters(dist, type = "dendrogram", processors = processors,
verbose = FALSE)
aln2 = c(aln, DNAStringSet(consensus))
names(aln2)[length(aln2)] = "consensus"
consensus.gapfree = DNAString(paste(as.matrix(del.gaps(consensus)), collapse = ""))
sp.df = sangeranalyseR:::count.coincident.sp(aln, processors = processors)
merged.read = list(consensus = consensus.gapfree, alignment = aln2,
differences = diffs.df, distance.matrix = dist, dendrogram = dend,
indels = indels, stop.codons = stops.df, secondary.peak.columns = sp.df)
class(merged.read) = "merged.read"
return(merged.read)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment