DomBennett/install_deps.R

## install_deps.R
# Installed
already_installed <- installed.packages()

# CRAN packages
cran_deps <- c("ape", "reshape2", "phangorn", "stringi", "stringr")
cran_deps <- cran_deps[!cran_deps %in% already_installed]
for (dep in cran_deps) {
  install.packages(dep)
}

# Bioconductor packages
source("https://bioconductor.org/biocLite.R")
bioc_deps <- c("DECIPHER", "Biostrings", "sangerseqR")
bioc_deps <- bioc_deps[!bioc_deps %in% already_installed]
for (dep in bioc_deps) {
  biocLite(dep)
}

## merge_sequences.R
# LIBS ----
library(sangeranalyseR)
source('stubs.R')

# VARS ----
fwd_fldr <- '2853893'
rev_fldr <- '2853895'

# PARSE FOLDERS ----
# list all .ab1 files
fwd_files <- list.files(path = fwd_fldr, pattern = '.ab1')
rev_files <- list.files(path = rev_fldr, pattern = '.ab1')
# name the vectors by the IDs
names(fwd_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1',
                        replacement = '', x = fwd_files)
names(rev_files) <- sub(pattern = '\\-16s(a|b)r\\-(L|H)\\.ab1',
                        replacement = '', x = rev_files)
if (!all(names(rev_files) %in% names(fwd_files)) &
    !all(names(fwd_files) %in% names(rev_files))) {
  warning('Not all names are shared between fwd and rev')
}

# LOOP ----
# see: https://github.com/roblanf/sangeranalyseR
conseqs <- vector(mode = 'list', length = length(fwd_files))
names(conseqs) <- names(fwd_files)
for (nm in names(fwd_files)) {
  cat('.... working on [', nm, ']\n', sep = '')
  fwd_file <- fwd_files[[nm]]
  rev_file <- rev_files[[nm]]
  fwd <- readsangerseq(filename = file.path(fwd_fldr, fwd_file))
  rev <- readsangerseq(filename = file.path(rev_fldr, rev_file))
  fwd <- primarySeq(fwd)
  rev <- primarySeq(rev)
  # get reverse complement
  rev <- reverseComplement(rev)
  # get unaligned set of the reads we wish to merge
  reads <- DNAStringSet(c(as.character(fwd), as.character(rev)))
  names(reads) <- c('fwd', 'rev')
  merged_reads <- merge.reads(reads)
  # add to store
  conseqs[[nm]] <- merged_reads[['consensus']]
}

# WRITE ----
conseqs <- DNAStringSet(conseqs)
writeXStringSet(x = conseqs, filepath = 'consensus.fas')

## stubs.R
# minor correction: https://github.com/roblanf/sangeranalyseR/issues/25
merge.reads <- function(readset, ref.aa.seq = NULL, minInformation = 0.5, threshold = 0.5,
                        processors = NULL, genetic.code = GENETIC_CODE, accept.stop.codons = TRUE,
                        reading.frame = 1) {
  processors = sangeranalyseR:::get.processors(processors)
  if (length(readset) < 2) {
    return(NULL)
  }
  if (!is.null(ref.aa.seq)) {
    print("Correcting frameshifts in reads using amino acid reference sequence")
    corrected = CorrectFrameshifts(myXStringSet = readset,
                                   myAAStringSet = AAStringSet(ref.aa.seq), geneticCode = genetic.code,
                                   type = "both", processors = processors, verbose = FALSE)
    readset = corrected$sequences
    indels = sangeranalyseR:::get.indel.df(corrected$indels)
    stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
                                       reading.frame, genetic.code, mc.cores = processors)))
    stops.df = data.frame(read = names(readset), stop.codons = stops)
    readset.lengths = unlist(lapply(readset, function(x) length(x)))
    readset = readset[which(readset.lengths > 0)]
  } else {
    indels = NULL
    stops.df = NULL
  }
  if (length(readset) < 2) {
    return(NULL)
  }
  if (accept.stop.codons == FALSE) {
    print("Removing reads with stop codons")
    if (is.null(ref.aa.seq)) {
      stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
                                         reading.frame, genetic.code, mc.cores = processors)))
      stops.df = data.frame(read = names(readset), stop.codons = stops)
    }
    old_length = length(readset)
    readset = readset[which(stops == 0)]
    print(sprintf("%d reads with stop codons removed", old_length -
                    length(readset)))
  }
  if (length(readset) < 2) {
    return(NULL)
  }
  print("Aligning reads")
  if (!is.null(ref.aa.seq)) {
    aln = AlignTranslation(readset, geneticCode = genetic.code,
                           processors = processors, verbose = FALSE)
  } else {
    aln = AlignSeqs(readset, processors = processors, verbose = FALSE)
  }
  if (is.null(names(aln))) {
    names(aln) = paste("read", 1:length(aln), sep = "_")
  }
  print("Calling consensus sequence")
  consensus = ConsensusSequence(aln, minInformation = minInformation,
                                includeTerminalGaps = TRUE, ignoreNonBases = TRUE, threshold = threshold,
                                noConsensusChar = "-", ambiguity = TRUE)[[1]]
  print("Calculating differences between reads and consensus")
  diffs = mclapply(aln, sangeranalyseR:::n.pairwise.diffs, subject = consensus,
                   mc.cores = processors)
  diffs = do.call(rbind, diffs)
  diffs.df = data.frame(name = names(aln), pairwise.diffs.to.consensus = diffs[,
                                                                               1], unused.chars = diffs[, 2])
  rownames(diffs.df) = NULL
  dist = DistanceMatrix(aln, correction = "Jukes-Cantor", penalizeGapLetterMatches = FALSE,
                        processors = processors, verbose = FALSE)
  dend = IdClusters(dist, type = "dendrogram", processors = processors,
                    verbose = FALSE)
  aln2 = c(aln, DNAStringSet(consensus))
  names(aln2)[length(aln2)] = "consensus"
  consensus.gapfree = DNAString(paste(as.matrix(del.gaps(consensus)), collapse = ""))
  sp.df = sangeranalyseR:::count.coincident.sp(aln, processors = processors)
  merged.read = list(consensus = consensus.gapfree, alignment = aln2,
                     differences = diffs.df, distance.matrix = dist, dendrogram = dend,
                     indels = indels, stop.codons = stops.df, secondary.peak.columns = sp.df)
  class(merged.read) = "merged.read"
  return(merged.read)
}
	# Installed
	already_installed <- installed.packages()

	# CRAN packages
	cran_deps <- c("ape", "reshape2", "phangorn", "stringi", "stringr")
	cran_deps <- cran_deps[!cran_deps %in% already_installed]
	for (dep in cran_deps) {
	install.packages(dep)
	}

	# Bioconductor packages
	source("https://bioconductor.org/biocLite.R")
	bioc_deps <- c("DECIPHER", "Biostrings", "sangerseqR")
	bioc_deps <- bioc_deps[!bioc_deps %in% already_installed]
	for (dep in bioc_deps) {
	biocLite(dep)
	}
	# LIBS ----
	library(sangeranalyseR)
	source('stubs.R')

	# VARS ----
	fwd_fldr <- '2853893'
	rev_fldr <- '2853895'

	# PARSE FOLDERS ----
	# list all .ab1 files
	fwd_files <- list.files(path = fwd_fldr, pattern = '.ab1')
	rev_files <- list.files(path = rev_fldr, pattern = '.ab1')
	# name the vectors by the IDs
	names(fwd_files) <- sub(pattern = '\\-16s(a\|b)r\\-(L\|H)\\.ab1',
	replacement = '', x = fwd_files)
	names(rev_files) <- sub(pattern = '\\-16s(a\|b)r\\-(L\|H)\\.ab1',
	replacement = '', x = rev_files)
	if (!all(names(rev_files) %in% names(fwd_files)) &
	!all(names(fwd_files) %in% names(rev_files))) {
	warning('Not all names are shared between fwd and rev')
	}

	# LOOP ----
	# see: https://github.com/roblanf/sangeranalyseR
	conseqs <- vector(mode = 'list', length = length(fwd_files))
	names(conseqs) <- names(fwd_files)
	for (nm in names(fwd_files)) {
	cat('.... working on [', nm, ']\n', sep = '')
	fwd_file <- fwd_files[[nm]]
	rev_file <- rev_files[[nm]]
	fwd <- readsangerseq(filename = file.path(fwd_fldr, fwd_file))
	rev <- readsangerseq(filename = file.path(rev_fldr, rev_file))
	fwd <- primarySeq(fwd)
	rev <- primarySeq(rev)
	# get reverse complement
	rev <- reverseComplement(rev)
	# get unaligned set of the reads we wish to merge
	reads <- DNAStringSet(c(as.character(fwd), as.character(rev)))
	names(reads) <- c('fwd', 'rev')
	merged_reads <- merge.reads(reads)
	# add to store
	conseqs[[nm]] <- merged_reads[['consensus']]
	}

	# WRITE ----
	conseqs <- DNAStringSet(conseqs)
	writeXStringSet(x = conseqs, filepath = 'consensus.fas')
	# minor correction: https://github.com/roblanf/sangeranalyseR/issues/25
	merge.reads <- function(readset, ref.aa.seq = NULL, minInformation = 0.5, threshold = 0.5,
	processors = NULL, genetic.code = GENETIC_CODE, accept.stop.codons = TRUE,
	reading.frame = 1) {
	processors = sangeranalyseR:::get.processors(processors)
	if (length(readset) < 2) {
	return(NULL)
	}
	if (!is.null(ref.aa.seq)) {
	print("Correcting frameshifts in reads using amino acid reference sequence")
	corrected = CorrectFrameshifts(myXStringSet = readset,
	myAAStringSet = AAStringSet(ref.aa.seq), geneticCode = genetic.code,
	type = "both", processors = processors, verbose = FALSE)
	readset = corrected$sequences
	indels = sangeranalyseR:::get.indel.df(corrected$indels)
	stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
	reading.frame, genetic.code, mc.cores = processors)))
	stops.df = data.frame(read = names(readset), stop.codons = stops)
	readset.lengths = unlist(lapply(readset, function(x) length(x)))
	readset = readset[which(readset.lengths > 0)]
	} else {
	indels = NULL
	stops.df = NULL
	}
	if (length(readset) < 2) {
	return(NULL)
	}
	if (accept.stop.codons == FALSE) {
	print("Removing reads with stop codons")
	if (is.null(ref.aa.seq)) {
	stops = as.numeric(unlist(mclapply(readset, count.stop.codons,
	reading.frame, genetic.code, mc.cores = processors)))
	stops.df = data.frame(read = names(readset), stop.codons = stops)
	}
	old_length = length(readset)
	readset = readset[which(stops == 0)]
	print(sprintf("%d reads with stop codons removed", old_length -
	length(readset)))
	}
	if (length(readset) < 2) {
	return(NULL)
	}
	print("Aligning reads")
	if (!is.null(ref.aa.seq)) {
	aln = AlignTranslation(readset, geneticCode = genetic.code,
	processors = processors, verbose = FALSE)
	} else {
	aln = AlignSeqs(readset, processors = processors, verbose = FALSE)
	}
	if (is.null(names(aln))) {
	names(aln) = paste("read", 1:length(aln), sep = "_")
	}
	print("Calling consensus sequence")
	consensus = ConsensusSequence(aln, minInformation = minInformation,
	includeTerminalGaps = TRUE, ignoreNonBases = TRUE, threshold = threshold,
	noConsensusChar = "-", ambiguity = TRUE)[[1]]
	print("Calculating differences between reads and consensus")
	diffs = mclapply(aln, sangeranalyseR:::n.pairwise.diffs, subject = consensus,
	mc.cores = processors)
	diffs = do.call(rbind, diffs)
	diffs.df = data.frame(name = names(aln), pairwise.diffs.to.consensus = diffs[,
	1], unused.chars = diffs[, 2])
	rownames(diffs.df) = NULL
	dist = DistanceMatrix(aln, correction = "Jukes-Cantor", penalizeGapLetterMatches = FALSE,
	processors = processors, verbose = FALSE)
	dend = IdClusters(dist, type = "dendrogram", processors = processors,
	verbose = FALSE)
	aln2 = c(aln, DNAStringSet(consensus))
	names(aln2)[length(aln2)] = "consensus"
	consensus.gapfree = DNAString(paste(as.matrix(del.gaps(consensus)), collapse = ""))
	sp.df = sangeranalyseR:::count.coincident.sp(aln, processors = processors)
	merged.read = list(consensus = consensus.gapfree, alignment = aln2,
	differences = diffs.df, distance.matrix = dist, dendrogram = dend,
	indels = indels, stop.codons = stops.df, secondary.peak.columns = sp.df)
	class(merged.read) = "merged.read"
	return(merged.read)
	}