telatin/dada-its.R

## dada-its.R
library(dada2)
library(ggplot)
packageVersion("dada2")
library(ShortRead)
packageVersion("ShortRead")
library(Biostrings)
packageVersion("Biostrings")
path <- "/Volumes/Informatics/telatin/qibx/ITS/"
prmr_funcn <- function(prmr) {
  prmr_dna <- DNAString(prmr)
  cmplmnt <- toString(complement(prmr_dna))
  rvrse <- toString(reverse(prmr_dna))
  rvrse_cmplmnt <- toString(reverseComplement(prmr_dna))
  return(c(prmr, cmplmnt, rvrse, rvrse_cmplmnt))
}


vcnt_funcn <- function(prmr, filt_seq) {
  vcount_pattern <- vcountPattern(prmr, DNAStringSet(filt_seq), fixed = FALSE)
  return(sum(vcount_pattern))
}

ITS1f <- "CTTGGTCATTTAGAGGAAGTAA"  ## Define your primer sequence here
ITS4 <- "GCTGCGTTCTTCATCGATGC"
ITS1f.form <- prmr_funcn(ITS1f)
ITS4.form <- prmr_funcn(ITS4)
primer.form <- c(ITS1f.form, ITS4.form)

Mock.fnFs <- sort(list.files(path, pattern = "R1.fastq.gz", full.names = TRUE))
Mock.fnRs <- sort(list.files(path, pattern = "R2.fastq.gz", full.names = TRUE))
mock.names <- sapply(strsplit(basename(Mock.fnFs), "_"), "[", 1)
mock.names  ## Lists the names of all the sequence files

# Remove Ns
Mock.fnFs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R1_filtN.fastq.gz"))
Mock.fnRs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R2_filtN.fastq.gz"))
filterAndTrim(Mock.fnFs, Mock.fnFs.filtN, Mock.fnRs, Mock.fnRs.filtN, maxN = 0,
              multithread = TRUE)

#Here we read in the Ns removed FASTQ file using the readFastq function from the ShortRead package and then store it as a character vector.

Mock.R1.filtN <- as.character(sread(readFastq(Mock.fnFs[1])))
Mock.R2.filtN <- as.character(sread(readFastq(Mock.fnRs[1])))

#The outputs from applying the vnct_funcn show the presence of the primers on the FASTQ files.

# Checking and counting the presence of the primer set in the forward read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.filtN)
# Checking and counting the presence of the primer set in the reverse read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.filtN)

dir.create(file.path(path, "cutadapt_trimmed"))
path.cutadapt <-"/Volumes/Informatics/telatin/qibx/ITS/cutadapt"
Mock.fnFs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R1_trimmed.fastq.gz"))
Mock.fnRs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R2_trimmed.fastq.gz"))
trim.cmd <- "-n 2 -m 50"
primers <- c("-g CTTGGTCATTTAGAGGAAGTAA", "-a GCATCGATGAAGAACGCAGC", "-G GCTGCGTTCTTCATCGATGC",
             "-A TTACTTCCTCTAAATGACCAAG")
cutadapt.path <-"/Users/telatina/miniconda3/envs/cutadapt/bin/cutadapt"
for (i in 1:length(mock.names)) {
  system2(cutadapt.path, args = c(primers, trim.cmd, "-o", Mock.fnFs.cutadapt[i],
                                  "-p", Mock.fnRs.cutadapt[i], Mock.fnFs.filtN[i], Mock.fnRs.filtN[i]))
}

Mock.R1.trimmed <- as.character(sread(readFastq(Mock.fnFs.cutadapt[1])))
Mock.R2.trimmed <- as.character(sread(readFastq(Mock.fnRs.cutadapt[2])))

# Checking and counting the presence of the primer set in the forward read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.trimmed)
# Checking and counting the presence of the primer set in the reverse read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.trimmed)

# Forward and reverse fastq filenames have the format:
fnFs <- sort(list.files(path.cutadapt, pattern = "R1_trimmed.fastq", full.names = TRUE))
fnRs <- sort(list.files(path.cutadapt, pattern = "R2_trimmed.fastq", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fnFs), "_"), "[", 1)
sample.names
plotQualityProfile(fnFs[1:2])
filtFs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, maxN = 0, maxEE = c(2, 2),
                     truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
errF <- learnErrors(filtFs, multithread = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE)

plotErrors(errF, nominalQ = TRUE)


derepFs <- derepFastq(filtFs, verbose = TRUE)
derepRs <- derepFastq(filtRs, verbose = TRUE)
names(derepFs) <- sample.names
names(derepRs) <- sample.names

dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, multithread = TRUE)


dadaFs[1]
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)

head(mergers[[1]])
seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers,
                                                                       getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace
# sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
                     "nonchim")
rownames(track) <- sample.names
UNITE <- "~/git/dadaist2/refs/uniref.fa.gz"
taxa <- assignTaxonomy(seqtab.nochim, UNITE,
                       multithread = TRUE, tryRC = TRUE)

taxa.print <- taxa  # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
write.table(taxa.print,
            file.path("~/git/dadaist2/qibx_ITS/taxonomy_R.txt"),
            row.names=TRUE,
            quote=FALSE
)

seqtab.nochim2 <- t(seqtab.nochim) # QIIME has OTUs as rows
col.names <- basename(filtFs)
feature_table_header = '#OTU ID';
col.names[[1]] <- paste0(feature_table_header,"\t", col.names[[1]])
write.table(seqtab.nochim2, file.path("~/git/dadaist2/qibx_ITS/table_R.txt"), sep="\t",
            row.names=TRUE, col.names=col.names,quote=FALSE)
	library(dada2)
	library(ggplot)
	packageVersion("dada2")
	library(ShortRead)
	packageVersion("ShortRead")
	library(Biostrings)
	packageVersion("Biostrings")
	path <- "/Volumes/Informatics/telatin/qibx/ITS/"
	prmr_funcn <- function(prmr) {
	prmr_dna <- DNAString(prmr)
	cmplmnt <- toString(complement(prmr_dna))
	rvrse <- toString(reverse(prmr_dna))
	rvrse_cmplmnt <- toString(reverseComplement(prmr_dna))
	return(c(prmr, cmplmnt, rvrse, rvrse_cmplmnt))
	}


	vcnt_funcn <- function(prmr, filt_seq) {
	vcount_pattern <- vcountPattern(prmr, DNAStringSet(filt_seq), fixed = FALSE)
	return(sum(vcount_pattern))
	}

	ITS1f <- "CTTGGTCATTTAGAGGAAGTAA" ## Define your primer sequence here
	ITS4 <- "GCTGCGTTCTTCATCGATGC"
	ITS1f.form <- prmr_funcn(ITS1f)
	ITS4.form <- prmr_funcn(ITS4)
	primer.form <- c(ITS1f.form, ITS4.form)

	Mock.fnFs <- sort(list.files(path, pattern = "R1.fastq.gz", full.names = TRUE))
	Mock.fnRs <- sort(list.files(path, pattern = "R2.fastq.gz", full.names = TRUE))
	mock.names <- sapply(strsplit(basename(Mock.fnFs), "_"), "[", 1)
	mock.names ## Lists the names of all the sequence files

	# Remove Ns
	Mock.fnFs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R1_filtN.fastq.gz"))
	Mock.fnRs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R2_filtN.fastq.gz"))
	filterAndTrim(Mock.fnFs, Mock.fnFs.filtN, Mock.fnRs, Mock.fnRs.filtN, maxN = 0,
	multithread = TRUE)

	#Here we read in the Ns removed FASTQ file using the readFastq function from the ShortRead package and then store it as a character vector.

	Mock.R1.filtN <- as.character(sread(readFastq(Mock.fnFs[1])))
	Mock.R2.filtN <- as.character(sread(readFastq(Mock.fnRs[1])))

	#The outputs from applying the vnct_funcn show the presence of the primers on the FASTQ files.

	# Checking and counting the presence of the primer set in the forward read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.filtN)
	# Checking and counting the presence of the primer set in the reverse read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.filtN)

	dir.create(file.path(path, "cutadapt_trimmed"))
	path.cutadapt <-"/Volumes/Informatics/telatin/qibx/ITS/cutadapt"
	Mock.fnFs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R1_trimmed.fastq.gz"))
	Mock.fnRs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R2_trimmed.fastq.gz"))
	trim.cmd <- "-n 2 -m 50"
	primers <- c("-g CTTGGTCATTTAGAGGAAGTAA", "-a GCATCGATGAAGAACGCAGC", "-G GCTGCGTTCTTCATCGATGC",
	"-A TTACTTCCTCTAAATGACCAAG")
	cutadapt.path <-"/Users/telatina/miniconda3/envs/cutadapt/bin/cutadapt"
	for (i in 1:length(mock.names)) {
	system2(cutadapt.path, args = c(primers, trim.cmd, "-o", Mock.fnFs.cutadapt[i],
	"-p", Mock.fnRs.cutadapt[i], Mock.fnFs.filtN[i], Mock.fnRs.filtN[i]))
	}

	Mock.R1.trimmed <- as.character(sread(readFastq(Mock.fnFs.cutadapt[1])))
	Mock.R2.trimmed <- as.character(sread(readFastq(Mock.fnRs.cutadapt[2])))

	# Checking and counting the presence of the primer set in the forward read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.trimmed)
	# Checking and counting the presence of the primer set in the reverse read
	sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.trimmed)

	# Forward and reverse fastq filenames have the format:
	fnFs <- sort(list.files(path.cutadapt, pattern = "R1_trimmed.fastq", full.names = TRUE))
	fnRs <- sort(list.files(path.cutadapt, pattern = "R2_trimmed.fastq", full.names = TRUE))
	sample.names <- sapply(strsplit(basename(fnFs), "_"), "[", 1)
	sample.names
	plotQualityProfile(fnFs[1:2])
	filtFs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
	filtRs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
	out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, maxN = 0, maxEE = c(2, 2),
	truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
	errF <- learnErrors(filtFs, multithread = TRUE)
	errR <- learnErrors(filtRs, multithread = TRUE)

	plotErrors(errF, nominalQ = TRUE)


	derepFs <- derepFastq(filtFs, verbose = TRUE)
	derepRs <- derepFastq(filtRs, verbose = TRUE)
	names(derepFs) <- sample.names
	names(derepRs) <- sample.names

	dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
	dadaRs <- dada(derepRs, err = errR, multithread = TRUE)


	dadaFs[1]
	mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)

	head(mergers[[1]])
	seqtab <- makeSequenceTable(mergers)
	seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

	getN <- function(x) sum(getUniques(x))
	track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers,
	getN), rowSums(seqtab.nochim))
	# If processing a single sample, remove the sapply calls: e.g. replace
	# sapply(dadaFs, getN) with getN(dadaFs)
	colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
	"nonchim")
	rownames(track) <- sample.names
	UNITE <- "~/git/dadaist2/refs/uniref.fa.gz"
	taxa <- assignTaxonomy(seqtab.nochim, UNITE,
	multithread = TRUE, tryRC = TRUE)

	taxa.print <- taxa # Removing sequence rownames for display only
	rownames(taxa.print) <- NULL
	head(taxa.print)
	write.table(taxa.print,
	file.path("~/git/dadaist2/qibx_ITS/taxonomy_R.txt"),
	row.names=TRUE,
	quote=FALSE
	)

	seqtab.nochim2 <- t(seqtab.nochim) # QIIME has OTUs as rows
	col.names <- basename(filtFs)
	feature_table_header = '#OTU ID';
	col.names[[1]] <- paste0(feature_table_header,"\t", col.names[[1]])
	write.table(seqtab.nochim2, file.path("~/git/dadaist2/qibx_ITS/table_R.txt"), sep="\t",
	row.names=TRUE, col.names=col.names,quote=FALSE)