Skip to content

Instantly share code, notes, and snippets.

@telatin
Created January 22, 2021 18:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save telatin/8a2c80b760657c10eda791b9e80ad6ce to your computer and use it in GitHub Desktop.
Save telatin/8a2c80b760657c10eda791b9e80ad6ce to your computer and use it in GitHub Desktop.
library(dada2)
library(ggplot)
packageVersion("dada2")
library(ShortRead)
packageVersion("ShortRead")
library(Biostrings)
packageVersion("Biostrings")
path <- "/Volumes/Informatics/telatin/qibx/ITS/"
prmr_funcn <- function(prmr) {
prmr_dna <- DNAString(prmr)
cmplmnt <- toString(complement(prmr_dna))
rvrse <- toString(reverse(prmr_dna))
rvrse_cmplmnt <- toString(reverseComplement(prmr_dna))
return(c(prmr, cmplmnt, rvrse, rvrse_cmplmnt))
}
vcnt_funcn <- function(prmr, filt_seq) {
vcount_pattern <- vcountPattern(prmr, DNAStringSet(filt_seq), fixed = FALSE)
return(sum(vcount_pattern))
}
ITS1f <- "CTTGGTCATTTAGAGGAAGTAA" ## Define your primer sequence here
ITS4 <- "GCTGCGTTCTTCATCGATGC"
ITS1f.form <- prmr_funcn(ITS1f)
ITS4.form <- prmr_funcn(ITS4)
primer.form <- c(ITS1f.form, ITS4.form)
Mock.fnFs <- sort(list.files(path, pattern = "R1.fastq.gz", full.names = TRUE))
Mock.fnRs <- sort(list.files(path, pattern = "R2.fastq.gz", full.names = TRUE))
mock.names <- sapply(strsplit(basename(Mock.fnFs), "_"), "[", 1)
mock.names ## Lists the names of all the sequence files
# Remove Ns
Mock.fnFs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R1_filtN.fastq.gz"))
Mock.fnRs.filtN <- file.path(path, "filtN", paste0(mock.names, "_R2_filtN.fastq.gz"))
filterAndTrim(Mock.fnFs, Mock.fnFs.filtN, Mock.fnRs, Mock.fnRs.filtN, maxN = 0,
multithread = TRUE)
#Here we read in the Ns removed FASTQ file using the readFastq function from the ShortRead package and then store it as a character vector.
Mock.R1.filtN <- as.character(sread(readFastq(Mock.fnFs[1])))
Mock.R2.filtN <- as.character(sread(readFastq(Mock.fnRs[1])))
#The outputs from applying the vnct_funcn show the presence of the primers on the FASTQ files.
# Checking and counting the presence of the primer set in the forward read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.filtN)
# Checking and counting the presence of the primer set in the reverse read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.filtN)
dir.create(file.path(path, "cutadapt_trimmed"))
path.cutadapt <-"/Volumes/Informatics/telatin/qibx/ITS/cutadapt"
Mock.fnFs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R1_trimmed.fastq.gz"))
Mock.fnRs.cutadapt <- file.path(path.cutadapt, paste0(mock.names, "_R2_trimmed.fastq.gz"))
trim.cmd <- "-n 2 -m 50"
primers <- c("-g CTTGGTCATTTAGAGGAAGTAA", "-a GCATCGATGAAGAACGCAGC", "-G GCTGCGTTCTTCATCGATGC",
"-A TTACTTCCTCTAAATGACCAAG")
cutadapt.path <-"/Users/telatina/miniconda3/envs/cutadapt/bin/cutadapt"
for (i in 1:length(mock.names)) {
system2(cutadapt.path, args = c(primers, trim.cmd, "-o", Mock.fnFs.cutadapt[i],
"-p", Mock.fnRs.cutadapt[i], Mock.fnFs.filtN[i], Mock.fnRs.filtN[i]))
}
Mock.R1.trimmed <- as.character(sread(readFastq(Mock.fnFs.cutadapt[1])))
Mock.R2.trimmed <- as.character(sread(readFastq(Mock.fnRs.cutadapt[2])))
# Checking and counting the presence of the primer set in the forward read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R1.trimmed)
# Checking and counting the presence of the primer set in the reverse read
sapply(primer.form, vcnt_funcn, filt_seq = Mock.R2.trimmed)
# Forward and reverse fastq filenames have the format:
fnFs <- sort(list.files(path.cutadapt, pattern = "R1_trimmed.fastq", full.names = TRUE))
fnRs <- sort(list.files(path.cutadapt, pattern = "R2_trimmed.fastq", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fnFs), "_"), "[", 1)
sample.names
plotQualityProfile(fnFs[1:2])
filtFs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path.cutadapt, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, maxN = 0, maxEE = c(2, 2),
truncQ = 2, rm.phix = TRUE, compress = TRUE, multithread = TRUE)
errF <- learnErrors(filtFs, multithread = TRUE)
errR <- learnErrors(filtRs, multithread = TRUE)
plotErrors(errF, nominalQ = TRUE)
derepFs <- derepFastq(filtFs, verbose = TRUE)
derepRs <- derepFastq(filtRs, verbose = TRUE)
names(derepFs) <- sample.names
names(derepRs) <- sample.names
dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
dadaFs[1]
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
head(mergers[[1]])
seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers,
getN), rowSums(seqtab.nochim))
# If processing a single sample, remove the sapply calls: e.g. replace
# sapply(dadaFs, getN) with getN(dadaFs)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
"nonchim")
rownames(track) <- sample.names
UNITE <- "~/git/dadaist2/refs/uniref.fa.gz"
taxa <- assignTaxonomy(seqtab.nochim, UNITE,
multithread = TRUE, tryRC = TRUE)
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
write.table(taxa.print,
file.path("~/git/dadaist2/qibx_ITS/taxonomy_R.txt"),
row.names=TRUE,
quote=FALSE
)
seqtab.nochim2 <- t(seqtab.nochim) # QIIME has OTUs as rows
col.names <- basename(filtFs)
feature_table_header = '#OTU ID';
col.names[[1]] <- paste0(feature_table_header,"\t", col.names[[1]])
write.table(seqtab.nochim2, file.path("~/git/dadaist2/qibx_ITS/table_R.txt"), sep="\t",
row.names=TRUE, col.names=col.names,quote=FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment