Created
September 19, 2016 20:12
-
-
Save elsherbini/a165a0760912c78d9adcc737215c6077 to your computer and use it in GitHub Desktop.
dada2_snakefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
configfile: | |
"config.yaml" | |
from snakemake.utils import R | |
import glob | |
import os | |
import numpy as np | |
SETS = ["02_frac", "5_frac", "1_and_63_frac", "unfrac"] | |
SET_TO_SAMPLES = {s: [os.path.splitext(os.path.basename(fn))[0].split(".")[0] for fn in os.listdir("data/{}/1.cat".format(s))] for s in SETS} | |
INPUT = [{"set": set_name, "sample": sample, "direction": direction} for set_name in SETS for sample in SET_TO_SAMPLES[set_name] for direction in ["f", "r"]] | |
def seqtab_names(input): | |
return ["data/{set}/11.merge/{sample}.rds".format(**d) for d in input if d["direction"]=="f"] | |
def derep_names(input, set_name, direction): | |
return ["data/{set}/9.derep/{sample}.{direction}.rds".format(**d) for d in input if d["set"] == set_name and d["direction"]==direction] | |
derep_random = {(set_name, direction): np.random.choice(derep_names(INPUT, set_name, direction), size=5, replace=False) for set_name in SETS for direction in ["f", "r"]} | |
localrules: | |
target | |
rule target: | |
input: | |
combined = "data/combined.seqtab" | |
rule convert_r: | |
input: | |
"data/{set}/1.cat/{sample}.{direction}.fq" | |
output: | |
fastq = "data/{set}/2.convert/{sample}.{direction}.fq" | |
run: | |
shell('{config[caravan_path]}van.py convert {input} -o {output[fastq]} -r') | |
rule trim_primer_f: | |
input: | |
"data/{set}/2.convert/{sample}.f.fq" | |
output: | |
"data/{set}/4.trim/{sample}.f.fq" | |
shell: | |
"/home/josephe/dev/caravan/van.py trim ACCCDATGGATCTKAAGCGYGG {input} -o {output} -w 25" | |
rule trim_primer_r: | |
input: | |
"data/{set}/2.convert/{sample}.r.fq" | |
output: | |
"data/{set}/4.trim/{sample}.r.fq" | |
shell: | |
"/home/josephe/dev/caravan/van.py trim CCRCGRTCGAACTGCATVCCTT {input} -o {output} -w 25" | |
rule intersect: | |
input: | |
f = "data/{set}/4.trim/{sample}.f.fq", | |
r = "data/{set}/4.trim/{sample}.r.fq" | |
output: | |
f = "data/{set}/7.intersect/{sample}.f.fq", | |
r = "data/{set}/7.intersect/{sample}.r.fq" | |
shell: | |
"/home/josephe/dev/caravan/van.py intersect_no_mapping {input[f]} {input[r]} {output[f]} {output[r]}" | |
rule filter_sample: | |
input: | |
f = "data/{set}/7.intersect/{sample}.f.fq", | |
r = "data/{set}/7.intersect/{sample}.r.fq", | |
output: | |
f = "data/{set}/8.filter/{sample}.f.fq", | |
r = "data/{set}/8.filter/{sample}.r.fq" | |
params: | |
f_truncLen = 115, | |
r_truncLen = 115 | |
run: | |
R(""" | |
library(ShortRead) | |
library(dada2) | |
fastqPairedFilter( | |
c("{input.f}", "{input.r}"), c("{output.f}", "{output.r}"), | |
truncLen=c({params.f_truncLen},{params.r_truncLen}), | |
maxN=0, | |
maxEE=2, | |
truncQ=2, | |
compress=FALSE, | |
verbose=TRUE | |
) | |
if (!file.exists("{output.f}")){{ | |
file.create("{output.f}") | |
}} | |
if (!file.exists("{output.r}")){{ | |
file.create("{output.r}") | |
}} | |
""") | |
rule derep_sample: | |
input: | |
fq = "data/{set}/8.filter/{sample}.{direction}.fq" | |
output: | |
derep = "data/{set}/9.derep/{sample}.{direction}.rds" | |
run: | |
R(""" | |
library(ShortRead) | |
library(dada2) | |
derep <- derepFastq("{input.fq}", verbose=TRUE) | |
saveRDS(derep, "{output.derep}") | |
""") | |
rule infer_error: | |
input: | |
lambda wildcards: derep_random[(wildcards.set, wildcards.direction)] | |
output: | |
error_matrix = "data/{set}/error_matrix.{direction}.rds" | |
run: | |
input = ",".join(['"{}"'.format(i) for i in input]) | |
R(""" | |
library(ShortRead) | |
library(dada2) | |
debug <- lapply(c({input}), readRDS) | |
saveRDS(debug, "{wildcards.set}.{wildcards.direction}.derep.debug.rds") | |
drp.learn <- dada2:::combineDereps2(debug) | |
dd.learn <- dada(drp.learn, err=NULL, selfConsist=TRUE) | |
saveRDS(dd.learn, "{wildcards.set}.{wildcards.direction}.dada.debug.rds") | |
err <- dd.learn[[1]]["err_out"] | |
saveRDS(err, "{output.error_matrix}") | |
""") | |
rule denoise_sample: | |
input: | |
derep = "data/{set}/9.derep/{sample}.{direction}.rds", | |
error_matrix = "data/{set}/error_matrix.{direction}.rds" | |
output: | |
denoise = "data/{set}/10.denoise/{sample}.{direction}.rds" | |
run: | |
R(""" | |
library(ShortRead) | |
library(dada2) | |
derep <- readRDS("{input.derep}") | |
error_matrix <- readRDS("{input.error_matrix}") | |
denoise <- dada(derep, err=NULL, selfConsist=FALSE) | |
saveRDS(derep, "{output.denoise}") | |
""") | |
rule merge_and_table_sample: | |
input: | |
derep_f = "data/{set}/9.derep/{sample}.f.rds", | |
denoise_f = "data/{set}/10.denoise/{sample}.f.rds", | |
derep_r = "data/{set}/9.derep/{sample}.r.rds", | |
denoise_r = "data/{set}/10.denoise/{sample}.r.rds" | |
output: | |
seqtab = "data/{set}/11.merge/{sample}.rds" | |
params: | |
minOverlap = 14 | |
run: | |
R(""" | |
library(ShortRead) | |
library(dada2) | |
derep_f <- readRDS("{input.derep_f}") | |
denoise_f <- readRDS("{input.denoise_f}") | |
derep_r <- readRDS("{input.derep_r}") | |
denoise_r <- readRDS("{input.denoise_r}") | |
merge <- mergePairs(denoise_f, derep_f, denoise_r, derep_r, minOverlap={params.minOverlap}, verbose=TRUE) | |
seqtab <- makeSequenceTable(merge) | |
saveRDS(seqtab, "{output.seqtab}") | |
""") | |
rule combine_tables: | |
input: | |
seqtab_names(INPUT) | |
output: | |
combined = "data/combined.seqtab" | |
run: | |
input = ','.join(['"{}"'.format(i) for i in input]) | |
R(""" | |
library(dada2) | |
library(ShortRead) | |
C_isACGT <- function(seqs) {{ | |
.Call('dada2_C_isACGT', PACKAGE = 'dada2', seqs) | |
}} | |
is.sequence.table <- function(tab) {{ | |
rval <- is.matrix(tab) && all(tab>=0) && | |
!is.null(colnames(tab)) && !is.null(colnames(tab)) && | |
all(sapply(colnames(tab), nchar)>0) && | |
all(sapply(colnames(tab), C_isACGT)) && | |
all(sapply(rownames(tab), nchar)>0) | |
rval | |
}} | |
mergeSequenceTables <- function(table1, table2, ..., orderBy = "abundance") {{ | |
# Combine passed tables into a list | |
tables <- list(table1, table2) | |
tables <- c(tables, list(...)) | |
# Validate tables | |
if(!(all(sapply(tables, is.sequence.table)))) {{ | |
stop("At least two valid sequence tables, and no invalid objects, are expected.") | |
}} | |
sample.names <- rownames(tables[[1]]) | |
for(i in seq(2, length(tables))) {{ | |
sample.names <- c(sample.names, rownames(tables[[i]])) | |
}} | |
if(any(duplicated(sample.names))) {{ | |
stop("Duplicated sample names detected in the rownames.") | |
}} | |
seqs <- unique(c(sapply(tables, colnames), recursive=TRUE)) | |
# Make merged table | |
rval <- matrix(0L, nrow=length(sample.names), ncol=length(seqs)) | |
rownames(rval) <- sample.names | |
colnames(rval) <- seqs | |
for(tab in tables) {{ | |
rval[rownames(tab), colnames(tab)] <- tab | |
}} | |
# Order columns | |
if(!is.null(orderBy)) {{ | |
if(orderBy == "abundance") {{ | |
rval <- rval[,order(colSums(rval), decreasing=TRUE),drop=FALSE] | |
}} else if(orderBy == "nsamples") {{ | |
rval <- rval[,order(colSums(rval>0), decreasing=TRUE),drop=FALSE] | |
}} | |
}} | |
rval | |
}} | |
combined <- mergeSequenceTables(lapply({input}), readRDS) | |
saveRDS(combined, "{output.combined}") | |
""") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment