Skip to content

Instantly share code, notes, and snippets.

@moldach
Created May 22, 2020 18:18
Show Gist options
  • Save moldach/7e8a293c12934fd3f6f080c4d49c335f to your computer and use it in GitHub Desktop.
Save moldach/7e8a293c12934fd3f6f080c4d49c335f to your computer and use it in GitHub Desktop.
Snakemake workflow
'''
This is a snakemake for variant calling workflow
-------------------
# An example run of this workflow
Usage:
snakemake --profile slurm --jobs 20
'''
# Directories------------------------------------------------------------------
configfile: "config.yaml"
# Setting the names of all directories
dir_list = ["LOG_DIR", "BENCHMARK_DIR", "QC_DIR", "TRIM_DIR", "ALIGN_DIR", "MARKDUP_DIR", "CALLING_DIR", "ANNOT_DIR"]
dir_names = ["LOGS", "BENCHMARKS", "QC", "TRIMMING", "ALIGNMENT", "MARK_DUPLICATES", "VARIANT_CALLING", "ANNOTATION"]
dirs_dict = dict(zip(dir_list, dir_names))
# Load Samples ----------------------------------------------------------------
import pandas as pd
# getting the samples information (names, path to r1 & r2) from samples.txt
samples_information = pd.read_csv("samples.txt", sep='\t', index_col=False)
# get a list of the sample names
sample_names = list(samples_information['sample'])
sample_locations = list(samples_information['location'])
# Rules -----------------------------------------------------------------------
rule all:
'''
The target rule for the workflow.
This lists all of the final products of the workflow
'''
input:
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.txt', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/categorized-gvs.gvf', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts.gff3', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/distribution_deletions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/distribution_insertions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/categorized_snp_coords.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/contigs.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/snps_frequency.table', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/sorted_gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/splice_junctions.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts_snps_applied.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/variant.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/ambiguous_snps.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/categorized_snps.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/codon_bias.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/exons.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_variant.alignment', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
rule qc_before_trim:
input:
r1 = expand('{FASTQ_DIR}/{sample}_R1.fastq.gz', FASTQ_DIR=sample_locations, sample=sample_names),
r2 = expand('{FASTQ_DIR}/{sample}_R2.fastq.gz', FASTQ_DIR=sample_locations, sample=sample_names)
output:
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R1_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_trim/{sample}_R2_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-before-trim.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 1000,
time = 30
threads: 1
params:
dir = expand('{QC_DIR}/{QC_TOOL}/before_trim/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
message: """--- Quality check of raw data with FastQC before trimming."""
shell:
'module load fastqc/0.11.5;'
'fastqc -o {params.dir} -f fastq {input.r1} &'
'fastqc -o {params.dir} -f fastq {input.r2}'
rule trimming:
input:
r1 = expand('{FASTQ_DIR}/{sample}_R1.fastq.gz', FASTQ_DIR=sample_locations, sample=sample_names),
r2 = expand('{FASTQ_DIR}/{sample}_R2.fastq.gz', FASTQ_DIR=sample_locations, sample=sample_names)
output:
r1_paired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
r1_unpaired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
r2_paired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
r2_unpaired = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_unpaired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{TRIM_TOOL}-trimming.log', LOG_DIR=dirs_dict["LOG_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
resources:
mem = 1000,
time = 120
message: """--- Trimming FASTQ files with Trimmomatic."""
shell: """
module load trimmomatic/0.36;
java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.36.jar PE -phred33 \
{input.r1} \
{input.r2} \
{output.r1_paired} \
{output.r1_unpaired} \
{output.r2_paired} \
{output.r2_unpaired} \
ILLUMINACLIP:adapters.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36;
"""
rule qc_before_alignment:
input:
r1 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
r2 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names)
output:
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R1_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/before_align/{sample}_R2_trim_paired_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-before-align.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 1000,
time = 30
threads: 1
params:
dir = expand('{QC_DIR}/{QC_TOOL}/before_align/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
message: """--- Quality check of raw data with FastQC before alignment."""
shell:
'module load fastqc/0.11.5;'
'fastqc -o {params.dir} -f fastq {input.r1} &'
'fastqc -o {params.dir} -f fastq {input.r2}'
rule alignment:
input:
r1 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R1_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
r2 = expand('{TRIM_DIR}/{TRIM_TOOL}/{sample}_R2_trim_paired.fq.gz', TRIM_DIR=dirs_dict["TRIM_DIR"], TRIM_TOOL=config["TRIM_TOOL"], sample=sample_names),
ref = config["REF_GENOME"]
output:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{ALIGN_TOOL}.log', ALIGN_TOOL=config["ALIGN_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
message: """--- Alignment with BWA."""
threads: 8
resources:
mem = 10000,
time = 720
params:
rg=expand('r"@RG\tID:{sample}\tSM:{sample}"', sample=sample_names)
shell: """
module load bwa;
module load samtools;
bwa mem -t {threads} \
{input.ref} {input.r1} {input.r2} |\
samtools view -Sb - > {output}
"""
rule sort_bam:
input:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
output:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-bamSort.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 2000,
time = 90
threads: 2
message: """--- Sort BAM files."""
shell: """
module load samtools;
samtools sort {input} -o {output}
"""
rule index_bam:
input:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
output:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-bamIndex.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 2000,
time = 720
message: """--- Index BAM files."""
shell: """
module load samtools;
samtools index {input} {output}
"""
rule mark_duplicates:
input: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
output:
dedup = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
text = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.txt', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{ALIGN_TOOL}.log', LOG_DIR=dirs_dict["LOG_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
resources:
mem = 15000,
time = 720
message: """--- Mark duplicates."""
threads: 1
shell: """
module load picard/2.17.3;
java -Xmx6G -jar $EBROOTPICARD/picard.jar MarkDuplicates \
VALIDATION_STRINGENCY=LENIENT READ_NAME_REGEX=null \
I={input} \
O={output.dedup} \
M={output.text}
"""
rule index_dedupped_bam:
input: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
output: expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam.bai', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-indexMarkDups.log', LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 2000,
time = 30
message: """--- Inded Sorted, Dedupped BAM files."""
shell: """
module load samtools;
samtools index {input} {output}
"""
rule qc_after_alignment:
input:
expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names),
output:
expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.html', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
expand('{QC_DIR}/{QC_TOOL}/after_align/{sample}.sorted.dedupped_fastqc.zip', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"], sample=sample_names),
log: expand('{LOG_DIR}/{sample}-{QC_TOOL}-qc-after-align.log', QC_TOOL=config["QC_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 1000,
time = 30
threads: 1
params:
dir = expand('{QC_DIR}/{QC_TOOL}/after_align/', QC_DIR=dirs_dict["QC_DIR"], QC_TOOL=config["QC_TOOL"])
message: """--- Quality check of raw data with FastQC after alignment."""
shell:
"""
module load fastqc/0.11.5
fastqc -o {params.dir} -f bam {input}
"""
rule variant_calling:
input:
ref = config["REF_GENOME"],
bam = expand('{ALIGN_DIR}/{ALIGN_TOOL}/{sample}.sorted.dedupped.bam', ALIGN_DIR=dirs_dict["ALIGN_DIR"], ALIGN_TOOL=config["ALIGN_TOOL"], sample=sample_names)
output:
expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
log: expand('{LOG_DIR}/{sample}-{CALLING_TOOL}.log', CALLING_TOOL=config["CALLING_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 5000,
time = 720
message: """--- Varscan pileup2snp."""
shell: """
module load samtools;
module load java/13.0.1;
echo -n "I II III IV V X MtDNA" |\
xargs -d " " -n 1 -P 7 -I {{}} /bin/bash -c \
"samtools mpileup -r {{}} \
-f ~/projects/def-mtarailo/common/indexes/WS265_wormbase/{{}}.fa alignment/470.sorted.dedupped.bam |\
java -Xmx5G -jar ~/projects/def-mtarailo/common/tools/VarScan.v2.3.9.jar pileup2snp \
--variants \
--min-coverage 5 \
--min-avg-qual 30 \
--min-var-freq 0.9 > {{}}.vcf"
awk '
FNR==1 && NR!=1 {{ while (/^<header>/) getline; }}
1 {{print}}
' *.vcf > {output}
rm I.vcf II.vcf III.vcf IV.vcf V.vcf X.vcf MtDNA.vcf
"""
rule prep_gff3:
input: expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.vcf', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names)
output: expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{CALLING_TOOL}-prepGff3.log', CALLING_TOOL=config["CALLING_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
resources:
mem = 2000,
time = 30
message: """--- Prepping VCF->TSV for CooVar."""
shell: """
sed 1d {input} |\
awk "{{print "SNP\\t"\$1"\\t"\$2"\\t"\$3"\\t"\$4}}" > {output}
"""
rule variant_annotation:
input:
tsv = expand('{CALLING_DIR}/{CALLING_TOOL}/{sample}_sorted_dedupped_snp_varscan.tsv', CALLING_DIR=dirs_dict["CALLING_DIR"], CALLING_TOOL=config["CALLING_TOOL"], sample=sample_names),
ref = config["REF_GENOME"],
gff3 = config["GENOME_ANNOTATION"]
output:
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/categorized-gvs.gvf', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts.gff3', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/distribution_deletions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/deletions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.del.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/distribution_insertions.out', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/insertions/kept_{sample}_sorted_dedupped_snp_varscan.tsv.ins.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.del', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.ins', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/categorized_snp_coords.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/contigs.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/snps_frequency.table', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/sorted_gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/splice_junctions.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/target_cdna_snps.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/transcripts_snps_applied.gff3.tmp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/intermediate-files/variant.summary', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/ambiguous_snps.list', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/categorized_snps.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/codon_bias.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/excluded_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/snps/kept_{sample}_sorted_dedupped_snp_varscan.tsv.snp.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/exons.circos', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/reference_variant.alignment', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.exons', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_cdna.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant_peptides.fasta', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names),
expand('{ANNOT_DIR}/{ANNOT_TOOL}/{sample}/transcripts/variant.stat', ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names)
params:
prefix = expand("{ANNOT_DIR}/{ANNOT_TOOL}/{sample}", ANNOT_DIR=dirs_dict["ANNOT_DIR"], ANNOT_TOOL=config["ANNOT_TOOL"], sample=sample_names)
log: expand('{LOG_DIR}/{sample}-{ANNOT_TOOL}.log', ANNOT_TOOL=config["ANNOT_TOOL"], LOG_DIR=dirs_dict["LOG_DIR"], sample=sample_names)
message: """--- Annotating Variants."""
resources:
mem = 12000,
time = 1440
threads: 6
shell: """
module load bioperl;
perl ~/projects/def-mtarailo/common/tools/CooVar/coovar.pl \
--cores 6 \
-e {input.gff3} \
-r {input.ref} \
-t {input.tsv} \
-o {params.prefix} \
--circos
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment