conchoecia/Snakefile_locus_map_and_reassemble

## Snakefile_locus_map_and_reassemble
# The goal of this script is to map reads to a region and to assemble it de novo.
# This is useful for scenarios in which there are gaps between known homologous sequence,
#  like what one might encounter when doing a de novo mitochondrial genome assembly
#  or assembling a gene locus from a transcript.
#

#The steps for this assembly process are.
# 1) Map all of the reads to the scaffold
# 2) Extract all of the read pairs from the original fastq files and make new fastqs.
# 3) Use the new Fastq files in a de novo Spades assembly.
#    The output of this is fed back into step 1

# I found the idea for the iterative use of snakemake here:
#  https://groups.google.com/forum/#!searchin/Snakemake/dirty$20trick$20loop%7Csort:date/snakemake/Qb3sBXuIriQ/Y9jdconuuxMJ


configfile: "config.yaml"

PARAM = 10

rule all:
    input:
        "analysis/done_mail.txt"

rule done:
    """
    email the haddock lab slack that the job is done.
    """
    input:
        "analysis/round{param}/round{param}.sorted.bam.bai".format(param=PARAM-1)
    output:
        "analysis/done_mail.txt"
    shell:
        """mail -s "DTS Erenna Locus Assembler done" -t u1n8b8b6b3q8g4e7@haddocklab.slack.com <<< "Finished" && \
        touch {output}"""

rule initial_copy:
    """This copies the initial fasta file to the starting directory and formats
    it as a multiline fasta file to ensure that bwa index works properly.
    """
    input:
        config["files"]["fasta"]
    output:
        "analysis/round0/round0.fasta"
    shell:
        """bioawk -cfastx '{{system("printf \\">%s\\"" $name " >> {output}"); \
                             system("echo '' >> {output}"); \
               system("echo " $seq " | fold -w 60 - >> {output}") }}' {input}"""

for i in range(PARAM):
    print("in round {}".format(i))
    rule:
        # 1) Map all of the reads to the scaffold
        """
        Index the fasta file for mapping.
        """
        input:
            "analysis/round{param}/round{param}.fasta".format(param=i)
        output:
            "analysis/round{param}/round{param}.fasta.amb".format(param=i),
            "analysis/round{param}/round{param}.fasta.ann".format(param=i),
            "analysis/round{param}/round{param}.fasta.bwt".format(param=i),
            "analysis/round{param}/round{param}.fasta.pac".format(param=i),
            "analysis/round{param}/round{param}.fasta.sa".format(param=i)
        shell:
            "bwa index {input}"

    rule:
        """
        Map the reads
        """
        input:
            "analysis/round{param}/round{param}.fasta.amb".format(param=i),
            "analysis/round{param}/round{param}.fasta.ann".format(param=i),
            "analysis/round{param}/round{param}.fasta.bwt".format(param=i),
            "analysis/round{param}/round{param}.fasta.pac".format(param=i),
            "analysis/round{param}/round{param}.fasta.sa".format(param=i),
            this_fasta = "analysis/round{param}/round{param}.fasta".format(param=i),
            forward  = config["files"]["forward"],
            reverse  = config["files"]["reverse"]
        output:
            bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i)
        threads:
            90
        shell:
            """bwa mem -t {threads} {input.this_fasta} {input.forward} {input.reverse} | \
            samtools view -bh -G 12 -@ {threads} - | \
            samtools sort -@ {threads} - > {output.bam}"""

    rule:
        """
        index the sorted bam file to view in IGV
        """
        input:
            bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i)
        output:
            bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i)
        shell:
            "samtools index {input}"

    rule:
        """
        make fastq files using the bam file
        """
        input:
            bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i),
            bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i)
        output:
            fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i),
            fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i),
            fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i)
        shell:
            "samtools fastq {input.bam} -1 {output.fq_f} -2 {output.fq_r} -N -s {output.fq_s}"

    rule:
        """
        assemble the genome region with the reads in hand
        """
        input:
            fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i),
            fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i),
            fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i)
        output:
            outdir = "analysis/round{param}/spades/".format(param=i),
            contigs = "analysis/round{param}/spades/contigs.fasta".format(param=i),
            scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i)
        threads:
            90
        shell:
           """spades.py -t {threads} -m 500 -k 55,101 \
           -o {output.outdir} \
           -1 {input.fq_f} \
           -2 {input.fq_r} \
           -s {input.fq_s}
           """
    rule:
        """
        This step joins the seed fasta file with the scaffolds from the spades
         assembly into the new seed for the next round
        """
        input:
            scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i),
            og_fasta = "analysis/round0/round0.fasta"
        output:
            new_seed = "analysis/round{param}/round{param}.fasta".format(param = i+1)
        shell:
            """echo {input.scaffolds} >> {output.new_seed} &&\
            echo {input.og_fasta} >> {output.new_seed}
            """
	# The goal of this script is to map reads to a region and to assemble it de novo.
	# This is useful for scenarios in which there are gaps between known homologous sequence,
	# like what one might encounter when doing a de novo mitochondrial genome assembly
	# or assembling a gene locus from a transcript.
	#

	#The steps for this assembly process are.
	# 1) Map all of the reads to the scaffold
	# 2) Extract all of the read pairs from the original fastq files and make new fastqs.
	# 3) Use the new Fastq files in a de novo Spades assembly.
	# The output of this is fed back into step 1

	# I found the idea for the iterative use of snakemake here:
	# https://groups.google.com/forum/#!searchin/Snakemake/dirty$20trick$20loop%7Csort:date/snakemake/Qb3sBXuIriQ/Y9jdconuuxMJ


	configfile: "config.yaml"

	PARAM = 10

	rule all:
	input:
	"analysis/done_mail.txt"

	rule done:
	"""
	email the haddock lab slack that the job is done.
	"""
	input:
	"analysis/round{param}/round{param}.sorted.bam.bai".format(param=PARAM-1)
	output:
	"analysis/done_mail.txt"
	shell:
	"""mail -s "DTS Erenna Locus Assembler done" -t u1n8b8b6b3q8g4e7@haddocklab.slack.com <<< "Finished" && \
	touch {output}"""

	rule initial_copy:
	"""This copies the initial fasta file to the starting directory and formats
	it as a multiline fasta file to ensure that bwa index works properly.
	"""
	input:
	config["files"]["fasta"]
	output:
	"analysis/round0/round0.fasta"
	shell:
	"""bioawk -cfastx '{{system("printf \\">%s\\"" $name " >> {output}"); \
	system("echo '' >> {output}"); \
	system("echo " $seq " \| fold -w 60 - >> {output}") }}' {input}"""

	for i in range(PARAM):
	print("in round {}".format(i))
	rule:
	# 1) Map all of the reads to the scaffold
	"""
	Index the fasta file for mapping.
	"""
	input:
	"analysis/round{param}/round{param}.fasta".format(param=i)
	output:
	"analysis/round{param}/round{param}.fasta.amb".format(param=i),
	"analysis/round{param}/round{param}.fasta.ann".format(param=i),
	"analysis/round{param}/round{param}.fasta.bwt".format(param=i),
	"analysis/round{param}/round{param}.fasta.pac".format(param=i),
	"analysis/round{param}/round{param}.fasta.sa".format(param=i)
	shell:
	"bwa index {input}"

	rule:
	"""
	Map the reads
	"""
	input:
	"analysis/round{param}/round{param}.fasta.amb".format(param=i),
	"analysis/round{param}/round{param}.fasta.ann".format(param=i),
	"analysis/round{param}/round{param}.fasta.bwt".format(param=i),
	"analysis/round{param}/round{param}.fasta.pac".format(param=i),
	"analysis/round{param}/round{param}.fasta.sa".format(param=i),
	this_fasta = "analysis/round{param}/round{param}.fasta".format(param=i),
	forward = config["files"]["forward"],
	reverse = config["files"]["reverse"]
	output:
	bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i)
	threads:
	90
	shell:
	"""bwa mem -t {threads} {input.this_fasta} {input.forward} {input.reverse} \| \
	samtools view -bh -G 12 -@ {threads} - \| \
	samtools sort -@ {threads} - > {output.bam}"""

	rule:
	"""
	index the sorted bam file to view in IGV
	"""
	input:
	bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i)
	output:
	bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i)
	shell:
	"samtools index {input}"

	rule:
	"""
	make fastq files using the bam file
	"""
	input:
	bam = "analysis/round{param}/round{param}.sorted.bam".format(param=i),
	bai = "analysis/round{param}/round{param}.sorted.bam.bai".format(param=i)
	output:
	fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i),
	fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i),
	fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i)
	shell:
	"samtools fastq {input.bam} -1 {output.fq_f} -2 {output.fq_r} -N -s {output.fq_s}"

	rule:
	"""
	assemble the genome region with the reads in hand
	"""
	input:
	fq_f = "analysis/round{param}/round{param}_1.fq".format(param=i),
	fq_r = "analysis/round{param}/round{param}_2.fq".format(param=i),
	fq_s = "analysis/round{param}/round{param}_s.fq".format(param=i)
	output:
	outdir = "analysis/round{param}/spades/".format(param=i),
	contigs = "analysis/round{param}/spades/contigs.fasta".format(param=i),
	scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i)
	threads:
	90
	shell:
	"""spades.py -t {threads} -m 500 -k 55,101 \
	-o {output.outdir} \
	-1 {input.fq_f} \
	-2 {input.fq_r} \
	-s {input.fq_s}
	"""
	rule:
	"""
	This step joins the seed fasta file with the scaffolds from the spades
	assembly into the new seed for the next round
	"""
	input:
	scaffolds = "analysis/round{param}/spades/scaffolds.fasta".format(param=i),
	og_fasta = "analysis/round0/round0.fasta"
	output:
	new_seed = "analysis/round{param}/round{param}.fasta".format(param = i+1)
	shell:
	"""echo {input.scaffolds} >> {output.new_seed} &&\
	echo {input.og_fasta} >> {output.new_seed}
	"""