lindenb/bcftools-gvf.nf

## bcftools-gvf.nf

reference=file("${params.reference}")
bams=file("${params.bams}")

process getChromosomes {
	tag "get chromosomes from ${ref}"
	executor 'local'
	input:
		file ref from reference
	output:
		file("contigs.txt") into contigs
	script:
	"""
	cut -f 1 "${ref.toRealPath()}.fai" > contigs.txt
	"""
	}

process sampleAndBams {
	tag "get bams from ${bams}"
	executor 'local'
	input:
		file bams from bams
	output:
		file("sample_bams.csv") into sample_bams
	script:
	"""
	cat "${bams}" | sort | while read B
	do
		samtools view -H "\${B}" | grep '^@RG' | tr '\\t' '\\n' | grep ^SM -m1 | cut -d ':' -f 2- | tr '\\n' ',' >>  sample_bams.csv
		echo "\${B}" >> sample_bams.csv
	done
	"""
	}

sample_bams.splitCsv(header: false,sep:',',strip:true).
	combine(contigs.splitCsv(header: false,sep:',',strip:true)).
	into{sample_bam_contig}

process call_gvcf {
	tag "call ${sample} ${bam} ${contig}"
	input:
		file ref from reference
		set sample,bam,contig from sample_bam_contig
	output:
		set contig,file("${sample}.csv") into sample_contig_bcf
		file("${sample}.${contig}.bcf")
		file("${sample}.${contig}.bcf.csi")
	script:
	"""

	bcftools mpileup --output-type u --fasta-ref "${ref.toRealPath()}" --regions "${contig}" -a 'INFO/AD'  -a 'FORMAT/AD'  -a 'FORMAT/DP'  --gvcf 5,10,20,30 "${bam}" |\
	bcftools call  --gvcf 5,10,20,30  --multiallelic-caller --output-type b -o "${sample}.${contig}.bcf"

	bcftools index "${sample}.${contig}.bcf"

	echo "${sample},\${PWD}/${sample}.${contig}.bcf" > "${sample}.csv"
	"""
	}

process combine_contig {
	tag "combine ${contig} N=${vcfs.size()}"
	input:
		file ref from reference
		set contig,vcfs from  sample_contig_bcf.groupTuple()
	output:
		file("${contig}.bcf") into contig_merged
		file("${contig}.bcf.csi")
	script:
	"""
# make sure we always use the samples in the same order
cat ${vcfs.join(" ")} | sort -T . -t, -k1,1 | cut -d, -f 2 > vcf.list

	bcftools merge --merge all --gvcf "${ref.toRealPath()}" --file-list "vcf.list" --output-type 'u' |\
	bcftools convert  --gvcf2vcf --fasta-ref "${ref.toRealPath()}" --output-type 'u'  |\
	bcftools view -m 2 --output-type 'u'  |\
	bcftools norm --fasta-ref "${ref.toRealPath()}"  --output-type 'b' -o "${contig}.bcf"
	bcftools index "${contig}.bcf"
rm vcf.list
	"""
	}
process combine_all{
	tag "combine N=${vcfs.size()}"
	input:
		file ref from reference
		val vcfs from  contig_merged.collect()
	output:
		file("${params.prefix}vcf.gz")
		file("${params.prefix}vcf.gz.tbi")
	script:
	"""
cat << __EOF__ | sort > vcf.list
${vcfs.join("\n")}
__EOF__
	bcftools concat --file-list "vcf.list" --output-type 'z' -o "${params.prefix}vcf.gz"
	bcftools index --tbi "${params.prefix}vcf.gz"
	"""
	}

	reference=file("${params.reference}")
	bams=file("${params.bams}")

	process getChromosomes {
	tag "get chromosomes from ${ref}"
	executor 'local'
	input:
	file ref from reference
	output:
	file("contigs.txt") into contigs
	script:
	"""
	cut -f 1 "${ref.toRealPath()}.fai" > contigs.txt
	"""
	}

	process sampleAndBams {
	tag "get bams from ${bams}"
	executor 'local'
	input:
	file bams from bams
	output:
	file("sample_bams.csv") into sample_bams
	script:
	"""
	cat "${bams}" \| sort \| while read B
	do
	samtools view -H "\${B}" \| grep '^@RG' \| tr '\\t' '\\n' \| grep ^SM -m1 \| cut -d ':' -f 2- \| tr '\\n' ',' >> sample_bams.csv
	echo "\${B}" >> sample_bams.csv
	done
	"""
	}

	sample_bams.splitCsv(header: false,sep:',',strip:true).
	combine(contigs.splitCsv(header: false,sep:',',strip:true)).
	into{sample_bam_contig}

	process call_gvcf {
	tag "call ${sample} ${bam} ${contig}"
	input:
	file ref from reference
	set sample,bam,contig from sample_bam_contig
	output:
	set contig,file("${sample}.csv") into sample_contig_bcf
	file("${sample}.${contig}.bcf")
	file("${sample}.${contig}.bcf.csi")
	script:
	"""

	bcftools mpileup --output-type u --fasta-ref "${ref.toRealPath()}" --regions "${contig}" -a 'INFO/AD' -a 'FORMAT/AD' -a 'FORMAT/DP' --gvcf 5,10,20,30 "${bam}" \|\
	bcftools call --gvcf 5,10,20,30 --multiallelic-caller --output-type b -o "${sample}.${contig}.bcf"

	bcftools index "${sample}.${contig}.bcf"

	echo "${sample},\${PWD}/${sample}.${contig}.bcf" > "${sample}.csv"
	"""
	}

	process combine_contig {
	tag "combine ${contig} N=${vcfs.size()}"
	input:
	file ref from reference
	set contig,vcfs from sample_contig_bcf.groupTuple()
	output:
	file("${contig}.bcf") into contig_merged
	file("${contig}.bcf.csi")
	script:
	"""
	# make sure we always use the samples in the same order
	cat ${vcfs.join(" ")} \| sort -T . -t, -k1,1 \| cut -d, -f 2 > vcf.list

	bcftools merge --merge all --gvcf "${ref.toRealPath()}" --file-list "vcf.list" --output-type 'u' \|\
	bcftools convert --gvcf2vcf --fasta-ref "${ref.toRealPath()}" --output-type 'u' \|\
	bcftools view -m 2 --output-type 'u' \|\
	bcftools norm --fasta-ref "${ref.toRealPath()}" --output-type 'b' -o "${contig}.bcf"
	bcftools index "${contig}.bcf"
	rm vcf.list
	"""
	}
	process combine_all{
	tag "combine N=${vcfs.size()}"
	input:
	file ref from reference
	val vcfs from contig_merged.collect()
	output:
	file("${params.prefix}vcf.gz")
	file("${params.prefix}vcf.gz.tbi")
	script:
	"""
	cat << __EOF__ \| sort > vcf.list
	${vcfs.join("\n")}
	__EOF__
	bcftools concat --file-list "vcf.list" --output-type 'z' -o "${params.prefix}vcf.gz"
	bcftools index --tbi "${params.prefix}vcf.gz"
	"""
	}