lindenb/roh.nf

## roh.nf
params.prefix="output."
params.vcf=""
params.help=false
params.rohopts="  --rec-rate 100 --skip-indels "
params.pedigree = "NO_PED"

def helpMessage() {
  log.info"""

=========================================
Usage:

Detects regions of autozygosity in sequencing data, including exome data, using a hidden Markov model.
See https://samtools.github.io/bcftools/howtos/roh-calling.html

Mandatory arguments:
  --prefix  output prefix
  --publishDir <dir> Output directory.
  --vcf <file> the vcf

Other options:
  --pedigree <file> optional pedigree with phenotypes
  --rohopts <string> options for roh (${params.rohopts})

Nextflow options:
  -w  Work directory used by Nextflow.

workflow Author: Pierre Lindenbaum @yokofakun 20200122
=========================================

  """
}


vcfIn = file(params.vcf)
pedIn = file(params.pedigree)

if( params.help ) {
    helpMessage()
    exit 0
}


String jvarkit(String tool) {
        return "\${JVARKIT_DIST}/" + tool + ".jar"
	}

process download1kAF {
tag "download AF"

output:
	file("1000GP-AFs/AFs.tab.gz") into af1kg
	file("1000GP-AFs/AFs.tab.gz.hdr") into hdr1kg
	file("1000GP-AFs/AFs.tab.gz.tbi")

"""
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/1000GP-AFs.tgz"
tar  xzf jeter.tgz --wildcards 1000GP-AFs/\\*
touch -c 1000GP-AFs/AFs.tab.gz.tbi
rm jeter.tgz
"""
}

process downloadGeneticMap {
tag "download genetic-map"

output:
	file("genetic-map") into genmap_dir

"""
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/genetic-map.tgz"
tar xfz jeter.tgz
rm jeter.tgz
"""
}

process chromNames {
executor 'local'
output:
	file("rename.txt") into (rename_contig1,rename_contig2)
script:
"""
seq 1 22 | awk '{printf("chr%s\t%s\\n",\$1,\$1);}' > rename.txt
"""
}

process samplesContigs{
input:
	file vcf from vcfIn
	file rename from rename_contig1
output:
	file("samples.txt") into sample_list
	file("contigs.txt") into contig_list
script:
"""
bcftools view --header-only "${vcf}" | grep '^#CHROM' | cut -f 10- | tr "\\t" "\\n" > samples.txt
test -s samples.txt

tr "\\t" "\\n"  < "${rename}" | sort | uniq > contig.pattern

bcftools index  --stats  "${vcf.toRealPath()}" | cut -f 1 | grep -F -w -f contig.pattern > contigs.txt
test -s contigs.txt

rm contig.pattern
"""
}


sample_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{samples}
contig_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{contigs}

process roh {
tag "${sample}/${contig}"
input:
	file af1k from af1kg
	file hdr1k from hdr1kg
	file genmapdir from genmap_dir
	file vcf from vcfIn
	file rename from rename_contig2
	set sample,contig from samples.combine(contigs)
output:
	set sample,contig,file("${sample}.${contig}.roh.txt.gz") into sn_ctg_roh
	set contig,file("${sample}.bed") into sample_bed
script:
"""


bcftools view -O u --samples "${sample}" "${vcf.toRealPath()}" "${contig}" |\
bcftools annotate -O u --rename-chrs ${rename}  |\
bcftools annotate -O u --columns "CHROM,POS,REF,ALT,AF1KG" --header-lines "${hdr1k}" --annotations "${af1k.toRealPath()}" | \
bcftools roh --output-type "srz" --AF-tag AF1KG ${params.rohopts} -m ${genmapdir}/genetic_map_chr{CHROM}_combined_b37.txt -o ${sample}.${contig}.roh.txt.gz

gunzip -c "${sample}.${contig}.roh.txt.gz" | grep '^RG' | cut -f 3,4,5,8 | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n > ${sample}.bed

"""
}

process overlap {
tag "${contig}"
input:
	set contig,list from  sample_bed.groupTuple()
output:
	file("${contig}.bed") into overlap_bed
script:
	def sorted = list.sort{A,B->A.getSimpleName().compareTo(B.getSimpleName())}
"""

bedtools unionbedg -header -filler NA -i ${sorted.join(" ")} -names ${sorted.collect{F->F.getSimpleName()}.join(" ")}   | sed 's/^chrom/#chrom/' > ${contig}.bed

"""
}

process mergeChroms {
	tag "merge beds (N=${beds.size()}"
	publishDir "${params.publishDir}" ,  mode: 'copy', overwrite: true
	input:
		val beds from overlap_bed.collect()
	output:
		file("${params.prefix}roh.bed.gz") into merged_bed
	script:
	"""
	cat ${beds.join(" ")} | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n | uniq > "${params.prefix}roh.bed"

	gzip --best "${params.prefix}roh.bed"
	"""
	}

process applyPedigree {
	publishDir "${params.publishDir}" ,  mode: 'copy', overwrite: true
	input:
		file ped from pedIn
		file bed from merged_bed
	output:
		file("${params.prefix}filtered.roh.bed.gz") into filtered_bed
	when:
		ped.name != "NO_PED"
	script:
	"""
cat << EOF > jeter.code
String header[] = null;
if(iter.hasNext()) {
	String line = iter.next();
	header = line.split("[\\t]");
	println(line+"\tTYPE\tSAMPLE\tSCORE");
	}
final Set<String> affected =  getPedigree().getAffectedSamples().stream().map(SN->SN.getId()).collect(Collectors.toSet());


while(iter.hasNext()) {
	String line = iter.next();
	String tokens[] = line.split("[\\t]");
	boolean ok = true;
	double sum = 0.0;
	int count = 0;
	for(int i=3;i<tokens.length && i < header.length && ok;i++) {
		boolean is_affected = affected.contains(header[i]);
		boolean is_na = tokens[i].equals("NA");
		/* cases must have a value */
		if(is_affected) {
			if(is_na)
				{
				ok=false;

				}
			else
				{
				count++;
				sum+= Double.parseDouble(tokens[i]);
				}
			}
		/* controls shouldn't have a value */
		if(!is_na && !is_affected) ok=false;
		}
	if(ok && count>0) {
		println(line+"\tALL_AFFECTED\t.\t"+(sum/count));
		}
	String singleton = null;
	for(int i=3;i<tokens.length && i < header.length && affected.size()>1 ;i++) {
		boolean is_affected = affected.contains(header[i]);
		boolean is_na = tokens[i].equals("NA");
		/* cases */
		if(!is_na && is_affected) {
			if(singleton!=null) { singleton=null;break;}
			singleton = header[i]+"\t"+tokens[i];
			}
		/* controls shouldn't have a value */
		if(!is_na && !is_affected)  { singleton=null;break;}
		}
	if(singleton!=null) {
		println(line+"\tSINGLETON\t"+singleton);
		}
	}

EOF

	gunzip -c ${bed} | java -jar ${jvarkit("bioalcidaejdk")} --pedigree ${ped} --format TEXT -f jeter.code > ${params.prefix}filtered.roh.bed
	gzip --best ${params.prefix}filtered.roh.bed
	rm jeter.code

	"""
	}
	params.prefix="output."
	params.vcf=""
	params.help=false
	params.rohopts=" --rec-rate 100 --skip-indels "
	params.pedigree = "NO_PED"

	def helpMessage() {
	log.info"""

	=========================================
	Usage:

	Detects regions of autozygosity in sequencing data, including exome data, using a hidden Markov model.
	See https://samtools.github.io/bcftools/howtos/roh-calling.html

	Mandatory arguments:
	--prefix output prefix
	--publishDir <dir> Output directory.
	--vcf <file> the vcf

	Other options:
	--pedigree <file> optional pedigree with phenotypes
	--rohopts <string> options for roh (${params.rohopts})

	Nextflow options:
	-w Work directory used by Nextflow.

	workflow Author: Pierre Lindenbaum @yokofakun 20200122
	=========================================

	"""
	}



	vcfIn = file(params.vcf)
	pedIn = file(params.pedigree)

	if( params.help ) {
	helpMessage()
	exit 0
	}



	String jvarkit(String tool) {
	return "\${JVARKIT_DIST}/" + tool + ".jar"
	}

	process download1kAF {
	tag "download AF"

	output:
	file("1000GP-AFs/AFs.tab.gz") into af1kg
	file("1000GP-AFs/AFs.tab.gz.hdr") into hdr1kg
	file("1000GP-AFs/AFs.tab.gz.tbi")

	"""
	wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/1000GP-AFs.tgz"
	tar xzf jeter.tgz --wildcards 1000GP-AFs/\\*
	touch -c 1000GP-AFs/AFs.tab.gz.tbi
	rm jeter.tgz
	"""
	}

	process downloadGeneticMap {
	tag "download genetic-map"

	output:
	file("genetic-map") into genmap_dir

	"""
	wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/genetic-map.tgz"
	tar xfz jeter.tgz
	rm jeter.tgz
	"""
	}

	process chromNames {
	executor 'local'
	output:
	file("rename.txt") into (rename_contig1,rename_contig2)
	script:
	"""
	seq 1 22 \| awk '{printf("chr%s\t%s\\n",\$1,\$1);}' > rename.txt
	"""
	}

	process samplesContigs{
	input:
	file vcf from vcfIn
	file rename from rename_contig1
	output:
	file("samples.txt") into sample_list
	file("contigs.txt") into contig_list
	script:
	"""
	bcftools view --header-only "${vcf}" \| grep '^#CHROM' \| cut -f 10- \| tr "\\t" "\\n" > samples.txt
	test -s samples.txt

	tr "\\t" "\\n" < "${rename}" \| sort \| uniq > contig.pattern

	bcftools index --stats "${vcf.toRealPath()}" \| cut -f 1 \| grep -F -w -f contig.pattern > contigs.txt
	test -s contigs.txt

	rm contig.pattern
	"""
	}


	sample_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{samples}
	contig_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{contigs}

	process roh {
	tag "${sample}/${contig}"
	input:
	file af1k from af1kg
	file hdr1k from hdr1kg
	file genmapdir from genmap_dir
	file vcf from vcfIn
	file rename from rename_contig2
	set sample,contig from samples.combine(contigs)
	output:
	set sample,contig,file("${sample}.${contig}.roh.txt.gz") into sn_ctg_roh
	set contig,file("${sample}.bed") into sample_bed
	script:
	"""


	bcftools view -O u --samples "${sample}" "${vcf.toRealPath()}" "${contig}" \|\
	bcftools annotate -O u --rename-chrs ${rename} \|\
	bcftools annotate -O u --columns "CHROM,POS,REF,ALT,AF1KG" --header-lines "${hdr1k}" --annotations "${af1k.toRealPath()}" \| \
	bcftools roh --output-type "srz" --AF-tag AF1KG ${params.rohopts} -m ${genmapdir}/genetic_map_chr{CHROM}_combined_b37.txt -o ${sample}.${contig}.roh.txt.gz

	gunzip -c "${sample}.${contig}.roh.txt.gz" \| grep '^RG' \| cut -f 3,4,5,8 \| LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n > ${sample}.bed

	"""
	}

	process overlap {
	tag "${contig}"
	input:
	set contig,list from sample_bed.groupTuple()
	output:
	file("${contig}.bed") into overlap_bed
	script:
	def sorted = list.sort{A,B->A.getSimpleName().compareTo(B.getSimpleName())}
	"""

	bedtools unionbedg -header -filler NA -i ${sorted.join(" ")} -names ${sorted.collect{F->F.getSimpleName()}.join(" ")} \| sed 's/^chrom/#chrom/' > ${contig}.bed

	"""
	}

	process mergeChroms {
	tag "merge beds (N=${beds.size()}"
	publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
	input:
	val beds from overlap_bed.collect()
	output:
	file("${params.prefix}roh.bed.gz") into merged_bed
	script:
	"""
	cat ${beds.join(" ")} \| LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n \| uniq > "${params.prefix}roh.bed"

	gzip --best "${params.prefix}roh.bed"
	"""
	}

	process applyPedigree {
	publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
	input:
	file ped from pedIn
	file bed from merged_bed
	output:
	file("${params.prefix}filtered.roh.bed.gz") into filtered_bed
	when:
	ped.name != "NO_PED"
	script:
	"""
	cat << EOF > jeter.code
	String header[] = null;
	if(iter.hasNext()) {
	String line = iter.next();
	header = line.split("[\\t]");
	println(line+"\tTYPE\tSAMPLE\tSCORE");
	}
	final Set<String> affected = getPedigree().getAffectedSamples().stream().map(SN->SN.getId()).collect(Collectors.toSet());


	while(iter.hasNext()) {
	String line = iter.next();
	String tokens[] = line.split("[\\t]");
	boolean ok = true;
	double sum = 0.0;
	int count = 0;
	for(int i=3;i<tokens.length && i < header.length && ok;i++) {
	boolean is_affected = affected.contains(header[i]);
	boolean is_na = tokens[i].equals("NA");
	/* cases must have a value */
	if(is_affected) {
	if(is_na)
	{
	ok=false;

	}
	else
	{
	count++;
	sum+= Double.parseDouble(tokens[i]);
	}
	}
	/* controls shouldn't have a value */
	if(!is_na && !is_affected) ok=false;
	}
	if(ok && count>0) {
	println(line+"\tALL_AFFECTED\t.\t"+(sum/count));
	}
	String singleton = null;
	for(int i=3;i<tokens.length && i < header.length && affected.size()>1 ;i++) {
	boolean is_affected = affected.contains(header[i]);
	boolean is_na = tokens[i].equals("NA");
	/* cases */
	if(!is_na && is_affected) {
	if(singleton!=null) { singleton=null;break;}
	singleton = header[i]+"\t"+tokens[i];
	}
	/* controls shouldn't have a value */
	if(!is_na && !is_affected) { singleton=null;break;}
	}
	if(singleton!=null) {
	println(line+"\tSINGLETON\t"+singleton);
	}
	}

	EOF

	gunzip -c ${bed} \| java -jar ${jvarkit("bioalcidaejdk")} --pedigree ${ped} --format TEXT -f jeter.code > ${params.prefix}filtered.roh.bed
	gzip --best ${params.prefix}filtered.roh.bed
	rm jeter.code

	"""
	}