Created
January 23, 2020 12:08
-
-
Save lindenb/22e2b5dca0c73d654f17c728b330900b to your computer and use it in GitHub Desktop.
nextflow workflow for 'bcftools roh' . bioinformatics autozygosity vcf cnv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
params.prefix="output." | |
params.vcf="" | |
params.help=false | |
params.rohopts=" --rec-rate 100 --skip-indels " | |
params.pedigree = "NO_PED" | |
def helpMessage() { | |
log.info""" | |
========================================= | |
Usage: | |
Detects regions of autozygosity in sequencing data, including exome data, using a hidden Markov model. | |
See https://samtools.github.io/bcftools/howtos/roh-calling.html | |
Mandatory arguments: | |
--prefix output prefix | |
--publishDir <dir> Output directory. | |
--vcf <file> the vcf | |
Other options: | |
--pedigree <file> optional pedigree with phenotypes | |
--rohopts <string> options for roh (${params.rohopts}) | |
Nextflow options: | |
-w Work directory used by Nextflow. | |
workflow Author: Pierre Lindenbaum @yokofakun 20200122 | |
========================================= | |
""" | |
} | |
vcfIn = file(params.vcf) | |
pedIn = file(params.pedigree) | |
if( params.help ) { | |
helpMessage() | |
exit 0 | |
} | |
String jvarkit(String tool) { | |
return "\${JVARKIT_DIST}/" + tool + ".jar" | |
} | |
process download1kAF { | |
tag "download AF" | |
output: | |
file("1000GP-AFs/AFs.tab.gz") into af1kg | |
file("1000GP-AFs/AFs.tab.gz.hdr") into hdr1kg | |
file("1000GP-AFs/AFs.tab.gz.tbi") | |
""" | |
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/1000GP-AFs.tgz" | |
tar xzf jeter.tgz --wildcards 1000GP-AFs/\\* | |
touch -c 1000GP-AFs/AFs.tab.gz.tbi | |
rm jeter.tgz | |
""" | |
} | |
process downloadGeneticMap { | |
tag "download genetic-map" | |
output: | |
file("genetic-map") into genmap_dir | |
""" | |
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/genetic-map.tgz" | |
tar xfz jeter.tgz | |
rm jeter.tgz | |
""" | |
} | |
process chromNames { | |
executor 'local' | |
output: | |
file("rename.txt") into (rename_contig1,rename_contig2) | |
script: | |
""" | |
seq 1 22 | awk '{printf("chr%s\t%s\\n",\$1,\$1);}' > rename.txt | |
""" | |
} | |
process samplesContigs{ | |
input: | |
file vcf from vcfIn | |
file rename from rename_contig1 | |
output: | |
file("samples.txt") into sample_list | |
file("contigs.txt") into contig_list | |
script: | |
""" | |
bcftools view --header-only "${vcf}" | grep '^#CHROM' | cut -f 10- | tr "\\t" "\\n" > samples.txt | |
test -s samples.txt | |
tr "\\t" "\\n" < "${rename}" | sort | uniq > contig.pattern | |
bcftools index --stats "${vcf.toRealPath()}" | cut -f 1 | grep -F -w -f contig.pattern > contigs.txt | |
test -s contigs.txt | |
rm contig.pattern | |
""" | |
} | |
sample_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{samples} | |
contig_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{contigs} | |
process roh { | |
tag "${sample}/${contig}" | |
input: | |
file af1k from af1kg | |
file hdr1k from hdr1kg | |
file genmapdir from genmap_dir | |
file vcf from vcfIn | |
file rename from rename_contig2 | |
set sample,contig from samples.combine(contigs) | |
output: | |
set sample,contig,file("${sample}.${contig}.roh.txt.gz") into sn_ctg_roh | |
set contig,file("${sample}.bed") into sample_bed | |
script: | |
""" | |
bcftools view -O u --samples "${sample}" "${vcf.toRealPath()}" "${contig}" |\ | |
bcftools annotate -O u --rename-chrs ${rename} |\ | |
bcftools annotate -O u --columns "CHROM,POS,REF,ALT,AF1KG" --header-lines "${hdr1k}" --annotations "${af1k.toRealPath()}" | \ | |
bcftools roh --output-type "srz" --AF-tag AF1KG ${params.rohopts} -m ${genmapdir}/genetic_map_chr{CHROM}_combined_b37.txt -o ${sample}.${contig}.roh.txt.gz | |
gunzip -c "${sample}.${contig}.roh.txt.gz" | grep '^RG' | cut -f 3,4,5,8 | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n > ${sample}.bed | |
""" | |
} | |
process overlap { | |
tag "${contig}" | |
input: | |
set contig,list from sample_bed.groupTuple() | |
output: | |
file("${contig}.bed") into overlap_bed | |
script: | |
def sorted = list.sort{A,B->A.getSimpleName().compareTo(B.getSimpleName())} | |
""" | |
bedtools unionbedg -header -filler NA -i ${sorted.join(" ")} -names ${sorted.collect{F->F.getSimpleName()}.join(" ")} | sed 's/^chrom/#chrom/' > ${contig}.bed | |
""" | |
} | |
process mergeChroms { | |
tag "merge beds (N=${beds.size()}" | |
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true | |
input: | |
val beds from overlap_bed.collect() | |
output: | |
file("${params.prefix}roh.bed.gz") into merged_bed | |
script: | |
""" | |
cat ${beds.join(" ")} | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n | uniq > "${params.prefix}roh.bed" | |
gzip --best "${params.prefix}roh.bed" | |
""" | |
} | |
process applyPedigree { | |
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true | |
input: | |
file ped from pedIn | |
file bed from merged_bed | |
output: | |
file("${params.prefix}filtered.roh.bed.gz") into filtered_bed | |
when: | |
ped.name != "NO_PED" | |
script: | |
""" | |
cat << EOF > jeter.code | |
String header[] = null; | |
if(iter.hasNext()) { | |
String line = iter.next(); | |
header = line.split("[\\t]"); | |
println(line+"\tTYPE\tSAMPLE\tSCORE"); | |
} | |
final Set<String> affected = getPedigree().getAffectedSamples().stream().map(SN->SN.getId()).collect(Collectors.toSet()); | |
while(iter.hasNext()) { | |
String line = iter.next(); | |
String tokens[] = line.split("[\\t]"); | |
boolean ok = true; | |
double sum = 0.0; | |
int count = 0; | |
for(int i=3;i<tokens.length && i < header.length && ok;i++) { | |
boolean is_affected = affected.contains(header[i]); | |
boolean is_na = tokens[i].equals("NA"); | |
/* cases must have a value */ | |
if(is_affected) { | |
if(is_na) | |
{ | |
ok=false; | |
} | |
else | |
{ | |
count++; | |
sum+= Double.parseDouble(tokens[i]); | |
} | |
} | |
/* controls shouldn't have a value */ | |
if(!is_na && !is_affected) ok=false; | |
} | |
if(ok && count>0) { | |
println(line+"\tALL_AFFECTED\t.\t"+(sum/count)); | |
} | |
String singleton = null; | |
for(int i=3;i<tokens.length && i < header.length && affected.size()>1 ;i++) { | |
boolean is_affected = affected.contains(header[i]); | |
boolean is_na = tokens[i].equals("NA"); | |
/* cases */ | |
if(!is_na && is_affected) { | |
if(singleton!=null) { singleton=null;break;} | |
singleton = header[i]+"\t"+tokens[i]; | |
} | |
/* controls shouldn't have a value */ | |
if(!is_na && !is_affected) { singleton=null;break;} | |
} | |
if(singleton!=null) { | |
println(line+"\tSINGLETON\t"+singleton); | |
} | |
} | |
EOF | |
gunzip -c ${bed} | java -jar ${jvarkit("bioalcidaejdk")} --pedigree ${ped} --format TEXT -f jeter.code > ${params.prefix}filtered.roh.bed | |
gzip --best ${params.prefix}filtered.roh.bed | |
rm jeter.code | |
""" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment