Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created January 23, 2020 12:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lindenb/22e2b5dca0c73d654f17c728b330900b to your computer and use it in GitHub Desktop.
Save lindenb/22e2b5dca0c73d654f17c728b330900b to your computer and use it in GitHub Desktop.
nextflow workflow for 'bcftools roh' . bioinformatics autozygosity vcf cnv
params.prefix="output."
params.vcf=""
params.help=false
params.rohopts=" --rec-rate 100 --skip-indels "
params.pedigree = "NO_PED"
def helpMessage() {
log.info"""
=========================================
Usage:
Detects regions of autozygosity in sequencing data, including exome data, using a hidden Markov model.
See https://samtools.github.io/bcftools/howtos/roh-calling.html
Mandatory arguments:
--prefix output prefix
--publishDir <dir> Output directory.
--vcf <file> the vcf
Other options:
--pedigree <file> optional pedigree with phenotypes
--rohopts <string> options for roh (${params.rohopts})
Nextflow options:
-w Work directory used by Nextflow.
workflow Author: Pierre Lindenbaum @yokofakun 20200122
=========================================
"""
}
vcfIn = file(params.vcf)
pedIn = file(params.pedigree)
if( params.help ) {
helpMessage()
exit 0
}
String jvarkit(String tool) {
return "\${JVARKIT_DIST}/" + tool + ".jar"
}
process download1kAF {
tag "download AF"
output:
file("1000GP-AFs/AFs.tab.gz") into af1kg
file("1000GP-AFs/AFs.tab.gz.hdr") into hdr1kg
file("1000GP-AFs/AFs.tab.gz.tbi")
"""
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/1000GP-AFs.tgz"
tar xzf jeter.tgz --wildcards 1000GP-AFs/\\*
touch -c 1000GP-AFs/AFs.tab.gz.tbi
rm jeter.tgz
"""
}
process downloadGeneticMap {
tag "download genetic-map"
output:
file("genetic-map") into genmap_dir
"""
wget -q -O jeter.tgz "http://ngs.sanger.ac.uk/production/samtools/genetic-map.tgz"
tar xfz jeter.tgz
rm jeter.tgz
"""
}
process chromNames {
executor 'local'
output:
file("rename.txt") into (rename_contig1,rename_contig2)
script:
"""
seq 1 22 | awk '{printf("chr%s\t%s\\n",\$1,\$1);}' > rename.txt
"""
}
process samplesContigs{
input:
file vcf from vcfIn
file rename from rename_contig1
output:
file("samples.txt") into sample_list
file("contigs.txt") into contig_list
script:
"""
bcftools view --header-only "${vcf}" | grep '^#CHROM' | cut -f 10- | tr "\\t" "\\n" > samples.txt
test -s samples.txt
tr "\\t" "\\n" < "${rename}" | sort | uniq > contig.pattern
bcftools index --stats "${vcf.toRealPath()}" | cut -f 1 | grep -F -w -f contig.pattern > contigs.txt
test -s contigs.txt
rm contig.pattern
"""
}
sample_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{samples}
contig_list.splitCsv(header: false,sep:',',strip:true).map{T->T[0]}.set{contigs}
process roh {
tag "${sample}/${contig}"
input:
file af1k from af1kg
file hdr1k from hdr1kg
file genmapdir from genmap_dir
file vcf from vcfIn
file rename from rename_contig2
set sample,contig from samples.combine(contigs)
output:
set sample,contig,file("${sample}.${contig}.roh.txt.gz") into sn_ctg_roh
set contig,file("${sample}.bed") into sample_bed
script:
"""
bcftools view -O u --samples "${sample}" "${vcf.toRealPath()}" "${contig}" |\
bcftools annotate -O u --rename-chrs ${rename} |\
bcftools annotate -O u --columns "CHROM,POS,REF,ALT,AF1KG" --header-lines "${hdr1k}" --annotations "${af1k.toRealPath()}" | \
bcftools roh --output-type "srz" --AF-tag AF1KG ${params.rohopts} -m ${genmapdir}/genetic_map_chr{CHROM}_combined_b37.txt -o ${sample}.${contig}.roh.txt.gz
gunzip -c "${sample}.${contig}.roh.txt.gz" | grep '^RG' | cut -f 3,4,5,8 | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n > ${sample}.bed
"""
}
process overlap {
tag "${contig}"
input:
set contig,list from sample_bed.groupTuple()
output:
file("${contig}.bed") into overlap_bed
script:
def sorted = list.sort{A,B->A.getSimpleName().compareTo(B.getSimpleName())}
"""
bedtools unionbedg -header -filler NA -i ${sorted.join(" ")} -names ${sorted.collect{F->F.getSimpleName()}.join(" ")} | sed 's/^chrom/#chrom/' > ${contig}.bed
"""
}
process mergeChroms {
tag "merge beds (N=${beds.size()}"
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
input:
val beds from overlap_bed.collect()
output:
file("${params.prefix}roh.bed.gz") into merged_bed
script:
"""
cat ${beds.join(" ")} | LC_ALL=C sort -T . -t '\t' -k1,1 -k2,2n | uniq > "${params.prefix}roh.bed"
gzip --best "${params.prefix}roh.bed"
"""
}
process applyPedigree {
publishDir "${params.publishDir}" , mode: 'copy', overwrite: true
input:
file ped from pedIn
file bed from merged_bed
output:
file("${params.prefix}filtered.roh.bed.gz") into filtered_bed
when:
ped.name != "NO_PED"
script:
"""
cat << EOF > jeter.code
String header[] = null;
if(iter.hasNext()) {
String line = iter.next();
header = line.split("[\\t]");
println(line+"\tTYPE\tSAMPLE\tSCORE");
}
final Set<String> affected = getPedigree().getAffectedSamples().stream().map(SN->SN.getId()).collect(Collectors.toSet());
while(iter.hasNext()) {
String line = iter.next();
String tokens[] = line.split("[\\t]");
boolean ok = true;
double sum = 0.0;
int count = 0;
for(int i=3;i<tokens.length && i < header.length && ok;i++) {
boolean is_affected = affected.contains(header[i]);
boolean is_na = tokens[i].equals("NA");
/* cases must have a value */
if(is_affected) {
if(is_na)
{
ok=false;
}
else
{
count++;
sum+= Double.parseDouble(tokens[i]);
}
}
/* controls shouldn't have a value */
if(!is_na && !is_affected) ok=false;
}
if(ok && count>0) {
println(line+"\tALL_AFFECTED\t.\t"+(sum/count));
}
String singleton = null;
for(int i=3;i<tokens.length && i < header.length && affected.size()>1 ;i++) {
boolean is_affected = affected.contains(header[i]);
boolean is_na = tokens[i].equals("NA");
/* cases */
if(!is_na && is_affected) {
if(singleton!=null) { singleton=null;break;}
singleton = header[i]+"\t"+tokens[i];
}
/* controls shouldn't have a value */
if(!is_na && !is_affected) { singleton=null;break;}
}
if(singleton!=null) {
println(line+"\tSINGLETON\t"+singleton);
}
}
EOF
gunzip -c ${bed} | java -jar ${jvarkit("bioalcidaejdk")} --pedigree ${ped} --format TEXT -f jeter.code > ${params.prefix}filtered.roh.bed
gzip --best ${params.prefix}filtered.roh.bed
rm jeter.code
"""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment