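// Nextflow (DSL1) pipeline by @lindenb (gist, 2019-03-19).
// Trio variant calling on hs37d5: parallel per-region bcftools mpileup/call,
// per-chromosome merge with GATK3 CombineVariants, a long annotation chain
// (dbSNP, snpEff, jvarkit, gnomAD, CADD, ...) and a final Picard GatherVcfs merge.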
REF = "/ccc/work/cont007/fg/fg/biobank/by-taxonid/9606/hs37d5/hs37d5_all_chr.fasta"
gatk = "/ccc/cont007/home/fg/fg/products/gatk-3.8.0/el7-x86_64-generic/GenomeAnalysisTK.jar"
dbsnp = "/ccc/work/cont007/fg/fg/biobank/by-taxonid/9606/hs37d5/variants/hs37d5_all_chr_dbsnp-142.vcf"
snpeff_jar = "/ccc/cont007/home/fg/fg/products/snpeff-4.2/snpEff.jar"
snpeff_config = "/ccc/cont007/dsku/zinc/home/app/fg/fg/products/snpeff-4.2/snpEff.config"
picard = "/ccc/work/cont007/fg0019/lindenbp/packages/picard-tools-2.17.11/picard.jar"
params.prefix="20190314.TRIOS.SAMTOOLS"
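// Builds the "module load ..." preamble required on the compute nodes; e.g. it
// expands to "module load java/oracle/1.8.0_192\nmodule load r\n..." (the
// datadir/ entry depends on the 'ccc.project' JVM system property).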
String modulesLoads() {
def m = ["java/oracle/1.8.0_192","r","datadir/"+System.getProperty("ccc.project",""),"extenv/fg","samtools","bcftools","bedtools"]
return m.collect{ "module load " + it }.join("\n") + "\n"
}
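// Resolves a jvarkit tool name to its jar path, invoked later as e.g.
// java -jar ${jvarkit("vcftrio")} ...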
String jvarkit(String tool) {
return "/ccc/work/cont007/fg0019/lindenbp/packages/jvarkit-git/dist/"+tool+".jar";
}
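// Writes the pedigree. PED columns: family, individual, father, mother,
// sex (1=male, 2=female, 0=unknown) and phenotype (2=affected, 1=unaffected, 0=missing).
// R, D and M are the affected trios; the remaining samples are unrelated.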
process makePedigree {
executor 'ccrt'
tag "make pedigree"
cache 'lenient'
output:
file("pedigree.ped") into pedigree
script:
"""
cat << EOF > pedigree.ped
R B00I9FS B00I9FQ B00I9FR 0 2
R B00I9FQ 0 0 1 1
R B00I9FR 0 0 2 1
D C0009H9 0 0 1 1
D C0009H8 0 0 2 1
D C0009H7 C0009H9 C0009H8 1 2
M B00I9FW B00I9FU B00I9FV 0 2
M B00I9FU 0 0 0 1
M B00I9FV 0 0 0 1
B00I9FD B00I9FD 0 0 0 0
B00I9FE B00I9FE 0 0 0 0
B00I9FF B00I9FF 0 0 0 0
B00I9FG B00I9FG 0 0 0 0
B00I9FH B00I9FH 0 0 0 0
B00I9FI B00I9FI 0 0 0 0
B00I9FJ B00I9FJ 0 0 0 0
B00I9FK B00I9FK 0 0 0 0
B00I9FL B00I9FL 0 0 0 0
B00I9FM B00I9FM 0 0 0 0
B00I9FN B00I9FN 0 0 0 0
B00I9FO B00I9FO 0 0 0 0
B00I9FP B00I9FP 0 0 0 0
X B00GG9B 0 0 0 0
X B00GG9C 0 0 0 0
X B00GG9D 0 0 0 0
EOF
"""
}
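// Lists the BAMs of every individual named in the pedigree (columns 2-4),
// ignoring the '0' placeholders; the trailing grep fails the task if no BAM was found.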
process makeBamList {
executor 'ccrt'
tag "collect BAMs"
cache 'lenient'
input:
file ped from pedigree
output:
file("bam.list") into bam_list
script:
"""
set -o pipefail
cut -f 2,3,4 "${ped}" | tr "\t" "\n" | sort | uniq | grep -v '^0\$' > select.txt
find /ccc/genostore/cont007/fg0073/fg0073/analyse/projet_TROUCARD_605/ -type f -name "*.bam" |\
grep -f select.txt | sort > bam.list
grep -F -m1 .bam bam.list
rm select.txt
"""
}
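// Builds the calling regions: Gencode v19 gene bodies merged with bedtools,
// then split into ~1Mb windows around gaps/genes with jvarkit faidxsplitter.
// Start coordinates of 0 are bumped to 1 because bcftools regions are 1-based.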
process makeBed {
executor 'ccrt'
tag "collect regions"
cache 'lenient'
output:
file("split.bed") into split_bed
file("distinct.chroms.txt") into distinct_chroms
script:
"""
set -o pipefail
${modulesLoads()}
gunzip -c /ccc/work/cont007/fg0019/lindenbp/packages/ucsc/hg19/wgEncodeGencodeCompV19.txt.gz | cut -f 3,5,6 | grep -v '_' |\
sort -t '\t' -k1,1V -k2,2n | bedtools merge > genes.bed
java -jar ${jvarkit("faidxsplitter")} \
--gaps /ccc/work/cont007/fg0019/lindenbp/ucsc/hg19.gaps.bed \
--genes genes.bed \
-w 1000000 \
-R ${REF} > jeter1.bed
awk -F ' ' '/^chr/ {printf("%s\t%s\t%s\\n",\$1,(\$2=="0"?"1":\$2),\$3);}' jeter1.bed > jeter2.bed
cp jeter2.bed split.bed
cut -f 1 split.bed | uniq > distinct.chroms.txt
rm genes.bed jeter1.bed jeter2.bed
"""
}
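// One calling task per line of split.bed (splitCsv): bcftools mpileup/call
// (multiallelic caller, variants only), then left-align/normalize against the reference.
// Creating a STOP file in the work dir makes pending tasks fail, aborting the run.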
process callBcftools {
executor 'ccrt'
errorStrategy 'finish'
tag "call ${chrom}:${start}-${end}"
cache false
maxForks 80
input:
file bams from bam_list
set chrom,start,end from split_bed.splitCsv(header: false,sep:'\t',strip:true)
output:
set chrom,start,end,file("${chrom}_${start}_${end}.calling.vcf.gz"),file("${chrom}_${start}_${end}.calling.vcf.gz.tbi") into called_vcfs
script:
"""
set -o pipefail
${modulesLoads()}
test \\! -f "${workflow.workDir}/STOP"
bcftools mpileup --regions "${chrom}:${start}-${end}" -a 'FORMAT/AD' -a 'FORMAT/DP' -a 'INFO/AD' -O u -f ${REF} --bam-list ${bams} --output jeter.bcf
bcftools call --ploidy GRCh37 --multiallelic-caller -O u --variants-only -o jeter2.bcf jeter.bcf
rm jeter.bcf
bcftools norm -f ${REF} -O z -o "${chrom}_${start}_${end}.calling.vcf.gz" jeter2.bcf
rm jeter2.bcf
bcftools index --tbi --force "${chrom}_${start}_${end}.calling.vcf.gz"
"""
}
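// Collects every (chrom,start,end,vcf,tbi) tuple into called.csv so the
// per-chromosome merge can look its intervals up in a single file.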
process allCalled {
executor 'ccrt'
tag "allCalled"
cache false
input:
val vcfs from called_vcfs.map{T->T.join(",")}.collect()
output:
file("called.csv") into called_file
script:
"""
cat << EOF > called.csv
${vcfs.join("\n")}
EOF
"""
}
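// Selects this chromosome's interval VCFs from called.csv, sorts them by start,
// and concatenates them with GATK3 CombineVariants.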
process mergeChromosomes {
executor 'ccrt'
tag "merge ${chrom}"
cache false
input:
file vcfs from called_file
val chrom from distinct_chroms.splitText().map{L->L.trim()}
output:
set chrom,file("${chrom}.merged.vcf.gz"),file("${chrom}.merged.vcf.gz.tbi") into merged_contig_vcf
script:
"""
${modulesLoads()}
set -o pipefail
awk -F, -v C=${chrom} '(\$1==C)' '${vcfs}' | sort -t, -k2,2n | cut -d, -f 4 > vcfs.list
java -Xmx5g -Djava.io.tmpdir=. -jar ${gatk} -T CombineVariants -R ${REF} --disable_auto_index_creation_and_locking_when_reading_rods \
-L "${chrom}" -V vcfs.list \
-o "${chrom}.merged.vcf.gz" -genotypeMergeOptions UNSORTED -setKey null -suppressCommandLineHeader
rm vcfs.list
bcftools index --tbi --force "${chrom}.merged.vcf.gz"
"""
}
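// Unpacks the pre-downloaded snpEff GRCh37.75 database so annotation can run
// offline (-nodownload).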
process snpEffDataDir {
executor 'ccrt'
tag "snpeff data"
cache false
output:
file("snpeff/data/done.txt") into snpeff_done
script:
"""
mkdir snpeff
unzip -o /ccc/work/cont007/fg/fg/biobank/by-soft/snpeff/4.2/snpeff-4.2-hs37d5-75.zip -d ./snpeff
touch snpeff/data/done.txt
"""
}
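// Heng Li's low-complexity regions (doi:10.1093/bioinformatics/btu356),
// 'chr'-prefixed to match the calling contigs; used below as a VariantFiltration mask.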
process HengLiBed {
executor 'ccrt'
tag "snpeff data"
output:
file("hengli.bed.gz") into hengli_bed
script:
"""
${modulesLoads()}
gunzip -c /ccc/work/cont007/fg0019/lindenbp/packages/10.1093.bioinformatics.btu356/btu356_LCR-hs37d5.bed.gz | sed 's/^/chr/' | bgzip > hengli.bed.gz
tabix -f -p bed hengli.bed.gz
"""
}
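// Per-chromosome annotation chain; each step rewrites tmp1.vcf in place:
// dbSNP/VariantType/HomopolymerRun, PossibleDeNovo, jvarkit vcftrio, dukeMap35
// mappability bigWig, mean-depth filter, CADD, gnomAD, allele-frequency filter,
// snpEff, sequence-ontology filter, cardiogene panel, GENMED frequencies, and a
// final VariantFiltration pass (dukeMap35 < 1 plus the Heng Li mask).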
process annotChromosome {
executor 'ccrt'
tag "annot ${chrom}"
cache 'lenient'
input:
file ped from pedigree
file regions_bed from split_bed
file snpeff from snpeff_done
file henglibed from hengli_bed
set chrom,merged,tbi from merged_contig_vcf
output:
set chrom,file("${chrom}.annot.vcf.gz"),file("${chrom}.annot.vcf.gz.tbi") into contig_annot_vcf
script:
"""
${modulesLoads()}
test \\! -f "${workflow.workDir}/STOP"
awk -F '\t' -v C=${chrom} '(\$1==C)' "${regions_bed}" > target.bed
## add dbsnp
java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} --disable_auto_index_creation_and_locking_when_reading_rods -T VariantAnnotator -R ${REF} -V `readlink -f "${merged}"` -A VariantType -A HomopolymerRun --dbsnp "${dbsnp}" -L target.bed -o "tmp2.vcf"
mv "tmp2.vcf" "tmp1.vcf" && rm -f tmp2.vcf.idx
## add ped
java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} --disable_auto_index_creation_and_locking_when_reading_rods -T VariantAnnotator -R ${REF} -V "tmp1.vcf" -ped ${ped} -A PossibleDeNovo --pedigreeValidationType SILENT -L target.bed -o "tmp2.vcf" && mv tmp2.vcf tmp1.vcf && rm -f tmp2.vcf.idx
## jvarkit vcftrio: samtools/bcftools calls carry no GQ, which GATK's de-novo annotation requires
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcftrio")} --attribute DENOVO_SAMPLES --pedigree ${ped} tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
## add bigwig for wgEncodeDukeMapabilityUniqueness35bp
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfbigwig")} -T dukeMap35 -B /ccc/work/cont007/fg/fg/biobank/by-name/Homo_sapiens/hg19/annot/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
## lowdp
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcffilterjdk")} --filter AVG_DEPTH_LT_10 -e 'return variant.getGenotypes().stream().filter(G->G.isCalled() && G.hasDP()).mapToInt(G->G.getDP()).average().orElse(0)>=10;' tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
## cadd
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfcadd")} -u "/ccc/work/cont007/fg/fg/biobank/by-name/Homo_sapiens/hs37d5/annot/cadd/1.3/whole_genome_SNVs.tsv.gz" tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
## gnomad
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfgnomad")} --gnomadFilter IN_GNOMAD --overlapFilter CTG_POS_IN_GNOMAD -m "/ccc/work/cont007/fg0019/lindenbp/packages/broad/gnomad/vcf.manifest" tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## vcf vcfafinfofilter
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfafinfofilter")} -gtf GNOMAD --filter "GNOMAD_NFE_GT_1_PER_1000" -t "0.001" -nfe tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## annotate snpeff
java -Xmx3g -Djava.io.tmpdir=. -jar "${snpeff_jar}" eff -config "${snpeff_config}" -dataDir `readlink -f ${snpeff} | sed 's%/done.txt%%' ` -verbose -nodownload -noNextProt -noMotif -noInteraction -noLog -noStats -chr chr -i vcf -o vcf GRCh37.75 tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## sequence ontology
java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcffilterso")} --filterout NOT_PROTEIN_ALTERING -A "SO:0001818" -A "SO:0001568" tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## cardiogene gene panel
java -Djava.io.tmpdir=. -jar ${jvarkit("vcfburdenfiltergenes")} --filter NOT_CARDIOGENE -g /ccc/work/cont007/fg0019/lindenbp/CARDIOGENE/20181015.cardiogene.txt tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## genmed
java -Djava.io.tmpdir=. -jar ${jvarkit("vcfpeekvcf")} --prefix GENMED_ --tags 'AC,AN,AF' --tabix /ccc/store/cont007/fg0039/lindenbp/20180323.FRENCHWGS/20180323.FRENCHWGS.REF0002.vcf.gz tmp1.vcf > tmp2.vcf
mv -v tmp2.vcf tmp1.vcf
## Heng Li mask + mappability filter
java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} -T VariantFiltration -R ${REF} -L target.bed --disable_auto_index_creation_and_locking_when_reading_rods --filterExpression "dukeMap35 < 1.0" --filterName "DUKEMAP35_LT_1" --mask `readlink -f ${henglibed}` --maskName "HENG_LI_MASK" -V tmp1.vcf -o tmp2.vcf && mv tmp2.vcf tmp1.vcf && rm -f tmp2.vcf.idx
bgzip -f tmp1.vcf
bcftools index -f -t tmp1.vcf.gz
## sanity check: the snpEff ANN annotation must be present
bcftools view tmp1.vcf.gz | grep 'ANN=' -m1
mv --verbose "tmp1.vcf.gz" "${chrom}.annot.vcf.gz"
mv --verbose "tmp1.vcf.gz.tbi" "${chrom}.annot.vcf.gz.tbi"
rm -f target.bed tmp1.vcf tmp1.vcf.idx tmp2.vcf tmp2.vcf.idx
"""
}
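// Orders the annotated VCFs by their contig's rank in the reference .fai
// (join on contig name) so GatherVcfs receives them in reference order.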
process makeVcfListAnnotContigs {
tag "making list from ${vcfVector.size()} vcfs"
cache 'lenient'
input:
val vcfVector from contig_annot_vcf.map{L->L[0]+","+L[1]+","+L[2]}.collect()
output:
file("vcf.list") into vcf_all_list
script:
"""
set -o pipefail
awk -F '\\t' '{printf("%s,%d\\n",\$1,NR);}' "${REF}.fai" | LC_ALL=C sort -t, -k1,1 > tmp1.txt
cat << __EOF__ | LC_ALL=C sort -t, -k1,1 > tmp2.txt
${vcfVector.join("\n")}
__EOF__
LC_ALL=C join -t , -1 1 -2 1 tmp1.txt tmp2.txt | LC_ALL=C sort -t, -k2,2n -k3,3 | cut -d, -f 3 > vcf.list
grep -m1 '.vcf.gz\$' vcf.list > /dev/null
"""
}
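// Gathers the ordered per-chromosome VCFs into the final ${params.prefix}.vcf.gz
// with Picard GatherVcfs, then validates (bcftools view) and tabix-indexes it.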
process mergeVcf {
tag "making list from ${vcflist} "
cache 'lenient'
input:
file vcflist from vcf_all_list
output:
set file("${params.prefix}.vcf.gz"),file("${params.prefix}.vcf.gz.tbi") into merged_vcf
script:
"""
java -jar ${picard} GatherVcfs I=${vcflist} O=${params.prefix}.vcf.gz
bcftools view "${params.prefix}.vcf.gz" > /dev/null
bcftools index -f -t ${params.prefix}.vcf.gz
"""
}