-
-
Save lindenb/a0ca7ec35ae36c9d24f338190768c73c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// --- Pipeline-wide configuration: reference genome and tool locations ---
// hs37d5 reference FASTA used by bcftools/GATK throughout the workflow.
REF = "/ccc/work/cont007/fg/fg/biobank/by-taxonid/9606/hs37d5/hs37d5_all_chr.fasta"
// GATK 3.8 jar (legacy -T engine: CombineVariants, VariantAnnotator, VariantFiltration).
gatk = "/ccc/cont007/home/fg/fg/products/gatk-3.8.0/el7-x86_64-generic/GenomeAnalysisTK.jar"
// dbSNP-142 VCF matching the hs37d5 reference (used by VariantAnnotator --dbsnp).
dbsnp = "/ccc/work/cont007/fg/fg/biobank/by-taxonid/9606/hs37d5/variants/hs37d5_all_chr_dbsnp-142.vcf"
// snpEff 4.2 jar and its configuration file (GRCh37.75 database).
snpeff_jar = "/ccc/cont007/home/fg/fg/products/snpeff-4.2/snpEff.jar"
snpeff_config = "/ccc/cont007/dsku/zinc/home/app/fg/fg/products/snpeff-4.2/snpEff.config"
// Picard tools jar (GatherVcfs in the final merge step).
picard = "/ccc/work/cont007/fg0019/lindenbp/packages/picard-tools-2.17.11/picard.jar"
// Basename of the final merged VCF; overridable on the command line with --prefix.
params.prefix = "20190314.TRIOS.SAMTOOLS"
/**
 * Build the shell snippet that loads the environment modules required by
 * the cluster jobs (java, R, project data dir, samtools, bcftools, bedtools...).
 * The "ccc.project" JVM system property selects the per-project datadir module
 * (empty string when unset).
 * @return one "module load X" line per module, each newline-terminated.
 */
String modulesLoads() {
	def modules = [
		"java/oracle/1.8.0_192",
		"r",
		"datadir/" + System.getProperty("ccc.project", ""),
		"extenv/fg",
		"samtools",
		"bcftools",
		"bedtools"
		]
	// One "module load" statement per line; keep the trailing newline so the
	// snippet can be interpolated at the top of a script block.
	return modules.collect { "module load " + it + "\n" }.join("")
}
/**
 * Resolve the jar path of a jvarkit tool by name.
 * @param tool jvarkit tool name, e.g. "vcftrio" or "vcfbigwig"
 * @return absolute path to that tool's jar in the local jvarkit-git dist directory
 */
String jvarkit(String tool) {
	return "/ccc/work/cont007/fg0019/lindenbp/packages/jvarkit-git/dist/" + tool + ".jar";
}
// Emit the hard-coded pedigree file.
// Columns: family, individual, father, mother, sex (0=unknown,1=male,2=female),
// status (0=unknown,1=unaffected,2=affected). Trios R/D/M carry phenotype info;
// the remaining samples are unrelated singletons.
process makePedigree {
	executor 'ccrt'
	tag "make pedigree"
	cache 'lenient'
	output:
		file("pedigree.ped") into pedigree
	script:
	"""
cat << EOF > pedigree.ped
R B00I9FS B00I9FQ B00I9FR 0 2
R B00I9FQ 0 0 1 1
R B00I9FR 0 0 2 1
D C0009H9 0 0 1 1
D C0009H8 0 0 2 1
D C0009H7 C0009H9 C0009H8 1 2
M B00I9FW B00I9FU B00I9FV 0 2
M B00I9FU 0 0 0 1
M B00I9FV 0 0 0 1
B00I9FD B00I9FD 0 0 0 0
B00I9FE B00I9FE 0 0 0 0
B00I9FF B00I9FF 0 0 0 0
B00I9FG B00I9FG 0 0 0 0
B00I9FH B00I9FH 0 0 0 0
B00I9FI B00I9FI 0 0 0 0
B00I9FJ B00I9FJ 0 0 0 0
B00I9FK B00I9FK 0 0 0 0
B00I9FL B00I9FL 0 0 0 0
B00I9FM B00I9FM 0 0 0 0
B00I9FN B00I9FN 0 0 0 0
B00I9FO B00I9FO 0 0 0 0
B00I9FP B00I9FP 0 0 0 0
X B00GG9B 0 0 0 0
X B00GG9C 0 0 0 0
X B00GG9D 0 0 0 0
EOF
	"""
}
// Collect the BAM files for every sample named in the pedigree.
// Extracts sample/father/mother ids from the pedigree, finds matching *.bam
// files under the project directory, and writes the sorted list to bam.list.
// The final grep asserts that at least one BAM was found (fails the task otherwise).
process makeBamList {
	executor 'ccrt'
	tag "collect BAMs"
	cache 'lenient'
	input:
		file ped from pedigree
	output:
		file("bam.list") into bam_list
	script:
	"""
	set -o pipefail
	cut -f 2,3,4 "${ped}" | tr "\t" "\n" | sort | uniq | grep -v '^0\$' > select.txt
	find /ccc/genostore/cont007/fg0073/fg0073/analyse/projet_TROUCARD_605/ -type f -name "*.bam" |\
		grep -f select.txt | sort > bam.list
	grep -F -m1 .bam bam.list
	rm select.txt
	"""
}
// Split the genome into calling regions.
// Builds a merged gene BED from Gencode v19, then runs jvarkit faidxsplitter
// to cut the reference into ~1Mb windows that avoid splitting genes and skip
// assembly gaps. Emits the split BED plus the list of distinct chromosomes.
process makeBed {
	executor 'ccrt'
	tag "collect regions"
	cache 'lenient'
	output:
		file("split.bed") into split_bed
		file("distinct.chroms.txt") into distinct_chroms
	script:
	"""
	set -o pipefail
	${modulesLoads()}
	gunzip -c /ccc/work/cont007/fg0019/lindenbp/packages/ucsc/hg19/wgEncodeGencodeCompV19.txt.gz | cut -f 3,5,6 | grep -v '_' |\
		sort -t '	' -k1,1V -k2,2n | bedtools merge > genes.bed
	java -jar ${jvarkit("faidxsplitter")} \
		--gaps /ccc/work/cont007/fg0019/lindenbp/ucsc/hg19.gaps.bed \
		--genes genes.bed \
		-w 1000000 \
		-R ${REF} > jeter1.bed
	awk -F '	' '/^chr/ {printf("%s\t%s\t%s\\n",\$1,(\$2=="0"?"1":\$2),\$3);}' jeter1.bed > jeter2.bed
	cp jeter2.bed split.bed
	cut -f 1 split.bed | uniq > distinct.chroms.txt
	rm genes.bed jeter1.bed jeter2.bed
	"""
}
// Variant calling with samtools/bcftools on one BED interval.
// mpileup -> call (multiallelic, variants only) -> norm (left-align/split),
// producing an indexed bgzipped VCF per interval. The STOP-file test lets the
// operator abort the whole fan-out by touching \$workDir/STOP.
process callBcftools {
	executor 'ccrt'
	errorStrategy 'finish'
	tag "call ${chrom}:${start}-${end}"
	cache false
	maxForks 80
	input:
		file bams from bam_list
		set chrom,start,end from split_bed.splitCsv(header: false,sep:'\t',strip:true)
	output:
		set chrom,start,end,file("${chrom}_${start}_${end}.calling.vcf.gz"),file("${chrom}_${start}_${end}.calling.vcf.gz.tbi") into called_vcfs
	script:
	"""
	set -o pipefail
	${modulesLoads()}
	test \\! -f "${workflow.workDir}/STOP"
	bcftools mpileup --regions "${chrom}:${start}-${end}" -a 'FORMAT/AD' -a 'FORMAT/DP' -a 'INFO/AD' -O u -f ${REF} --bam-list ${bams} --output jeter.bcf
	bcftools call --ploidy GRCh37 --multiallelic-caller -O u --variants-only -o jeter2.bcf jeter.bcf
	rm jeter.bcf
	bcftools norm -f ${REF} -O z -o "${chrom}_${start}_${end}.calling.vcf.gz" jeter2.bcf
	rm jeter2.bcf
	bcftools index --tbi --force "${chrom}_${start}_${end}.calling.vcf.gz"
	"""
}
// Barrier: collect every (chrom,start,end,vcf,tbi) tuple from the calling
// fan-out into a single CSV file (one comma-joined tuple per line) so the
// per-chromosome merge can start once all intervals are done.
process allCalled {
	executor 'ccrt'
	tag "allCalled"
	cache false
	input:
		val vcfs from called_vcfs.map{T->T.join(",")}.collect()
	output:
		file("called.csv") into called_file
	script:
	"""
cat << EOF > called.csv
${vcfs.join("\n")}
EOF
	"""
}
// Merge all interval VCFs belonging to one chromosome into a single VCF.
// Selects the chromosome's rows from called.csv, orders them by start
// position, and feeds the VCF paths to GATK CombineVariants.
process mergeChromosomes {
	executor 'ccrt'
	tag "merge ${chrom}"
	cache false
	input:
		file vcfs from called_file
		val chrom from distinct_chroms.splitText().map{L->L.trim()}
	output:
		set chrom,file("${chrom}.merged.vcf.gz"),file("${chrom}.merged.vcf.gz.tbi") into merged_contig_vcf
	script:
	"""
	${modulesLoads()}
	set -o pipefail
	## rows: chrom,start,end,vcf,tbi -> keep this chromosome, sort by start, keep vcf path
	awk -F, -v C=${chrom} '(\$1==C)' '${vcfs}' | sort -t, -k2,2n | cut -d, -f 4 > vcfs.list
	java -Xmx5g -Djava.io.tmpdir=. -jar ${gatk} -T CombineVariants -R ${REF} --disable_auto_index_creation_and_locking_when_reading_rods \
		-L "${chrom}" -V vcfs.list \
		-o "${chrom}.merged.vcf.gz" -genotypeMergeOptions UNSORTED -setKey null -suppressCommandLineHeader
	rm vcfs.list
	bcftools index --tbi --force "${chrom}.merged.vcf.gz"
	"""
}
// Unpack the snpEff hs37d5/GRCh37.75 database into a local data directory.
// The done.txt sentinel is what downstream processes stage; its dirname is
// passed to snpEff as -dataDir.
process snpEffDataDir {
	executor 'ccrt'
	tag "snpeff data"
	cache false
	output:
		file("snpeff/data/done.txt") into snpeff_done
	script:
	"""
	mkdir snpeff
	unzip -o /ccc/work/cont007/fg/fg/biobank/by-soft/snpeff/4.2/snpeff-4.2-hs37d5-75.zip -d ./snpeff
	touch snpeff/data/done.txt
	"""
}
// Prepare Heng Li's low-complexity-region (LCR) mask for hs37d5
// (Li 2014, btu356): prefix contigs with "chr", bgzip and tabix-index it
// for use as a --mask in VariantFiltration.
process HengLiBed {
	executor 'ccrt'
	// fixed: previous tag "snpeff data" was copy-pasted from snpEffDataDir
	tag "heng li LCR bed"
	output:
		file("hengli.bed.gz") into hengli_bed
	script:
	"""
	${modulesLoads()}
	gunzip -c /ccc/work/cont007/fg0019/lindenbp/packages/10.1093.bioinformatics.btu356/btu356_LCR-hs37d5.bed.gz | sed 's/^/chr/' | bgzip > hengli.bed.gz
	tabix -f -p bed hengli.bed.gz
	"""
}
// Annotation chain for one merged chromosome VCF. Each step reads tmp1.vcf
// and writes tmp2.vcf, then rotates tmp2 -> tmp1, so steps can be added or
// removed independently. Final output is an indexed ${chrom}.annot.vcf.gz.
process annotChromosome {
	executor 'ccrt'
	tag "annot ${chrom}"
	cache 'lenient'
	input:
		file ped from pedigree
		file regions_bed from split_bed
		file snpeff from snpeff_done
		file henglibed from hengli_bed
		set chrom,merged,tbi from merged_contig_vcf
	output:
		set chrom,file("${chrom}.annot.vcf.gz"),file("${chrom}.annot.vcf.gz.tbi") into contig_annot_vcf
	script:
	"""
	${modulesLoads()}
	test \\! -f "${workflow.workDir}/STOP"
	awk -F '	' -v C=${chrom} '(\$1==C)' "${regions_bed}" > target.bed
	## add dbsnp ids, VariantType and HomopolymerRun annotations
	java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} --disable_auto_index_creation_and_locking_when_reading_rods -T VariantAnnotator -R ${REF} -V `readlink -f "${merged}"` -A VariantType -A HomopolymerRun --dbsnp "${dbsnp}" -L target.bed -o "tmp2.vcf"
	mv "tmp2.vcf" "tmp1.vcf" && rm -f tmp2.vcf.idx
	## add pedigree-based PossibleDeNovo annotation
	java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} --disable_auto_index_creation_and_locking_when_reading_rods -T VariantAnnotator -R ${REF} -V "tmp1.vcf" -ped ${ped} -A PossibleDeNovo --pedigreeValidationType SILENT -L target.bed -o "tmp2.vcf" && mv tmp2.vcf tmp1.vcf && rm -f tmp2.vcf.idx
	## vcf trio because samtools calls lack GQ (gatk requires it)
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcftrio")} --attribute DENOVO_SAMPLES --pedigree ${ped} tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
	## add bigwig for wgEncodeDukeMapabilityUniqueness35bp (fixed: "-jar" was missing)
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfbigwig")} -T dukeMap35 -B /ccc/work/cont007/fg/fg/biobank/by-name/Homo_sapiens/hg19/annot/encodeDCC/wgEncodeMapability/wgEncodeDukeMapabilityUniqueness35bp.bigWig tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
	## flag sites with average depth below 10
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcffilterjdk")} --filter AVG_DEPTH_LT_10 -e 'return variant.getGenotypes().stream().filter(G->G.isCalled() && G.hasDP()).mapToInt(G->G.getDP()).average().orElse(0)>=10;' tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
	## cadd scores
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfcadd")} -u "/ccc/work/cont007/fg/fg/biobank/by-name/Homo_sapiens/hs37d5/annot/cadd/1.3/whole_genome_SNVs.tsv.gz" tmp1.vcf > tmp2.vcf && mv -v tmp2.vcf tmp1.vcf
	## gnomad membership filters
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfgnomad")} --gnomadFilter IN_GNOMAD --overlapFilter CTG_POS_IN_GNOMAD -m "/ccc/work/cont007/fg0019/lindenbp/packages/broad/gnomad/vcf.manifest" tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## filter on gnomad NFE allele frequency
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcfafinfofilter")} -gtf GNOMAD --filter "GNOMAD_NFE_GT_1_PER_1000" -t "0.001" -nfe tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## functional annotation with snpeff (GRCh37.75)
	java -Xmx3g -Djava.io.tmpdir=. -jar "${snpeff_jar}" eff -config "${snpeff_config}" -dataDir `readlink -f ${snpeff} | sed 's%/done.txt%%' ` -verbose -nodownload -noNextProt -noMotif -noInteraction -noLog -noStats -chr chr -i vcf -o vcf GRCh37.75 tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## sequence ontology: keep protein-altering consequences
	java -Xmx3g -Djava.io.tmpdir=. -jar ${jvarkit("vcffilterso")} --filterout NOT_PROTEIN_ALTERING -A "SO:0001818" -A "SO:0001568" tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## cardiogen gene panel filter
	java -Djava.io.tmpdir=. -jar ${jvarkit("vcfburdenfiltergenes")} --filter NOT_CARDIOGENE -g /ccc/work/cont007/fg0019/lindenbp/CARDIOGENE/20181015.cardiogene.txt tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## genmed cohort AC/AN/AF annotation
	java -Djava.io.tmpdir=. -jar ${jvarkit("vcfpeekvcf")} --prefix GENMED_ --tags 'AC,AN,AF' --tabix /ccc/store/cont007/fg0039/lindenbp/20180323.FRENCHWGS/20180323.FRENCHWGS.REF0002.vcf.gz tmp1.vcf > tmp2.vcf
	mv -v tmp2.vcf tmp1.vcf
	## heng li LCR mask + mapability filter
	java -Xmx3g -Djava.io.tmpdir=. -jar ${gatk} -T VariantFiltration -R ${REF} -L target.bed --disable_auto_index_creation_and_locking_when_reading_rods --filterExpression "dukeMap35 < 1.0" --filterName "DUKEMAP35_LT_1" --mask `readlink -f ${henglibed}` --maskName "HENG_LI_MASK" -V tmp1.vcf -o tmp2.vcf && mv tmp2.vcf tmp1.vcf && rm -f tmp2.vcf.idx
	bgzip -f tmp1.vcf
	bcftools index -f -t tmp1.vcf.gz
	## sanity check: snpeff annotations must be present
	bcftools view tmp1.vcf.gz | grep 'ANN=' -m1
	mv --verbose "tmp1.vcf.gz" "${chrom}.annot.vcf.gz"
	mv --verbose "tmp1.vcf.gz.tbi" "${chrom}.annot.vcf.gz.tbi"
	rm -f target.bed tmp1.vcf tmp1.vcf.idx tmp2.vcf tmp2.vcf.idx
	"""
}
// Order the per-chromosome annotated VCFs by reference (karyotypic) order.
// Joins the "chrom,vcf,tbi" tuples against the .fai index (chrom -> rank),
// sorts by rank, and writes the vcf paths to vcf.list for GatherVcfs.
// The final grep asserts the list is non-empty.
process makeVcfListAnnotContigs {
	tag "making list from ${vcfVector.size()} vcfs"
	cache 'lenient'
	input:
		val vcfVector from contig_annot_vcf.map{L->L[0]+","+L[1]+","+L[2]}.collect()
	output:
		file("vcf.list") into vcf_all_list
	script:
	"""
	set -o pipefail
	awk -F '\\t' '{printf("%s,%d\\n",\$1,NR);}' "${REF}.fai" | LC_ALL=C sort -t, -k1,1 > tmp1.txt
cat << __EOF__ | LC_ALL=C sort -t, -k1,1 > tmp2.txt
${vcfVector.join("\n")}
__EOF__
	LC_ALL=C join -t , -1 1 -2 1 tmp1.txt tmp2.txt | LC_ALL=C sort -t, -k2,2n -k3,3 | cut -d, -f 3 > vcf.list
	grep -m1 '.vcf.gz\$' vcf.list > /dev/null
	"""
}
// Concatenate the chromosome-ordered VCFs into the final deliverable with
// Picard GatherVcfs, verify it is readable, and tabix-index it.
process mergeVcf {
	tag "making list from ${vcflist} "
	cache 'lenient'
	input:
		file vcflist from vcf_all_list
	output:
		set file("${params.prefix}.vcf.gz"),file("${params.prefix}.vcf.gz.tbi") into merged_vcf
	script:
	"""
	${modulesLoads()}
	java -jar ${picard} GatherVcfs I=${vcflist} O=${params.prefix}.vcf.gz
	## sanity check: the merged file must be a parseable VCF
	bcftools view "${params.prefix}.vcf.gz" > /dev/null
	bcftools index -f -t ${params.prefix}.vcf.gz
	"""
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment