lindenb/Makefile

## Makefile
sam.dir=/commun/data/packages/samtools-0.1.19
bwa=/commun/data/packages/bwa-0.7.6a/bwa

.PHONY:all reads
all: test.vcf

test.vcf: test.bam.bai
	${sam.dir}/samtools mpileup -uf ex1.fa test.bam | ${sam.dir}/bcftools/bcftools view -vcg -  > $@

test.bam.bai : ex1.fa.bwt f1.fq f2.fq
	${bwa} mem ex1.fa f1.fq f2.fq |\
	${sam.dir}/samtools view -Sbu - |\
	${sam.dir}/samtools sort  - test && \
	${sam.dir}/samtools index test.bam

ex1.fa.bwt: ex1.fa
	${bwa} index $<

f1.fq f2.fq: reads
reads: ex1.fa
	${sam.dir}/misc/wgsim -N 2000 $< f1.fq f2.fq > wgsim.vcf
	$(foreach F,1 2, cat f${F}.fq | awk '(NR%4==2) {printf("ATGCATGC%sATGCATGC\n",$$0);next;} (NR%4==0) {printf("22222222%s22222222\n",$$0);next;} {print;}' > a ; mv a f${F}.fq ; )
ex1.fa:
	curl -o $@ "https://raw.github.com/samtools/samtools/develop/examples/$@"

## test.vcf
##fileformat=VCFv4.1
##samtoolsVersion=0.1.19-44428cd
##reference=file://ex1.fa
##contig=<ID=seq1,length=1575>
##contig=<ID=seq2,length=1584>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads">
##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele frequency (assuming HWE)">
##INFO=<ID=AC1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele count (no HWE assumption)">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=IS,Number=2,Type=Float,Description="Maximum number of reads supporting an indel and fraction of indel reads">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">
##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3">
##INFO=<ID=CLR,Number=1,Type=Integer,Description="Log ratio of genotype likelihoods with and without the constraint">
##INFO=<ID=UGT,Number=1,Type=String,Description="The most probable unconstrained genotype configuration in the trio">
##INFO=<ID=CGT,Number=1,Type=String,Description="The most probable constrained genotype configuration in the trio">
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.">
##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.">
##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2.">
##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2.">
##INFO=<ID=QBD,Number=1,Type=Float,Description="Quality by Depth: QUAL/#reads">
##INFO=<ID=RPB,Number=1,Type=Float,Description="Read Position Bias">
##INFO=<ID=MDV,Number=1,Type=Integer,Description="Maximum number of high-quality nonRef reads in samples">
##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken
.">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality non-reference bases">
##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	test.bam
seq1	993	.	T	A	61	.	DP=136;VDB=5.035851e-03;RPB=2.097285e-01;AF1=0.5;AC1=1;DP4=44,29,35,27;MQ=60;FQ=63.5;PV4=0.73,1,1,1
GT:PL:GQ	0/1:91,0,100:93
seq2	303	.	TC	TCCC	103	.	INDEL;IS=3,0.044118;DP=68;VDB=9.692566e-03;AF1=1;AC1=2;DP4=0,0,30,0;MQ=60;FQ=-125	GT:PL:GQ
1/1:144,90,0:99
seq2	304	.	C	CCA	214	.	INDEL;IS=52,0.800000;DP=65;VDB=4.882629e-04;AF1=1;AC1=2;DP4=0,0,46,0;MQ=60;FQ=-173	GT:PL:GQ
1/1:255,138,0:99
seq2	837	.	A	C	68	.	DP=128;VDB=2.587057e-02;RPB=3.599771e-01;AF1=0.5;AC1=1;DP4=41,25,30,32;MQ=60;FQ=67.5;PV4=0.15,1,1,1
GT:PL:GQ	0/1:98,0,97:97
seq2	893	.	G	C,A	143	.	DP=110;VDB=4.352574e-03;RPB=-2.270745e+00;AF1=1;AC1=2;DP4=1,1,47,61;MQ=60;FQ=-282;PV4=1,1,1,1	GT:PL:GQ
	1/1:176,255,0,178,255,175:99
	sam.dir=/commun/data/packages/samtools-0.1.19
	bwa=/commun/data/packages/bwa-0.7.6a/bwa

	.PHONY:all reads
	all: test.vcf

	test.vcf: test.bam.bai
	${sam.dir}/samtools mpileup -uf ex1.fa test.bam \| ${sam.dir}/bcftools/bcftools view -vcg - > $@

	test.bam.bai : ex1.fa.bwt f1.fq f2.fq
	${bwa} mem ex1.fa f1.fq f2.fq \|\
	${sam.dir}/samtools view -Sbu - \|\
	${sam.dir}/samtools sort - test && \
	${sam.dir}/samtools index test.bam

	ex1.fa.bwt: ex1.fa
	${bwa} index $<

	f1.fq f2.fq: reads
	reads: ex1.fa
	${sam.dir}/misc/wgsim -N 2000 $< f1.fq f2.fq > wgsim.vcf
	$(foreach F,1 2, cat f${F}.fq \| awk '(NR%4==2) {printf("ATGCATGC%sATGCATGC\n",$$0);next;} (NR%4==0) {printf("22222222%s22222222\n",$$0);next;} {print;}' > a ; mv a f${F}.fq ; )
	ex1.fa:
	curl -o $@ "https://raw.github.com/samtools/samtools/develop/examples/$@"
	##fileformat=VCFv4.1
	##samtoolsVersion=0.1.19-44428cd
	##reference=file://ex1.fa
	##contig=<ID=seq1,length=1575>
	##contig=<ID=seq2,length=1584>
	##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
	##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
	##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads">
	##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
	##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele frequency (assuming HWE)">
	##INFO=<ID=AC1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele count (no HWE assumption)">
	##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
	##INFO=<ID=IS,Number=2,Type=Float,Description="Maximum number of reads supporting an indel and fraction of indel reads">
	##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
	##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">
	##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3">
	##INFO=<ID=CLR,Number=1,Type=Integer,Description="Log ratio of genotype likelihoods with and without the constraint">
	##INFO=<ID=UGT,Number=1,Type=String,Description="The most probable unconstrained genotype configuration in the trio">
	##INFO=<ID=CGT,Number=1,Type=String,Description="The most probable constrained genotype configuration in the trio">
	##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
	##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
	##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.">
	##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.">
	##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2.">
	##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2.">
	##INFO=<ID=QBD,Number=1,Type=Float,Description="Quality by Depth: QUAL/#reads">
	##INFO=<ID=RPB,Number=1,Type=Float,Description="Read Position Bias">
	##INFO=<ID=MDV,Number=1,Type=Integer,Description="Maximum number of high-quality nonRef reads in samples">
	##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken
	.">
	##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
	##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
	##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
	##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases">
	##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality non-reference bases">
	##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
	##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
	#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test.bam
	seq1 993 . T A 61 . DP=136;VDB=5.035851e-03;RPB=2.097285e-01;AF1=0.5;AC1=1;DP4=44,29,35,27;MQ=60;FQ=63.5;PV4=0.73,1,1,1
	GT:PL:GQ 0/1:91,0,100:93
	seq2 303 . TC TCCC 103 . INDEL;IS=3,0.044118;DP=68;VDB=9.692566e-03;AF1=1;AC1=2;DP4=0,0,30,0;MQ=60;FQ=-125 GT:PL:GQ
	1/1:144,90,0:99
	seq2 304 . C CCA 214 . INDEL;IS=52,0.800000;DP=65;VDB=4.882629e-04;AF1=1;AC1=2;DP4=0,0,46,0;MQ=60;FQ=-173 GT:PL:GQ
	1/1:255,138,0:99
	seq2 837 . A C 68 . DP=128;VDB=2.587057e-02;RPB=3.599771e-01;AF1=0.5;AC1=1;DP4=41,25,30,32;MQ=60;FQ=67.5;PV4=0.15,1,1,1
	GT:PL:GQ 0/1:98,0,97:97
	seq2 893 . G C,A 143 . DP=110;VDB=4.352574e-03;RPB=-2.270745e+00;AF1=1;AC1=2;DP4=1,1,47,61;MQ=60;FQ=-282;PV4=1,1,1,1 GT:PL:GQ
	1/1:176,255,0,178,255,175:99