Created
March 12, 2014 11:12
-
-
Save lindenb/9504877 to your computer and use it in GitHub Desktop.
Answer to https://twitter.com/BioMickWatson/status/443694391350525953 : clipped sequences ignored by samtools. Added 8 bases to all reads in 5' and 3': only a few SNPs found.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sam.dir=/commun/data/packages/samtools-0.1.19 | |
bwa=/commun/data/packages/bwa-0.7.6a/bwa | |
.PHONY:all reads | |
all: test.vcf | |
test.vcf: test.bam.bai | |
${sam.dir}/samtools mpileup -uf ex1.fa test.bam | ${sam.dir}/bcftools/bcftools view -vcg - > $@ | |
test.bam.bai : ex1.fa.bwt f1.fq f2.fq | |
${bwa} mem ex1.fa f1.fq f2.fq |\ | |
${sam.dir}/samtools view -Sbu - |\ | |
${sam.dir}/samtools sort - test && \ | |
${sam.dir}/samtools index test.bam | |
ex1.fa.bwt: ex1.fa | |
${bwa} index $< | |
f1.fq f2.fq: reads | |
reads: ex1.fa | |
${sam.dir}/misc/wgsim -N 2000 $< f1.fq f2.fq > wgsim.vcf | |
$(foreach F,1 2, cat f${F}.fq | awk '(NR%4==2) {printf("ATGCATGC%sATGCATGC\n",$$0);next;} (NR%4==0) {printf("22222222%s22222222\n",$$0);next;} {print;}' > a ; mv a f${F}.fq ; ) | |
ex1.fa: | |
curl -o $@ "https://raw.github.com/samtools/samtools/develop/examples/$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##fileformat=VCFv4.1 | |
##samtoolsVersion=0.1.19-44428cd | |
##reference=file://ex1.fa | |
##contig=<ID=seq1,length=1575> | |
##contig=<ID=seq2,length=1584> | |
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth"> | |
##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"> | |
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads"> | |
##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same"> | |
##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele frequency (assuming HWE)"> | |
##INFO=<ID=AC1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele count (no HWE assumption)"> | |
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> | |
##INFO=<ID=IS,Number=2,Type=Float,Description="Maximum number of reads supporting an indel and fraction of indel reads"> | |
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed"> | |
##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies"> | |
##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3"> | |
##INFO=<ID=CLR,Number=1,Type=Integer,Description="Log ratio of genotype likelihoods with and without the constraint"> | |
##INFO=<ID=UGT,Number=1,Type=String,Description="The most probable unconstrained genotype configuration in the trio"> | |
##INFO=<ID=CGT,Number=1,Type=String,Description="The most probable constrained genotype configuration in the trio"> | |
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias"> | |
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL."> | |
##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2."> | |
##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples."> | |
##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2."> | |
##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2."> | |
##INFO=<ID=QBD,Number=1,Type=Float,Description="Quality by Depth: QUAL/#reads"> | |
##INFO=<ID=RPB,Number=1,Type=Float,Description="Read Position Bias"> | |
##INFO=<ID=MDV,Number=1,Type=Integer,Description="Maximum number of high-quality nonRef reads in samples"> | |
##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken | |
."> | |
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | |
##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)"> | |
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases"> | |
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality non-reference bases"> | |
##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value"> | |
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods"> | |
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test.bam | |
seq1 993 . T A 61 . DP=136;VDB=5.035851e-03;RPB=2.097285e-01;AF1=0.5;AC1=1;DP4=44,29,35,27;MQ=60;FQ=63.5;PV4=0.73,1,1,1 | |
GT:PL:GQ 0/1:91,0,100:93 | |
seq2 303 . TC TCCC 103 . INDEL;IS=3,0.044118;DP=68;VDB=9.692566e-03;AF1=1;AC1=2;DP4=0,0,30,0;MQ=60;FQ=-125 GT:PL:GQ | |
1/1:144,90,0:99 | |
seq2 304 . C CCA 214 . INDEL;IS=52,0.800000;DP=65;VDB=4.882629e-04;AF1=1;AC1=2;DP4=0,0,46,0;MQ=60;FQ=-173 GT:PL:GQ | |
1/1:255,138,0:99 | |
seq2 837 . A C 68 . DP=128;VDB=2.587057e-02;RPB=3.599771e-01;AF1=0.5;AC1=1;DP4=41,25,30,32;MQ=60;FQ=67.5;PV4=0.15,1,1,1 | |
GT:PL:GQ 0/1:98,0,97:97 | |
seq2 893 . G C,A 143 . DP=110;VDB=4.352574e-03;RPB=-2.270745e+00;AF1=1;AC1=2;DP4=1,1,47,61;MQ=60;FQ=-282;PV4=1,1,1,1 GT:PL:GQ | |
1/1:176,255,0,178,255,175:99 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment