Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created March 12, 2014 11:12
Show Gist options
  • Save lindenb/9504877 to your computer and use it in GitHub Desktop.
Save lindenb/9504877 to your computer and use it in GitHub Desktop.
Answer to https://twitter.com/BioMickWatson/status/443694391350525953 : clipped sequences ignored by samtools. Added 8 bases to all reads in 5' and 3': only a few SNPs found.
sam.dir=/commun/data/packages/samtools-0.1.19
bwa=/commun/data/packages/bwa-0.7.6a/bwa
.PHONY:all reads
all: test.vcf
test.vcf: test.bam.bai
${sam.dir}/samtools mpileup -uf ex1.fa test.bam | ${sam.dir}/bcftools/bcftools view -vcg - > $@
test.bam.bai : ex1.fa.bwt f1.fq f2.fq
${bwa} mem ex1.fa f1.fq f2.fq |\
${sam.dir}/samtools view -Sbu - |\
${sam.dir}/samtools sort - test && \
${sam.dir}/samtools index test.bam
ex1.fa.bwt: ex1.fa
${bwa} index $<
f1.fq f2.fq: reads
reads: ex1.fa
${sam.dir}/misc/wgsim -N 2000 $< f1.fq f2.fq > wgsim.vcf
$(foreach F,1 2, cat f${F}.fq | awk '(NR%4==2) {printf("ATGCATGC%sATGCATGC\n",$$0);next;} (NR%4==0) {printf("22222222%s22222222\n",$$0);next;} {print;}' > a ; mv a f${F}.fq ; )
ex1.fa:
curl -o $@ "https://raw.github.com/samtools/samtools/develop/examples/$@"
##fileformat=VCFv4.1
##samtoolsVersion=0.1.19-44428cd
##reference=file://ex1.fa
##contig=<ID=seq1,length=1575>
##contig=<ID=seq2,length=1584>
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads">
##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele frequency (assuming HWE)">
##INFO=<ID=AC1,Number=1,Type=Float,Description="Max-likelihood estimate of the first ALT allele count (no HWE assumption)">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=IS,Number=2,Type=Float,Description="Maximum number of reads supporting an indel and fraction of indel reads">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes for each ALT allele, in the same order as listed">
##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">
##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3">
##INFO=<ID=CLR,Number=1,Type=Integer,Description="Log ratio of genotype likelihoods with and without the constraint">
##INFO=<ID=UGT,Number=1,Type=String,Description="The most probable unconstrained genotype configuration in the trio">
##INFO=<ID=CGT,Number=1,Type=String,Description="The most probable constrained genotype configuration in the trio">
##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.">
##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.">
##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2.">
##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2.">
##INFO=<ID=QBD,Number=1,Type=Float,Description="Quality by Depth: QUAL/#reads">
##INFO=<ID=RPB,Number=1,Type=Float,Description="Read Position Bias">
##INFO=<ID=MDV,Number=1,Type=Integer,Description="Maximum number of high-quality nonRef reads in samples">
##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken
.">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality non-reference bases">
##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="List of Phred-scaled genotype likelihoods">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test.bam
seq1 993 . T A 61 . DP=136;VDB=5.035851e-03;RPB=2.097285e-01;AF1=0.5;AC1=1;DP4=44,29,35,27;MQ=60;FQ=63.5;PV4=0.73,1,1,1
GT:PL:GQ 0/1:91,0,100:93
seq2 303 . TC TCCC 103 . INDEL;IS=3,0.044118;DP=68;VDB=9.692566e-03;AF1=1;AC1=2;DP4=0,0,30,0;MQ=60;FQ=-125 GT:PL:GQ
1/1:144,90,0:99
seq2 304 . C CCA 214 . INDEL;IS=52,0.800000;DP=65;VDB=4.882629e-04;AF1=1;AC1=2;DP4=0,0,46,0;MQ=60;FQ=-173 GT:PL:GQ
1/1:255,138,0:99
seq2 837 . A C 68 . DP=128;VDB=2.587057e-02;RPB=3.599771e-01;AF1=0.5;AC1=1;DP4=41,25,30,32;MQ=60;FQ=67.5;PV4=0.15,1,1,1
GT:PL:GQ 0/1:98,0,97:97
seq2 893 . G C,A 143 . DP=110;VDB=4.352574e-03;RPB=-2.270745e+00;AF1=1;AC1=2;DP4=1,1,47,61;MQ=60;FQ=-282;PV4=1,1,1,1 GT:PL:GQ
1/1:176,255,0,178,255,175:99
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment