Created
January 28, 2014 17:41
-
-
Save cfljam/8672400 to your computer and use it in GitHub Desktop.
Using PyVcf to parse vcf format variant data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"* working with filtered vcf from Freebayes\n", | |
"* Using PyVcf as described in http://pyvcf.readthedocs.org/en/latest/index.html" | |
] | |
}, | |
{ | |
"cell_type": "raw", | |
"metadata": {}, | |
"source": [ | |
"git clone https://github.com/jamescasbon/PyVCF.git\n", | |
"sudo python setup.py install" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import vcf\n", | |
"vcf_reader = vcf.Reader(open('Mum.vcf','r'))\n", | |
"for x in range(0,6):\n", | |
" print vcf_reader.next()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Record(CHROM=Chr1, POS=330, REF=C, ALT=[A])\n", | |
"Record(CHROM=Chr1, POS=371, REF=A, ALT=[C, T])\n", | |
"Record(CHROM=Chr1, POS=415, REF=G, ALT=[T])\n", | |
"Record(CHROM=Chr1, POS=443, REF=A, ALT=[G, T])\n", | |
"Record(CHROM=Chr1, POS=471, REF=A, ALT=[G])\n", | |
"Record(CHROM=Chr1, POS=490, REF=T, ALT=[C])\n" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec=vcf_reader.next()\n", | |
"dir(test_rec)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 19, | |
"text": [ | |
"['ALT',\n", | |
" 'CHROM',\n", | |
" 'FILTER',\n", | |
" 'FORMAT',\n", | |
" 'ID',\n", | |
" 'INFO',\n", | |
" 'POS',\n", | |
" 'QUAL',\n", | |
" 'REF',\n", | |
" '__class__',\n", | |
" '__cmp__',\n", | |
" '__delattr__',\n", | |
" '__dict__',\n", | |
" '__doc__',\n", | |
" '__eq__',\n", | |
" '__format__',\n", | |
" '__getattribute__',\n", | |
" '__hash__',\n", | |
" '__init__',\n", | |
" '__iter__',\n", | |
" '__lt__',\n", | |
" '__module__',\n", | |
" '__new__',\n", | |
" '__reduce__',\n", | |
" '__reduce_ex__',\n", | |
" '__repr__',\n", | |
" '__setattr__',\n", | |
" '__sizeof__',\n", | |
" '__str__',\n", | |
" '__subclasshook__',\n", | |
" '__weakref__',\n", | |
" '_sample_indexes',\n", | |
" 'aaf',\n", | |
" 'add_filter',\n", | |
" 'add_format',\n", | |
" 'add_info',\n", | |
" 'alleles',\n", | |
" 'call_rate',\n", | |
" 'end',\n", | |
" 'genotype',\n", | |
" 'get_hets',\n", | |
" 'get_hom_alts',\n", | |
" 'get_hom_refs',\n", | |
" 'get_unknowns',\n", | |
" 'is_deletion',\n", | |
" 'is_indel',\n", | |
" 'is_monomorphic',\n", | |
" 'is_snp',\n", | |
" 'is_sv',\n", | |
" 'is_sv_precise',\n", | |
" 'is_transition',\n", | |
" 'nucl_diversity',\n", | |
" 'num_called',\n", | |
" 'num_het',\n", | |
" 'num_hom_alt',\n", | |
" 'num_hom_ref',\n", | |
" 'num_unknown',\n", | |
" 'samples',\n", | |
" 'start',\n", | |
" 'sv_end',\n", | |
" 'var_subtype',\n", | |
" 'var_type']" | |
] | |
} | |
], | |
"prompt_number": 19 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec.var_type" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 37, | |
"text": [ | |
"'indel'" | |
] | |
} | |
], | |
"prompt_number": 37 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec.alleles" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 40, | |
"text": [ | |
"['TGACA', TGAAG]" | |
] | |
} | |
], | |
"prompt_number": 40 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Accessing genotype calls" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec.samples" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 32, | |
"text": [ | |
"[Call(sample=unknown, CallData(GT=0/1, DP=63, RO=31, QR=1063, AO=19, QA=532, GL=[-10.0, 0.0, -10.0]))]" | |
] | |
} | |
], | |
"prompt_number": 32 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec.genotype('unknown')['GT']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 34, | |
"text": [ | |
"'0/1'" | |
] | |
} | |
], | |
"prompt_number": 34 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"test_rec.genotype('unknown').gt_bases" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 47, | |
"text": [ | |
"'TGACA/TGAAG'" | |
] | |
} | |
], | |
"prompt_number": 47 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Useful utility provided to **melt** vcf data to long format." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!head -n 60 Mum.vcf > test.vcf\n", | |
"!vcf_melt < test.vcf" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"SAMPLE\tGT\tGQ\tGL\tDP\tRO\tQR\tAO\tQA\tFILTER\tCHROM\tPOS\tREF\tALT\tID\tinfo.NS\tinfo.DP\tinfo.DPB\tinfo.AC\tinfo.AN\tinfo.AF\tinfo.RO\tinfo.AO\tinfo.PRO\tinfo.PAO\tinfo.QR\tinfo.QA\tinfo.PQR\tinfo.PQA\tinfo.SRF\tinfo.SRR\tinfo.SAF\tinfo.SAR\tinfo.SRP\tinfo.SAP\tinfo.AB\tinfo.ABP\tinfo.RUN\tinfo.RPP\tinfo.RPPR\tinfo.EPP\tinfo.EPPR\tinfo.DPRA\tinfo.ODDS\tinfo.GTI\tinfo.TYPE\tinfo.CIGAR\tinfo.NUMALT\tinfo.MEANALT\tinfo.LEN\tinfo.MQM\tinfo.MQMR\tinfo.PAIRED\tinfo.PAIREDR\r", | |
"\r\n", | |
"unknown\t0/1\t\t-10.0,0.0,-10.0\t43\t22\t796\t21\t710\t.\tChr1\t330\tC\t[A]\t\t1\t43\t43.0\t1\t2\t0.5\t22\t21\t0.0\t0.0\t796\t710\t0.0\t0.0\t12\t10\t10\t11\t3.40511\t3.1137\t0.488372\t3.0608\t1\t5.59539\t3.40511\t5.59539\t3.40511\t0.0\t108.845\t0\tsnp\t1X\t1\t1.0\t1\t191.571\t195.091\t0.952381\t0.954545\r", | |
"\r\n", | |
"unknown\t1/2\t\t-10.0,-10.0,-10.0,-10.0,0.0,-10.0\t40\t0\t0\t22,18\t728,546\t.\tChr1\t371\tA\t[C, T]\t\t1\t40\t40.0\t1,1\t2\t0.5,0.5\t0\t22,18\t0.0\t0.0,0.0\t0\t728,546\t0.0\t0.0,0.0\t0\t0\t10,10\t12,8\t0.0\t3.40511,3.49285\t0.55,0.45\t3.87889,3.87889\t1,1\t4.58955,3.49285\t0.0\t4.58955,3.49285\t0.0\t0.0,0.0\t79.5374\t0\tsnp,snp\t1X,1X\t2\t2.0,2.0\t1,1\t189.0,194.056\t0.0\t0.954545,1.0\t0.0\r", | |
"\r\n", | |
"unknown\t0/1\t\t-10.0,0.0,-10.0\t48\t25\t902\t23\t866\t.\tChr1\t415\tG\t[T]\t\t1\t48\t48.0\t1\t2\t0.5\t25\t23\t0.0\t0.0\t902\t866\t0.0\t0.0\t13\t12\t14\t9\t3.09716\t5.3706\t0.479167\t3.19126\t1\t7.63648\t17.6895\t5.3706\t5.18177\t0.0\t137.849\t0\tsnp\t1X\t1\t1.0\t1\t189.696\t195.16\t0.956522\t1.0\r", | |
"\r\n", | |
"unknown\t1/2\t\t-10.0,-10.0,-10.0,-10.0,0.0,-10.0\t52\t0\t0\t27,25\t989,793\t.\tChr1\t443\tA\t[G, T]\t\t1\t52\t52.0\t1,1\t2\t0.5,0.5\t0\t27,25\t0.0\t0.0,0.0\t0\t989,793\t0.0\t0.0,0.0\t0\t0\t15,15\t12,10\t0.0\t3.73412,5.18177\t0.519231,0.480769\t3.17734,3.17734\t1,1\t3.09072,3.09716\t0.0\t5.02092,3.79203\t0.0\t0.0,0.0\t120.51\t0\tsnp,snp\t1X,1X\t2\t2.0,2.0\t1,1\t195.593,189.64\t0.0\t1.0,0.96\t0.0\r", | |
"\r\n", | |
"unknown\t0/1\t\t-10.0,0.0,-10.0\t55\t30\t1039\t25\t808\t.\tChr1\t471\tA\t[G]\t\t1\t55\t55.0\t1\t2\t0.5\t30\t25\t0.0\t0.0\t1039\t808\t0.0\t0.0\t16\t14\t14\t11\t3.29983\t3.79203\t0.454545\t3.99733\t1\t7.26639\t7.64277\t3.09716\t4.16842\t0.0\t121.081\t0\tsnp\t1X\t1\t1.0\t1\t185.68\t196.1\t0.96\t1.0\r", | |
"\r\n" | |
] | |
} | |
], | |
"prompt_number": 45 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment