Skip to content

Instantly share code, notes, and snippets.

@cfljam
Created January 28, 2014 17:41
Show Gist options
  • Save cfljam/8672400 to your computer and use it in GitHub Desktop.
Save cfljam/8672400 to your computer and use it in GitHub Desktop.
Using PyVcf to parse vcf format variant data
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"* working with filtered vcf from Freebayes\n",
"* Using PyVcf as described in http://pyvcf.readthedocs.org/en/latest/index.html"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"git clone https://github.com/jamescasbon/PyVCF.git\n",
"sudo python setup.py install"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import vcf\n",
"vcf_reader = vcf.Reader(open('Mum.vcf','r'))\n",
"for x in range(0,6):\n",
" print vcf_reader.next()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Record(CHROM=Chr1, POS=330, REF=C, ALT=[A])\n",
"Record(CHROM=Chr1, POS=371, REF=A, ALT=[C, T])\n",
"Record(CHROM=Chr1, POS=415, REF=G, ALT=[T])\n",
"Record(CHROM=Chr1, POS=443, REF=A, ALT=[G, T])\n",
"Record(CHROM=Chr1, POS=471, REF=A, ALT=[G])\n",
"Record(CHROM=Chr1, POS=490, REF=T, ALT=[C])\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec=vcf_reader.next()\n",
"dir(test_rec)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 19,
"text": [
"['ALT',\n",
" 'CHROM',\n",
" 'FILTER',\n",
" 'FORMAT',\n",
" 'ID',\n",
" 'INFO',\n",
" 'POS',\n",
" 'QUAL',\n",
" 'REF',\n",
" '__class__',\n",
" '__cmp__',\n",
" '__delattr__',\n",
" '__dict__',\n",
" '__doc__',\n",
" '__eq__',\n",
" '__format__',\n",
" '__getattribute__',\n",
" '__hash__',\n",
" '__init__',\n",
" '__iter__',\n",
" '__lt__',\n",
" '__module__',\n",
" '__new__',\n",
" '__reduce__',\n",
" '__reduce_ex__',\n",
" '__repr__',\n",
" '__setattr__',\n",
" '__sizeof__',\n",
" '__str__',\n",
" '__subclasshook__',\n",
" '__weakref__',\n",
" '_sample_indexes',\n",
" 'aaf',\n",
" 'add_filter',\n",
" 'add_format',\n",
" 'add_info',\n",
" 'alleles',\n",
" 'call_rate',\n",
" 'end',\n",
" 'genotype',\n",
" 'get_hets',\n",
" 'get_hom_alts',\n",
" 'get_hom_refs',\n",
" 'get_unknowns',\n",
" 'is_deletion',\n",
" 'is_indel',\n",
" 'is_monomorphic',\n",
" 'is_snp',\n",
" 'is_sv',\n",
" 'is_sv_precise',\n",
" 'is_transition',\n",
" 'nucl_diversity',\n",
" 'num_called',\n",
" 'num_het',\n",
" 'num_hom_alt',\n",
" 'num_hom_ref',\n",
" 'num_unknown',\n",
" 'samples',\n",
" 'start',\n",
" 'sv_end',\n",
" 'var_subtype',\n",
" 'var_type']"
]
}
],
"prompt_number": 19
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec.var_type"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 37,
"text": [
"'indel'"
]
}
],
"prompt_number": 37
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec.alleles"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 40,
"text": [
"['TGACA', TGAAG]"
]
}
],
"prompt_number": 40
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Accessing genotype calls"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec.samples"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 32,
"text": [
"[Call(sample=unknown, CallData(GT=0/1, DP=63, RO=31, QR=1063, AO=19, QA=532, GL=[-10.0, 0.0, -10.0]))]"
]
}
],
"prompt_number": 32
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec.genotype('unknown')['GT']"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 34,
"text": [
"'0/1'"
]
}
],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"test_rec.genotype('unknown').gt_bases"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 47,
"text": [
"'TGACA/TGAAG'"
]
}
],
"prompt_number": 47
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Useful utility provided to **melt** vcf data to long format."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!head -n 60 Mum.vcf > test.vcf\n",
"!vcf_melt < test.vcf"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"SAMPLE\tGT\tGQ\tGL\tDP\tRO\tQR\tAO\tQA\tFILTER\tCHROM\tPOS\tREF\tALT\tID\tinfo.NS\tinfo.DP\tinfo.DPB\tinfo.AC\tinfo.AN\tinfo.AF\tinfo.RO\tinfo.AO\tinfo.PRO\tinfo.PAO\tinfo.QR\tinfo.QA\tinfo.PQR\tinfo.PQA\tinfo.SRF\tinfo.SRR\tinfo.SAF\tinfo.SAR\tinfo.SRP\tinfo.SAP\tinfo.AB\tinfo.ABP\tinfo.RUN\tinfo.RPP\tinfo.RPPR\tinfo.EPP\tinfo.EPPR\tinfo.DPRA\tinfo.ODDS\tinfo.GTI\tinfo.TYPE\tinfo.CIGAR\tinfo.NUMALT\tinfo.MEANALT\tinfo.LEN\tinfo.MQM\tinfo.MQMR\tinfo.PAIRED\tinfo.PAIREDR\r",
"\r\n",
"unknown\t0/1\t\t-10.0,0.0,-10.0\t43\t22\t796\t21\t710\t.\tChr1\t330\tC\t[A]\t\t1\t43\t43.0\t1\t2\t0.5\t22\t21\t0.0\t0.0\t796\t710\t0.0\t0.0\t12\t10\t10\t11\t3.40511\t3.1137\t0.488372\t3.0608\t1\t5.59539\t3.40511\t5.59539\t3.40511\t0.0\t108.845\t0\tsnp\t1X\t1\t1.0\t1\t191.571\t195.091\t0.952381\t0.954545\r",
"\r\n",
"unknown\t1/2\t\t-10.0,-10.0,-10.0,-10.0,0.0,-10.0\t40\t0\t0\t22,18\t728,546\t.\tChr1\t371\tA\t[C, T]\t\t1\t40\t40.0\t1,1\t2\t0.5,0.5\t0\t22,18\t0.0\t0.0,0.0\t0\t728,546\t0.0\t0.0,0.0\t0\t0\t10,10\t12,8\t0.0\t3.40511,3.49285\t0.55,0.45\t3.87889,3.87889\t1,1\t4.58955,3.49285\t0.0\t4.58955,3.49285\t0.0\t0.0,0.0\t79.5374\t0\tsnp,snp\t1X,1X\t2\t2.0,2.0\t1,1\t189.0,194.056\t0.0\t0.954545,1.0\t0.0\r",
"\r\n",
"unknown\t0/1\t\t-10.0,0.0,-10.0\t48\t25\t902\t23\t866\t.\tChr1\t415\tG\t[T]\t\t1\t48\t48.0\t1\t2\t0.5\t25\t23\t0.0\t0.0\t902\t866\t0.0\t0.0\t13\t12\t14\t9\t3.09716\t5.3706\t0.479167\t3.19126\t1\t7.63648\t17.6895\t5.3706\t5.18177\t0.0\t137.849\t0\tsnp\t1X\t1\t1.0\t1\t189.696\t195.16\t0.956522\t1.0\r",
"\r\n",
"unknown\t1/2\t\t-10.0,-10.0,-10.0,-10.0,0.0,-10.0\t52\t0\t0\t27,25\t989,793\t.\tChr1\t443\tA\t[G, T]\t\t1\t52\t52.0\t1,1\t2\t0.5,0.5\t0\t27,25\t0.0\t0.0,0.0\t0\t989,793\t0.0\t0.0,0.0\t0\t0\t15,15\t12,10\t0.0\t3.73412,5.18177\t0.519231,0.480769\t3.17734,3.17734\t1,1\t3.09072,3.09716\t0.0\t5.02092,3.79203\t0.0\t0.0,0.0\t120.51\t0\tsnp,snp\t1X,1X\t2\t2.0,2.0\t1,1\t195.593,189.64\t0.0\t1.0,0.96\t0.0\r",
"\r\n",
"unknown\t0/1\t\t-10.0,0.0,-10.0\t55\t30\t1039\t25\t808\t.\tChr1\t471\tA\t[G]\t\t1\t55\t55.0\t1\t2\t0.5\t30\t25\t0.0\t0.0\t1039\t808\t0.0\t0.0\t16\t14\t14\t11\t3.29983\t3.79203\t0.454545\t3.99733\t1\t7.26639\t7.64277\t3.09716\t4.16842\t0.0\t121.081\t0\tsnp\t1X\t1\t1.0\t1\t185.68\t196.1\t0.96\t1.0\r",
"\r\n"
]
}
],
"prompt_number": 45
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment