Created
January 30, 2014 19:03
-
-
Save cfljam/8716349 to your computer and use it in GitHub Desktop.
Using vcftools for VCF parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Exploring use of vcftools package http://vcftools.sourceforge.net/\n", | |
"\n", | |
"Documentation at http://vcftools.sourceforge.net/docs.html\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Using a partial draft vcf file of single populations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"%cd /Users/johnmccallum/kiwi_data/\n", | |
"!head -n5 Mum.vcf" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"/Users/johnmccallum/kiwi_data\n", | |
"##fileformat=VCFv4.1\r\n", | |
"##fileDate=20140114\r\n", | |
"##source=freeBayes v9.9.2-36-gff98393\r\n", | |
"##reference=../00.data/Kiwifruit_pseudomolecule.fa\r\n", | |
"##phasing=none\r\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!vcftools" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\r\n", | |
"VCFtools (v0.1.11)\r\n", | |
"\u00a9 Adam Auton 2009\r\n", | |
"\r\n", | |
"Process Variant Call Format files\r\n", | |
"\r\n", | |
"For a list of options, please go to:\r\n", | |
"\thttp://vcftools.sourceforge.net/options.html\r\n", | |
"\r\n", | |
"Questions, comments, and suggestions should be emailed to:\r\n", | |
"\tvcftools-help@lists.sourceforge.net\r\n", | |
"\r\n" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Important points\n", | |
"\n", | |
"* can read compressed gzipped and bcf files directly \n", | |
"* can filter on anything\n", | |
"* can exclude regions and sites\n", | |
"\n", | |
"-------------" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Example usage to get \n", | |
"\n", | |
"* variant frequencies of SNPs \n", | |
"* SNP density in 10kb bins\n", | |
"\n", | |
"excluding repeat regions on Chromosome 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!awk '$1==\"Chr1\" {print $1,$4,$5}' Kiwifruit_pseudomolecule_repeat.gff3 > Chr1_repeats.bed" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!vcftools --vcf Mum.vcf --freq --remove-indels --out Chr1_SNP_freq --exclude-bed Chr1_repeats.bed --SNPdensity 10000\n" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\r\n", | |
"VCFtools - v0.1.11\r\n", | |
"(C) Adam Auton 2009\r\n", | |
"\r\n", | |
"Parameters as interpreted:\r\n", | |
"\t--vcf Mum.vcf\r\n", | |
"\t--freq\r\n", | |
"\t--out Chr1_SNP_freq\r\n", | |
"\t--SNPdensity 10000\r\n", | |
"\t--remove-indels\r\n", | |
"\t--exclude-bed Chr1_repeats.bed\r\n", | |
"\r\n", | |
"Reading Index file.\r\n", | |
"File contains 47489 entries and 1 individuals.\r\n", | |
"Applying Required Filters.\r\n", | |
"Filtering sites by allele type\r\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Filtering sites by BED file\r\n", | |
"\tRead 21946 BED file entries.\r\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"After filtering, kept 1 out of 1 Individuals\r\n", | |
"After filtering, kept 31023 out of a possible 47489 Sites\r\n", | |
"Outputting Frequency Statistics...\r\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Outputting SNP density\r\n" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"Mean SNP density: 15.127 variants / kb\r\n", | |
"Run Time = 5.00 seconds\r\n" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!head Chr1_SNP_freq.frq" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"CHROM\tPOS\tN_ALLELES\tN_CHR\t{ALLELE:FREQ}\r\n", | |
"Chr1\t330\t2\t2\tC:0.5\tA:0.5\r\n", | |
"Chr1\t371\t3\t2\tA:0\tC:0.5\tT:0.5\r\n", | |
"Chr1\t415\t2\t2\tG:0.5\tT:0.5\r\n", | |
"Chr1\t443\t3\t2\tA:0\tG:0.5\tT:0.5\r\n", | |
"Chr1\t471\t2\t2\tA:0.5\tG:0.5\r\n", | |
"Chr1\t490\t2\t2\tT:0.5\tC:0.5\r\n", | |
"Chr1\t534\t2\t2\tA:0.5\tT:0.5\r\n", | |
"Chr1\t570\t2\t2\tC:0.5\tT:0.5\r\n", | |
"Chr1\t743\t2\t2\tA:0.5\tG:0.5\r\n" | |
] | |
} | |
], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!head Chr1_SNP_freq.snpden" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"CHROM\tBIN_START\tSNP_COUNT\tVARIANTS/KB\r\n", | |
"Chr1\t0\t131\t13.1\r\n", | |
"Chr1\t10000\t95\t9.5\r\n", | |
"Chr1\t20000\t99\t9.9\r\n", | |
"Chr1\t30000\t89\t8.9\r\n", | |
"Chr1\t40000\t102\t10.2\r\n", | |
"Chr1\t50000\t92\t9.2\r\n", | |
"Chr1\t60000\t53\t5.3\r\n", | |
"Chr1\t70000\t65\t6.5\r\n", | |
"Chr1\t80000\t93\t9.3\r\n" | |
] | |
} | |
], | |
"prompt_number": 17 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Package includes some handy Perl scripts http://vcftools.sourceforge.net/perl_module.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"!vcf-stats Mum.vcf" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"$VAR1 = {\r\n", | |
" 'samples' => {\r\n", | |
" 'unknown' => {\r\n", | |
" 'hom_AA_count' => 6007,\r\n", | |
" 'other_count' => 368,\r\n", | |
" 'indel_count' => 2012,\r\n", | |
" 'indel' => {\r\n", | |
" '6' => 27,\r\n", | |
" '-5' => 49,\r\n", | |
" '3' => 68,\r\n", | |
" '-6' => 44,\r\n", | |
" '-4' => 65,\r\n", | |
" '2' => 160,\r\n", | |
" '1' => 603,\r\n", | |
" '4' => 36,\r\n", | |
" '-2' => 173,\r\n", | |
" '5' => 24,\r\n", | |
" '-1' => 692,\r\n", | |
" '-3' => 106\r\n", | |
" },\r\n", | |
" 'het_RA_count' => 38406,\r\n", | |
" 'snp_count' => 45510,\r\n", | |
" 'count' => 47489,\r\n", | |
" 'ref' => 38406,\r\n", | |
" 'private' => 47489,\r\n", | |
" 'het_AA_count' => 3076,\r\n", | |
" 'other' => 370,\r\n", | |
" 'snp' => {\r\n", | |
" 'N>G' => 9,\r\n", | |
" 'A>C' => 5638,\r\n", | |
" 'C>T' => 7007,\r\n", | |
" 'T>C' => 7539,\r\n", | |
" 'C>A' => 4224,\r\n", | |
" 'A>T' => 9600,\r\n", | |
" 'G>T' => 4227,\r\n", | |
" 'G>C' => 2047,\r\n", | |
" 'N>A' => 23,\r\n", | |
" 'A>G' => 8247,\r\n", | |
" 'T>G' => 5141,\r\n", | |
" 'N>T' => 16,\r\n", | |
" 'T>A' => 8791,\r\n", | |
" 'N>C' => 12,\r\n", | |
" 'C>G' => 2027,\r\n", | |
" 'G>A' => 6923\r\n", | |
" },\r\n", | |
" 'ref_count' => 38406,\r\n", | |
" 'unphased' => 47489\r\n", | |
" }\r\n", | |
" },\r\n", | |
" 'all' => {\r\n", | |
" 'other_count' => 368,\r\n", | |
" 'indel_count' => 2012,\r\n", | |
" 'indel' => {\r\n", | |
" '6' => 27,\r\n", | |
" '-5' => 49,\r\n", | |
" '3' => 68,\r\n", | |
" '-6' => 44,\r\n", | |
" '-4' => 65,\r\n", | |
" '2' => 158,\r\n", | |
" '1' => 603,\r\n", | |
" '4' => 36,\r\n", | |
" '-2' => 173,\r\n", | |
" '5' => 24,\r\n", | |
" '-1' => 692,\r\n", | |
" '-3' => 106\r\n", | |
" },\r\n", | |
" 'snp_count' => 45510,\r\n", | |
" 'count' => 47489,\r\n", | |
" 'nalt_1' => 44415,\r\n", | |
" 'shared' => {\r\n", | |
" '1' => 47489\r\n", | |
" },\r\n", | |
" 'other' => 370,\r\n", | |
" 'nalt_2' => 3074,\r\n", | |
" 'snp' => {\r\n", | |
" 'N>G' => 9,\r\n", | |
" 'A>C' => 5638,\r\n", | |
" 'C>T' => 7007,\r\n", | |
" 'T>C' => 7539,\r\n", | |
" 'C>A' => 4224,\r\n", | |
" 'A>T' => 9600,\r\n", | |
" 'G>T' => 4227,\r\n", | |
" 'G>C' => 2047,\r\n", | |
" 'N>A' => 23,\r\n", | |
" 'A>G' => 8247,\r\n", | |
" 'T>G' => 5141,\r\n", | |
" 'N>T' => 16,\r\n", | |
" 'T>A' => 8791,\r\n", | |
" 'N>C' => 12,\r\n", | |
" 'C>G' => 2027,\r\n", | |
" 'G>A' => 6923\r\n", | |
" }\r\n", | |
" }\r\n", | |
" };\r\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment