Skip to content

Instantly share code, notes, and snippets.

@gideonite
Forked from slowkow/VCF.py
Last active August 29, 2015 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gideonite/711e54d0e6c1243564f8 to your computer and use it in GitHub Desktop.
Save gideonite/711e54d0e6c1243564f8 to your computer and use it in GitHub Desktop.
VCF.py is a simple module for reading VCF files
"""
VCF.py
Kamil Slowikowski
October 30, 2013
Read VCF files. Works with gzip compressed files and pandas.
Note: This module ignores the genotype columns because
I didn't need them at the time of writing.
Read more about VCF:
http://vcftools.sourceforge.net/specs.html
Usage example:
>>> import VCF
>>> variants = VCF.lines('file.vcf.gz')
>>> print variants.next()['CHROM']
1
Use the generator to avoid loading the entire file into memory:
>>> for v in VCF.lines('file.vcf.gz'):
... print v['REF'], v['ALT']
... break
A T
If your file is not too large, read it directly into a DataFrame:
>>> df = VCF.dataframe('file.vcf.gz')
>>> df.columns
Index([u'CHROM', u'POS', u'ID', u'REF', u'ALT', u'QUAL', u'FILTER',
u'INFO'], dtype=object)
If your file is *very small* and you want to access INFO fields as columns:
>>> df = VCF.dataframe('file.vcf.gz', large=False)
>>> df.columns
Index([u'CHROM', u'POS', u'ID', u'REF', u'ALT', u'QUAL', u'FILTER',
u'GENE_NAME', u'GENE_ID', u'AA_POS', u'AA_CHANGE'], dtype=object)
"""
from collections import OrderedDict
import gzip
import pandas as pd
VCF_HEADER = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
def dataframe(filename, large=True):
"""Open an optionally gzipped VCF file and return a pandas.DataFrame with
each INFO field included as a column in the dataframe.
Note: Using large=False with large VCF files. It will be painfully slow.
:param filename: An optionally gzipped VCF file.
:param large: Use this with large VCF files to skip the ## lines and
leave the INFO fields unseparated as a single column.
"""
if large:
# Set the proper argument if the file is compressed.
comp = 'gzip' if filename.endswith('.gz') else None
# Count how many comment lines should be skipped.
comments = _count_comments(filename)
# Return a simple DataFrame without splitting the INFO column.
return pd.read_table(filename, compression=comp, skiprows=comments,
names=VCF_HEADER, usecols=range(8))
# Each column is a list stored as a value in this dict. The keys for this
# dict are the VCF column names and the keys in the INFO column.
result = OrderedDict()
# Parse each line in the VCF file into a dict.
for i, line in enumerate(lines(filename)):
for key in line.keys():
# This key has not been seen yet, so set it to None for all
# previous lines.
if key not in result:
result[key] = [None] * i
# Ensure this row has some value for each column.
for key in result.keys():
result[key].append(line.get(key, None))
return pd.DataFrame(result)
def lines(filename):
"""Open an optionally gzipped VCF file and generate an OrderedDict for
each line.
"""
fn_open = gzip.open if filename.endswith('.gz') else open
with fn_open(filename) as fh:
for line in fh:
if line.startswith('#'):
continue
else:
yield parse(line)
def parse(line):
"""Parse a single VCF line and return an OrderedDict.
"""
result = OrderedDict()
fields = line.rstrip().split('\t')
# Read the values in the first seven columns.
for i, col in enumerate(VCF_HEADER[:7]):
result[col] = _get_value(fields[i])
# INFO field consists of "key1=value;key2=value;...".
infos = fields[7].split(';')
for i, info in enumerate(infos, 1):
# info should be "key=value".
try:
key, value = info.split('=')
# But sometimes it is just "value", so we'll make our own key.
except ValueError:
key = 'INFO{}'.format(i)
value = info
# Set the value to None if there is no value.
result[key] = _get_value(value)
return result
def _get_value(value):
"""Interpret null values and return ``None``. Return a list if the value
contains a comma.
"""
if not value or value in ['', '.', 'NA']:
return None
if ',' in value:
return value.split(',')
return value
def _count_comments(filename):
"""Count comment lines (those that start with "#") in an optionally
gzipped file.
:param filename: An optionally gzipped file.
"""
comments = 0
fn_open = gzip.open if filename.endswith('.gz') else open
with fn_open(filename) as fh:
for line in fh:
if line.startswith(b'#'):
comments += 1
else:
break
return comments
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment