@nilesh-tawari
Last active July 29, 2019 06:55
Convert a VEP-annotated VCF file to a pandas DataFrame
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 5 14:21:42 2018
@author: nilesh-tawari
email: tawari.nilesh@gmail.com
GitHub: https://github.com/nilesh-tawari
"""
from __future__ import print_function
import os
import argparse
import numpy as np
import pandas as pd
from collections import OrderedDict
import gzip
'''
Convert a VEP-annotated VCF file into pandas dataframes.
usage: python vepvcf_to_pandas.py -v test.vcf
'''
# Parameters
parser = argparse.ArgumentParser(description='Parse VCF output from VEP '
                                             'to generate pandas dataframes')
parser.add_argument('-v', '--vcf', required=True, help='Input VEP-annotated VCF file name')
args = parser.parse_args()
vep_anno_vcf = os.path.abspath(args.vcf)
def calc_af(row, AD, AO, DP):
    '''
    Calculate allele frequency (AF) from the AO and DP fields, or from the
    AD field when it is present.
    '''
    try:
        if row[AD] == '-':
            # no AD field: AF = alternate-allele observations / total depth
            val = float(row[AO]) / float(row[DP])
        else:
            # AD field: AF = alt count / (ref count + alt count)
            ref_count = float(row[AD].split(',')[0])
            alt_count = float(row[AD].split(',')[1])
            val = alt_count / (ref_count + alt_count)
    except (IndexError, ValueError, ZeroDivisionError):
        # malformed or missing counts, or zero depth
        val = '.'
    return val
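# Worked example (hypothetical values, not from any real VCF): for
# row = {'S1:AD': '10,5', 'S1:AO': '-', 'S1:DP': '-'},
# calc_af(row, 'S1:AD', 'S1:AO', 'S1:DP') returns 5 / (10 + 5) = 0.333...;
# when AD is '-', the AO/DP ratio is used instead.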
def vcf_to_df(vcf_file, remove_cols=False):
    '''
    file -> df
    Opens the VCF file, collects the comment lines, the VCF column names and
    the VEP (CSQ) field names, and loads the records into a pandas dataframe.
    The FORMAT column is split into one column per sample.
    '''
    if vcf_file.endswith('.gz'):
        vcf_file_read = gzip.open(vcf_file, 'r')
    else:
        vcf_file_read = open(vcf_file, 'r')
    vcf_comments = []
    vep_columns = ''
    for line in vcf_file_read:
        if vcf_file.endswith('.gz'):
            line = line.decode()
        if line.startswith('#'):
            vcf_comments.append(line)
            if line.startswith('##') and 'Format:' in line and 'CSQ' in line:
                # VEP header: '##INFO=<ID=CSQ,...Format: field1|field2|...">'
                vep_columns = line[:-3].split('Format:')[1].strip().split('|')
            elif line.startswith('#CHROM'):
                vcf_columns = line.strip().strip('#').split('\t')
    vcf_file_read.close()
    sample_ids = vcf_columns[9:]
    df = pd.read_csv(vcf_file, sep='\t', comment='#', chunksize=1000,
                     low_memory=False, names=vcf_columns, iterator=True)
    df = pd.concat(list(df), ignore_index=True)
    for sample in sample_ids:
        # split the per-sample FORMAT values into separate columns
        df_info = pd.DataFrame(
            df.apply(lambda x: OrderedDict(zip(x['FORMAT'].split(':'),
                                               x[sample].split(':'))), axis=1).tolist(),
            index=df.index)
        # calculate AF
        AD, AO, DP = sample + ':AD', sample + ':AO', sample + ':DP'
        df_info = df_info.add_prefix(sample + ':')
        df_info[sample + ':calcAF'] = df_info.apply(calc_af, args=(AD, AO, DP), axis=1)
        df = pd.merge(left=df, right=df_info, left_index=True,
                      right_index=True, how='outer')
    df.fillna(value='-', inplace=True)
    if remove_cols:
        df.drop(sample_ids + ['FORMAT'], inplace=True, axis=1, errors='ignore')
    # expand the key=value pairs from the INFO column into separate columns
    info = df.INFO.str.extractall('([^;]+)=([^;]+)')
    info = pd.Series(info.values[:, 1], [info.index.get_level_values(0),
                                         info.values[:, 0]]).unstack()
    df = pd.merge(left=df, right=info, left_index=True,
                  right_index=True, how='outer', suffixes=['', 'tag'])
    if remove_cols:
        df.drop(['INFO'], inplace=True, axis=1, errors='ignore')
    df['variant_ID'] = df.CHROM.astype(str) + '_' + df.POS.astype(str) + '_' + \
        df.REF.astype(str) + '/' + df.ALT.astype(str)
    return vcf_comments, vep_columns, vcf_columns, sample_ids, df
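# Example (hypothetical record): with FORMAT 'GT:AD:DP' and a sample value of
# '0/1:10,5:15' for a sample named 'S1', vcf_to_df adds the columns
# 'S1:GT', 'S1:AD', 'S1:DP' and 'S1:calcAF' for that record.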
def expand_csq(df, vep_columns, sort_pick=True):
    '''
    df -> df
    Expand the VEP consequence (CSQ) annotation into one row per transcript.
    '''
    df_tr = pd.DataFrame(df.CSQ.str.split(',', expand=True).stack().str.split('|', expand=True))
    df_tr.columns = vep_columns
    if 'gnomADg' in vep_columns:
        df_tr['MAX_AF_genomADg'] = df_tr[[col for col in vep_columns if 'gnomADg_' in col or col == 'MAX_AF']].apply(pd.to_numeric, errors='coerce').max(axis=1)
        df_tr['MAX_AF_genomADg_pop'] = df_tr[[col for col in vep_columns if 'gnomADg_' in col or col == 'MAX_AF']].apply(pd.to_numeric, errors='coerce').replace('', np.nan).idxmax(axis=1)
    if 'PICK' in vep_columns and sort_pick:
        df_tr['PICK'].replace('-', 0, inplace=True)
        df_tr['PICK'] = df_tr['PICK'].apply(pd.to_numeric)
        # within each variant, put the picked transcript (PICK == 1) first so it
        # is the row kept when duplicated indices are dropped below
        df_tr = df_tr.sort_values('PICK', ascending=False) \
                     .sort_index(level=0, sort_remaining=False, kind='mergesort')
        df_tr['PICK'].replace({0: '-', 1: '1'}, inplace=True)
    df_tr.index = df_tr.index.droplevel(-1)
    df_tr.replace('', '-', inplace=True)
    df_tr.fillna(value='-', inplace=True)
    df_tr['ID'] = df_tr.index
    df_all_trans = pd.merge(left=df, right=df_tr, left_index=True,
                            right_on='ID', how='outer', suffixes=['', '_vep'])
    df_all_trans.drop(['ID_vep', 'CSQ'], inplace=True, axis=1, errors='ignore')
    df_single_trans = df_all_trans[~df_all_trans.index.duplicated()]
    df_snv = df_single_trans.loc[df_single_trans['VARIANT_CLASS'] == 'SNV']
    df_indel = df_single_trans.loc[~(df_single_trans['VARIANT_CLASS'] == 'SNV')]
    return df_single_trans, df_all_trans, df_snv, df_indel
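# Example (hypothetical annotation): a CSQ value with two comma-separated
# transcript annotations, e.g. 'T|missense_variant|...|1,T|intron_variant|...|0',
# becomes two rows in df_all_trans, with the '|'-separated fields mapped onto
# vep_columns; df_single_trans keeps only the first (picked) row per variant.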
vcf_comments, vep_columns, vcf_columns, sample_ids, df = vcf_to_df(vep_anno_vcf, remove_cols=True)
df_single_trans, df_all_trans, df_snv, df_indel = expand_csq(df, vep_columns, sort_pick=True)
## The following dataframes are produced:
# df: unexpanded VCF columns
# df_single_trans: fully expanded VCF, one (picked) transcript per variant
# df_all_trans: fully expanded VCF, all transcripts
# df_snv: subset of df_single_trans containing SNVs
# df_indel: subset of df_single_trans containing indels
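# Minimal usage sketch (the output file names below are arbitrary examples,
# not part of the original script):
# df_single_trans.to_csv('variants_single_transcript.tsv', sep='\t', index=False)
# df_all_trans.to_csv('variants_all_transcripts.tsv', sep='\t', index=False)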
@xiucz commented Jul 29, 2019

When I want to parse an ANNOVAR-annotated VCF using lines 85, 86:

info = pd.Series(info.values[:, 1], [info.index.get_level_values(0) \
                    , info.values[:, 0]]).unstack()

I got stuck here.

ValueError: Index contains duplicate entries, cannot reshape
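This error typically means the same INFO key appears more than once in a single record (which ANNOVAR output can produce), so the (row, key) index built before unstack() contains duplicate entries. A possible workaround, as a sketch and not part of the original gist, assuming the duplicates really are repeated keys within one record, is to keep only the first value for each (row, key) pair before reshaping:

    info = df.INFO.str.extractall('([^;]+)=([^;]+)')
    info = pd.Series(info.values[:, 1], [info.index.get_level_values(0),
                                         info.values[:, 0]])
    # keep only the first value for each (record, key) pair so unstack() succeeds
    info = info[~info.index.duplicated(keep='first')].unstack()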
