Skip to content

Instantly share code, notes, and snippets.

@gkarthik
Created October 21, 2019 23:02
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gkarthik/4006165244b226532fee075b2f54f988 to your computer and use it in GitHub Desktop.
Save gkarthik/4006165244b226532fee075b2f54f988 to your computer and use it in GitHub Desktop.
Parse a VCF file into a pandas DataFrame using PyVCF (including the per-sample FORMAT fields).
import vcf
import os
import pandas as pd
import matplotlib.pyplot as plt
def create_vcf_df(file_name):
    """Parse one VCF file from ``../data`` into a pandas DataFrame.

    Every record in the file becomes one row. Columns, in insertion order:

    * ``POS``, ``REF``, ``ALT`` -- read from the record; only the first ALT
      entry is kept.
    * ``INFO.CSQ.<field>`` -- the first CSQ entry split on ``|`` and paired
      positionally with ``csq_fields`` (extra pieces on either side are
      dropped by ``zip``); fields whose name contains ``.pos`` are cast to
      ``int``.
    * ``INFO.<key>`` -- every other INFO entry, stored as returned by the
      reader.
    * ``s.<format key>`` -- one column per key in the record's FORMAT string.
      The column name does not include the sample, so when a record has
      several samples the last sample's value is the one kept.
    * ``file.name`` -- the ``file_name`` argument.

    Args:
        file_name: Name of the VCF file, relative to ``../data/``.

    Returns:
        pandas.DataFrame with one row per VCF record.

    Raises:
        ValueError: If a CSQ ``*.pos`` field is not an integer literal
            (raised by ``int``).
    """
    csq_fields = ["impact", "aa.pos", "aa.mut", "nuc.pos", "codon.change"]
    data = []
    # The reader pulls records lazily from the handle, so the whole loop must
    # run inside the ``with`` block; the handle is closed on exit even if
    # parsing raises.
    with open("../data/" + file_name) as handle:
        for rec in vcf.Reader(handle):
            row = {
                "POS": rec.POS,
                "REF": rec.REF,
                "ALT": rec.ALT[0],  # Since VCF has only 1 ALT per position
            }
            for key, value in rec.INFO.items():
                if key == "CSQ":
                    # Only the first CSQ annotation is unpacked.
                    for piece, field in zip(value[0].split("|"), csq_fields):
                        if ".pos" in field:
                            piece = int(piece)
                        row["INFO.CSQ." + field] = piece
                else:
                    row["INFO." + key] = value
            for sample in rec.samples:
                for fmt_key in rec.FORMAT.split(":"):
                    row["s." + fmt_key] = sample[fmt_key]
            row["file.name"] = file_name
            data.append(row)
    return pd.DataFrame(data)
# Parse every entry found directly inside ../data (os.listdir does not descend
# into sub-directories) and stack the per-file frames into one DataFrame.
files = os.listdir("../data")
dfs = list(map(create_vcf_df, files))
df = pd.concat(dfs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment