Created
October 21, 2019 23:02
-
-
Save gkarthik/4006165244b226532fee075b2f54f988 to your computer and use it in GitHub Desktop.
Parse VCF files into a pandas DataFrame using PyVCF (including the per-sample FORMAT fields).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import vcf | |
import os | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def create_vcf_df(file_name, data_dir="../data"):
    """Parse one VCF file into a pandas DataFrame with one row per record.

    Each row holds:
      * ``POS``, ``REF`` and ``ALT`` (only the first ALT entry is kept),
      * ``INFO.<key>`` for every INFO key other than ``CSQ``,
      * ``INFO.CSQ.<field>`` for the leading ``|``-separated values of the
        first ``CSQ`` entry, named after ``csq_fields`` below,
      * ``s.<format key>`` for every key of the record's FORMAT string,
      * ``file.name`` with the ``file_name`` argument.

    Args:
        file_name: Name of the VCF file, relative to ``data_dir``.
        data_dir: Directory that contains the file. Defaults to ``"../data"``.

    Returns:
        A ``pandas.DataFrame`` built from the per-record dicts.
    """
    # Names given, in order, to the first "|"-separated values of CSQ.
    # NOTE(review): assumes the annotation tool emits the fields in this order.
    csq_fields = ["impact", "aa.pos", "aa.mut", "nuc.pos", "codon.change"]
    data = []
    # The reader is consumed lazily, so iteration must happen while the file
    # is open; the context manager then closes the handle deterministically.
    with open(os.path.join(data_dir, file_name)) as handle:
        for rec in vcf.Reader(handle):
            row = {
                "POS": rec.POS,
                "REF": rec.REF,
                "ALT": rec.ALT[0],  # Since VCF has only 1 ALT per position
            }
            for key, value in rec.INFO.items():
                if key == "CSQ":
                    # zip() stops at the shorter sequence, so extra CSQ
                    # subfields beyond csq_fields are ignored.
                    for raw, field in zip(value[0].split("|"), csq_fields):
                        if ".pos" in field:
                            # Position fields are numeric; an empty subfield
                            # is kept as a missing value instead of raising.
                            raw = int(raw) if raw else None
                        row["INFO.CSQ." + field] = raw
                else:
                    row["INFO." + key] = value
            # Sample values are flattened into the record's row; with several
            # samples the last one wins, because the keys are not
            # sample-specific.
            for sample in rec.samples:
                for fmt_key in rec.FORMAT.split(":"):
                    row["s." + fmt_key] = sample[fmt_key]
            row["file.name"] = file_name
            data.append(row)
    return pd.DataFrame(data)
# Parse every entry found in ../data and stack the per-file frames into one.
files = os.listdir("../data")
dfs = list(map(create_vcf_df, files))
df = pd.concat(dfs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment