Skip to content

Instantly share code, notes, and snippets.

@mbreese
Created February 21, 2018 14:31
Show Gist options
  • Save mbreese/03acfa65b72f68cafa05df41671ebfb9 to your computer and use it in GitHub Desktop.
Save mbreese/03acfa65b72f68cafa05df41671ebfb9 to your computer and use it in GitHub Desktop.
Given a VCF file annotated with VEP (and expanded), calculate the worst consequence, gene, SIFT, PolyPhen, etc...
#!/usr/bin/env python
import sys
import itertools
cons_key = "VEP_Consequence"
impact_key = "VEP_IMPACT"
gene_key = "VEP_SYMBOL"
csn_key = "VEP_CSN"
sift_key = "VEP_SIFT"
polyphen_key = "VEP_PolyPhen"
new_cons_key = "VEP_Worst_Consequence"
new_impact_key = "VEP_Worst_Impact"
new_gene_key = "VEP_Worst_Gene"
new_csn_key = "VEP_Worst_CSN"
new_sift_key = "VEP_Worst_SIFT"
new_polyphen_key = "VEP_Worst_PolyPhen"
impact_ranked = ['HIGH', 'MODERATE','LOW','MODIFIER']
consequences = '''1 transcript_ablation
3 splice_acceptor_variant
3 splice_donor_variant
4 stop_gained
5 frameshift_variant
6 stop_lost
7 start_lost
8 transcript_amplification
10 inframe_insertion
11 inframe_deletion
12 missense_variant
12 protein_altering_variant
13 splice_region_variant
14 incomplete_terminal_codon_variant
15 start_retained_variant
15 stop_retained_variant
15 synonymous_variant
16 coding_sequence_variant
17 mature_miRNA_variant
18 5_prime_UTR_variant
19 3_prime_UTR_variant
20 non_coding_transcript_exon_variant
21 intron_variant
22 NMD_transcript_variant
23 non_coding_transcript_variant
24 upstream_gene_variant
25 downstream_gene_variant
26 TFBS_ablation
28 TFBS_amplification
30 TF_binding_site_variant
31 regulatory_region_ablation
33 regulatory_region_amplification
36 feature_elongation
36 regulatory_region_variant
37 feature_truncation
38 intergenic_variant
39 sequence_variant'''
cons_ranked = [x[1] for x in [y.split(' ') for y in consequences.split('\n')]]
def parse_pos(line):
cols = line.strip().split('\t')
outcols = cols[:7]
info = cols[7]
info_vals = info.split(';')
info_out = []
genes = []
csns = []
gene_idx = -1
for ival in info_vals:
if '=' not in ival:
info_out.append(ival)
else:
info_out.append(ival)
left, right = ival.split('=')
if left == gene_key:
genes = right.split(',')
elif left == csn_key:
csns = right.split(',')
elif left == polyphen_key:
worst = ''
worst_val = 0
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
for s in spl:
s2 = s.replace('(', ' ').replace(')','')
if s2:
name, val = s2.split(' ')
val = float(val)
if val > worst_val:
worst = name
worst_val = val
if worst:
info_out.append('%s=%s' % (new_polyphen_key, worst))
elif left == sift_key:
worst = ''
worst_val = 1
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
for s in spl:
s2 = s.replace('(', ' ').replace(')','')
if s2:
name, val = s2.split(' ')
val = float(val)
if val < worst_val:
worst = name
worst_val = val
if worst:
info_out.append('%s=%s' % (new_sift_key, worst))
elif left == impact_key:
worst = ''
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
for c in impact_ranked:
if c in spl:
worst = c
break
if worst:
info_out.append('%s=%s' % (new_impact_key, worst))
elif left == cons_key:
worst = ''
spl = list(itertools.chain(*[x.split('&') for x in right.split(',')]))
for c in cons_ranked:
if c in spl:
worst = c
break
if worst:
info_out.append('%s=%s' % (new_cons_key, worst))
for i,spl in enumerate(right.split(',')) :
for spl2 in spl.split('&'):
if worst in spl2:
gene_idx = i
if genes and gene_idx > -1:
if len(genes) > 1:
info_out.append('%s=%s' % (new_gene_key, genes[gene_idx]))
else:
info_out.append('%s=%s' % (new_gene_key, genes[0]))
if csns and gene_idx > -1:
if len(csns) > 1:
info_out.append('%s=%s' % (new_csn_key, csns[gene_idx]))
else:
info_out.append('%s=%s' % (new_csn_key, genes[0]))
outcols.append(';'.join(info_out))
outcols.extend(cols[8:])
return outcols
def parse_main(fin=sys.stdin, fout=sys.stdout):
printed_header = False
for line in fin:
if line[0] == '#' and line[1] == '#':
fout.write(line)
elif line[0] == '#':
if not printed_header:
printed_header = True
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Consequence">\n' % (new_cons_key))
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Impact">\n' % (new_impact_key))
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst Gene">\n' % (new_gene_key))
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst SIFT">\n' % (new_sift_key))
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst PolyPhen">\n' % (new_polyphen_key))
fout.write('##INFO=<ID=%s,Number=1,Type=String,Description="VEP Worst CSN">\n' % (new_csn_key))
fout.write(line)
else:
fout.write('%s\n' % '\t'.join(parse_pos(line)))
if __name__ == '__main__':
parse_main()
@mbreese
Copy link
Author

mbreese commented Jan 7, 2022

Note: this doesn't work right... it needs to split the consequences and process each value separately for each transcript (these are comma separated)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment