Skip to content

Instantly share code, notes, and snippets.

@jimhavrilla
Last active August 29, 2015 13:58
Show Gist options
  • Save jimhavrilla/10414821 to your computer and use it in GitHub Desktop.
Save jimhavrilla/10414821 to your computer and use it in GitHub Desktop.
ovarian-cancer-analysis

First found somatic mutations in patient database using Gemini and Python with varying dosages of carboplatin. Moved on to finding frequency profile for each patient's SNPs in Python and plotting them individually and as a group in R.

#!/usr/bin/env python
import sys
import numpy as np
from gemini import GeminiQuery
from gemini.gemini_constants import *
gq = GeminiQuery(sys.argv[1])
patient_samples = \
[('s02-N-2379-AJ-0030','s02-T-2379-AJ-0001','s02-R-2379-AJ-0002'),
('s03-N-2379-AJ-0031','s03-T-2379-AJ-0003','s03-R-2379-AJ-0004'),
('s04-N-2379-AJ-0032','s04-T-2379-AJ-0005','s04-R-2379-AJ-0006'),
('s05-N-2379-AJ-0007','s05-T-2379-AJ-0008','s05-R-2379-AJ-0009'),
('s09-N-2379-AJ-0033','s09-T-2379-AJ-0010','s09-R-2379-AJ-0011'),
('s10-N-2379-AJ-0034','s10-T-2379-AJ-0012','s10-R-2379-AJ-0013'),
('s11-N-2379-AJ-0035','s11-T-2379-AJ-0014','s11-R-2379-AJ-0015'),
('s12-N-2379-AJ-0036','s12-T-2379-AJ-0016','s12-R-2379-AJ-0017'),
('s15-N-2379-AJ-0037','s15-T-2379-AJ-0018','s15-R-2379-AJ-0019'),
('s17-N-2379-AJ-0038','s17-T-2379-AJ-0020','s17-R-2379-AJ-0021'),
('s18-N-2379-AJ-0039','s18-T-2379-AJ-0022','s18-R-2379-AJ-0023'),
('s19-N-2379-AJ-0040','s19-T-2379-AJ-0024','s19-R-2379-AJ-0025'),
('s20-N-2379-AJ-0041','s20-T-2379-AJ-0026','s20-R-2379-AJ-0027'),
('s21-N-2379-AJ-0042','s21-T-2379-AJ-0028','s21-R-2379-AJ-0029')]
normal_samples = \
['s02-N-2379-AJ-0030',
's03-N-2379-AJ-0031',
's04-N-2379-AJ-0032',
's05-N-2379-AJ-0007',
's09-N-2379-AJ-0033',
's10-N-2379-AJ-0034',
's11-N-2379-AJ-0035',
's12-N-2379-AJ-0036',
's15-N-2379-AJ-0037',
's17-N-2379-AJ-0038',
's18-N-2379-AJ-0039',
's19-N-2379-AJ-0040',
's20-N-2379-AJ-0041',
's21-N-2379-AJ-0042']
query = "SELECT chrom, start, end, ref, alt, type, sub_type, qual, gene, rs_ids,\
impact, impact_severity, num_het, num_hom_alt, \
gt_ref_depths, gt_alt_depths \
FROM variants \
WHERE on_probes=1 \
AND call_rate >= 0.95 \
AND aaf_1kg_all is NULL \
AND aaf_esp_all is NULL \
AND qual >= 5"
def in_normals(gt_types, smp2idx):
for normal in normal_samples:
norm_idx = smp2idx[normal]
if gt_types[norm_idx] != HOM_REF and gt_types[norm_idx] != UNKNOWN:
return True
return False
smp2idx = gq.sample_to_idx
idx2smp = gq.idx_to_sample
gq.run(query, show_variant_samples=True)
print '\t'.join(['sample', 'chrom', 'start', 'end', 'qual', 'ref', \
'alt', 'gene', 'type', 'sub_type', 'rs_ids', \
'impact', 'impact_severity', 'num_het', 'ref_norm', \
'ref_tum', 'ref_resist', 'alt_norm', 'alt_tum', 'alt_resist'])
for row in gq:
gt_types = row['gt_types']
gt_depths = row['gt_depths']
gt_ref_depths = row['gt_ref_depths']
gt_alt_depths = row['gt_alt_depths']
if in_normals(gt_types, smp2idx):
continue
for patient in patient_samples:
normal = patient[0]
tumor = patient[1]
resis = patient[2]
normal_idx = smp2idx[normal]
tumor_idx = smp2idx[tumor]
resis_idx = smp2idx[resis]
if gt_types[normal_idx] == HOM_REF and \
((gt_types[tumor_idx] != HOM_REF and gt_types[tumor_idx] != UNKNOWN) or (gt_types[resis_idx] != HOM_REF and gt_types[resis_idx] != UNKNOWN)) and \
gt_alt_depths[normal_idx] == 0 and \
(gt_alt_depths[tumor_idx] > 1 or gt_alt_depths[resis_idx] > 1) and \
gt_depths[normal_idx] >= 20 and \
gt_depths[tumor_idx] >= 20 and \
gt_depths[resis_idx] >= 20:
print '\t'.join(str(s) for s in [patient[0][0:3], row['chrom'], row['start'], row['end'], row['qual'], \
row['ref'], row['alt'], row['gene'], row['type'], row['sub_type'], row['rs_ids'], row['impact'], row['impact_severity'], \
row['num_het'],gt_ref_depths[normal_idx], gt_ref_depths[tumor_idx], gt_ref_depths[resis_idx], \
gt_alt_depths[normal_idx], gt_alt_depths[tumor_idx], gt_alt_depths[resis_idx]])
#snpfreq.py by Jim Havrilla @ quinlanlab @ UVA
#gets snp frequency distribution plots for samples individually (patients) and altogether
#table
#A-C > T-G
#A-G > T-C
#A-T > T-A
#C-A > C-A
#C-G > C-G
#C-T > C-T
#G-A > C-T
#G-C > C-G
#G-T > C-A
#T-A > T-A
#T-C > T-C
#T-G > T-G
import fileinput
import sys
import rpy2.robjects as ro
from rpy2.robjects import r
from collections import Counter
from rpy2.robjects.packages import importr
gd=importr('grDevices')
l=[]
#take in file
for line in fileinput.input("mutations.txt"):
foo=line.rstrip().split("\t")
if foo[8] == "snp":
if foo[5]=="A" and foo[6]=="C":
foo=(foo[0],"T->G")
elif foo[5]=="A" and foo[6]=="G":
foo=(foo[0],"T->C")
elif foo[5]=="A" and foo[6]=="T":
foo=(foo[0],"T->A")
elif foo[5]=="G" and foo[6]=="A":
foo=(foo[0],"C->T")
elif foo[5]=="G" and foo[6]=="C":
foo=(foo[0],"C->G")
elif foo[5]=="G" and foo[6]=="T":
foo=(foo[0],"C->A")
else:
foo=(foo[0],foo[5]+"->"+foo[6])
l.append(foo)
#sort by sample (patient)
l.sort()
#get counts for each sample (patient) and plot
m=[]
new=l[0][0]
for (t1,t2) in l:
if t1!=new:
c=Counter(m)
gd.jpeg("plot"+new+".jpeg")
r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer - "+new,legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
gd.dev_off()
new=t1
m=[]
m.append(t2)
#plot from R for all samples (patients)
c=Counter([t2 for (t1,t2) in l])
gd.jpeg("allplot.jpeg")
r.barplot(ro.IntVector(c.values()),names=c.keys(),main="Mutational signatures in human cancer",legend=c.keys(),col=r.c("darkblue","red","black","yellow","green","purple"),xlab="snp type",ylab="number of snps")
gd.dev_off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment