Skip to content

Instantly share code, notes, and snippets.

@cplaisier
Forked from anutua/oncoMerge.py
Last active August 31, 2020 21:55
Show Gist options
  • Save cplaisier/868752f6d17ca0eaf4f3614d9309ba10 to your computer and use it in GitHub Desktop.
Save cplaisier/868752f6d17ca0eaf4f3614d9309ba10 to your computer and use it in GitHub Desktop.
##########################################################
## OncoMerge: oncoMerge.py ##
## ______ ______ __ __ ##
## /\ __ \ /\ ___\ /\ \/\ \ ##
## \ \ __ \ \ \___ \ \ \ \_\ \ ##
## \ \_\ \_\ \/\_____\ \ \_____\ ##
## \/_/\/_/ \/_____/ \/_____/ ##
## @Developed by: Plaisier Lab ##
## (https://plaisierlab.engineering.asu.edu/) ##
## Arizona State University ##
## 242 ISTB1, 550 E Orange St ##
## Tempe, AZ 85281 ##
## @github: https://github.com/plaisier-lab/OncoMerge ##
## @Author: Chris Plaisier, Rob Schultz ##
## @License: GNU GPLv3 ##
## ##
## If this program is used in your analysis please ##
## mention who built it. Thanks. :-) ##
##########################################################
#####################
## Import packages ##
#####################
import re
import json
import argparse
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests # pip install statsmodels==0.10.0
import itertools
import sys
import os
from scipy.stats import hypergeom
from tqdm import tqdm
#import mygene
#mg = mygene.MyGeneInfo() # pull in function to map genes
##################################
## Read in command line options ##
##################################
parser = argparse.ArgumentParser(description='OncoMerge merges patient Protein Affecting Mutations (PAMs) and Copy Number Alterations (CNAs) into a unified mutation matrix.')
parser.add_argument('-cf', '--config_path', help='Path to JSON encoded configuration file, overrides command line parameters', type = str)
parser.add_argument('-gp', '--gistic_path', help='Path to GISTIC output folder', type = str)
parser.add_argument('-dp', '--del_path', help='Path to GISTIC deletion file (default = del_genes.conf_99.txt)', type = str, default = 'del_genes.conf_99.txt')
parser.add_argument('-ap', '--amp_path', help='Path to the GISTIC amplification file (default = amp_genes.conf_99.txt)', type = str, default = 'amp_genes.conf_99.txt')
parser.add_argument('-gdp', '--gene_data_path', help='Path to the GISTIC gene data file (default = all_data_by_genes.txt)', type = str, default = 'all_data_by_genes.txt')
parser.add_argument('-cfp', '--conversion_file_path', help='Supply hand annotated file to convert gene symbols to Entrez IDs (default does not import hand annotated conversion file and instead uses conversion embedded in GISTIC output files).', type = str)
parser.add_argument('-ln', '--label_name', help='Label for Entrez ID column in GISTIC gene data file (default = \'Gene ID\')', type = str, default = 'Gene ID')
parser.add_argument('-tp', '--thresh_path', help='Path to the GISTIC all_thresholded file (default = all_thresholded.by_genes.txt)', type = str, default = 'all_thresholded.by_genes.txt')
parser.add_argument('-smp', '--som_mut_path', help='Path to the somatic mutations file (CSV matrix where columns are patients and genes are rows) [0 = not mutated, and 1 = mutated]', type = str)
parser.add_argument('-mscv', '--mutsig2cv_path', help='Path to a MutSig2CV output file', type = str)
parser.add_argument('-op', '--output_path', help='Path you would like to output OncoMerged files (default = current directory)', type = str, default = '.')
parser.add_argument('-mmf', '--min_mut_freq', help='Minimum frequency of mutation (range = 0-1; default = 0.05)', type = float, default = 0.05)
parser.add_argument('-pq', '--perm_qv', help='Permuted p-value FDR BH corrected cutoff (default = 0.1)', type = float, default = 0.1)
parser.add_argument('-sp', '--save_permutation', help='Run and save out permutation analysis to be used for comparability in another OncoMerge run (default off)', action='store_true')
parser.add_argument('-lp', '--load_permutation', help='Do not run permutation anlaysis and load permutation anlaysis from previous run (default off)', type = str, default = None)
parser.add_argument('-mlg', '--min_loci_genes', help='Minimum number of genes in loci to apply maximum final frequency filter (default = 10)', type = int, default = 10)
parser.add_argument('-mpf', '--min_pam_freq', help='Minimum PAM frequency (default = 0.01)', type = float, default = 0.01)
args = parser.parse_args()
#######################
## Define parameters ##
#######################
params = args.__dict__
if args.config_path:
with open(args.config_path, "r") as cfg:
tmp = json.loads(cfg.read())
for i in tmp:
params[i] = tmp[i]
if (not params['gistic_path']) or (not params['som_mut_path']) or (not params['mutsig2cv_path']) or (params['save_permutation'] and not params['load_permutation']==None):
parser.print_help()
sys.exit(1)
##################
## Load up data ##
##################
print('Loading data...')
# Create conversion series for gene symbol to Entrez ID
if not params['conversion_file_path']:
n1 = pd.read_csv(params['gistic_path']+'/'+params['gene_data_path'],index_col=0,sep='\t',usecols=[0,1])
n1.index = [i.split('|')[0] for i in n1.index]
n1 = n1.drop_duplicates()
n1 = n1[params['label_name']]
else:
n1 = pd.read_csv(params['conversion_file_path'],index_col=0)[params['label_name']].apply(int)
# load up significantly mutated genes
mutSig2CV = pd.read_csv(params['mutsig2cv_path'],index_col=1)
mutSig2CV = mutSig2CV.loc[mutSig2CV.index.map(lambda x: x in n1.index)]
mutSig2CV.index = mutSig2CV.index.map(lambda x: n1.loc[x])
mutSig2CV = mutSig2CV.loc[~mutSig2CV.index.duplicated(keep='first')]
sigPAMs = list(mutSig2CV.index[mutSig2CV['q']<=0.1]) # Changed to 0.1 based on Lawerence et al., Nature 2013 (PMID = 23770567)
# Get list of significantly CNA amplified genes
ampLoci = {}
ampLoci_qv = {}
amp1 = pd.read_csv(params['gistic_path']+'/'+params['amp_path'],index_col=0,sep='\t')
for col1 in amp1.columns:
if float(amp1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1=='Y'):
ampLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(amp1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
ampLoci_qv[col1] = float(amp1[col1]['residual q value'])
# Get list of significantly CNA deleted genes
delLoci = {}
delLoci_qv = {}
del1 = pd.read_csv(params['gistic_path']+'/'+params['del_path'],index_col=0,sep='\t')
for col1 in del1.columns:
if float(del1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1=='Y'):
delLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(del1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
delLoci_qv[col1] = float(del1[col1]['residual q value'])
# Set up background gene numbers for gold-standard enrichment analysis
backgrounds = {'Activating':[], 'LossOfFunction':[], 'Aggregate':[]}
actTmp = [gene for locus in ampLoci for gene in ampLoci[locus]]
backgrounds['Activating'] += actTmp
backgrounds['Aggregate'] += actTmp
delTmp = [gene for locus in delLoci for gene in delLoci[locus]]
backgrounds['LossOfFunction'] += delTmp
backgrounds['Aggregate'] += delTmp
backgrounds['Aggregate'] += sigPAMs
#Load go pathway
godf=pd.read_csv(params['gopath'],sep=',', index_col=0, dtype=object, header=None)
keggdf=pd.read_csv(params['keggpath'],sep=',', index_col=0, dtype=object, header=None)
keggdf = keggdf.iloc[1:]
print("go pathway")
print(godf)
print("go pathway done1")
godict_int=dict(zip(godf.index,[[int(j) for j in godf.loc[i][2].split(' ')] for i in godf.index]))
kegg_int=dict(zip(keggdf.index,[[int(j) for j in keggdf.loc[i][2].split(' ')] for i in keggdf.index]))
#godict_int = {key: value for key, value in godict_int.items() if key.isnumeric()}
#godict_string=(zip(godf.index,[godf.loc[i][2].split(' ') for i in godf.index]))
# Load up enrichment sets
"""enrichmentSets = {}
if 'enrichment_sets' in params:
for set1 in params['enrichment_sets']:
enSet1 = pd.read_csv(params['enrichment_sets'][set1], header=0)
enrichmentSets[set1] = {}
enrichmentSets[set1]['Aggregate'] = {'Activating': [], 'LossOfFunction': []}
for cancer in set(enSet1['Cancer']):
enrichmentSets[set1][cancer] = {}
for type1 in ['Activating', 'LossOfFunction']:
enrichmentSets[set1][cancer][type1] = list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
enrichmentSets[set1]['Aggregate'][type1] += list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
"""
# Load up somatically mutated genes
somMuts = pd.read_csv(params['som_mut_path'],index_col=0,header=0)
if not somMuts.index.dtype=='int64':
somMuts = somMuts.loc[somMuts.index.map(lambda x: x in n1.index)]
somMuts.index = somMuts.index.map(lambda x: n1.loc[x])
somMuts = somMuts.loc[~somMuts.index.duplicated(keep='first')]
# Read in gistic all_data_by_genes file
with open(params['gistic_path']+'/'+params['thresh_path'],'r') as inFile:
tmp = inFile.readline().strip().split('\t')
numCols1 = len(tmp)
d1 = pd.read_csv(params['gistic_path']+'/'+params['thresh_path'],index_col=0,sep='\t').drop(tmp[1], axis = 1)
d1.columns = [i[:12] for i in d1.columns]
d1 = d1.loc[d1.index.map(lambda x: x.split('|')[0] in n1.index)]
d1.index = d1.index.map(lambda x: n1.loc[x.split('|')[0]])
d1.index.name = 'Locus ID'
# Removing sex chromosomes (issues in CNA analysis) from d1
lociThresh = d1['Cytoband']
include = []
for i in lociThresh:
if not(i[0]=='X' or i[0]=='Y'):
include.append(True)
else:
include.append(False)
d1 = d1.loc[lociThresh[include].index].drop('Cytoband', axis = 1)
# Make sure somMuts and gistic have same samples
somMuts = somMuts[list(set(d1.columns).intersection(somMuts.columns))]
d1 = d1[list(set(d1.columns).intersection(somMuts.columns))]
# Get rid of duplicated rows
d1 = d1[~d1.index.duplicated(keep='first')]
## Fill out summary matrix
summaryMatrix = pd.DataFrame(index= list(set([gene for locus in ampLoci.values() for gene in locus] + [gene for locus in delLoci.values() for gene in locus] + list(somMuts.index))), columns = ['Symbol', 'PAM_freq', 'MutSig2CV_qvalue', 'CNA_freq', 'CNA_locus', 'CNA_type', 'GISTIC_residual_q_value', 'Act_freq', 'LoF_freq', 'OM_type_selected', 'OM_empirical_p_value', 'OM_empirical_q_value', 'Genes_in_locus', 'Final_mutation_type', 'Final_freq', 'Delta_over_PAM'])
# Add gene symbols
toMatch = [i for i in summaryMatrix.index if i in n1.values]
summaryMatrix.loc[toMatch,'Symbol'] = [n1.index[n1==i][0] for i in toMatch]
# Add MutSig2CV q-values
toMatch = [i for i in summaryMatrix.index if i in mutSig2CV.index]
summaryMatrix.loc[toMatch,'MutSig2CV_qvalue'] = mutSig2CV.loc[toMatch,'q']
# Print out some useful information
print('\tSize of CNA matrix: '+str(d1.shape))
print('\tSize of somatic mutation matrix: '+str(somMuts.shape))
print('Finished loading data.')
# Make the output directory if it doesn't exists already
if not os.path.exists(params['output_path']):
os.mkdir(params['output_path'])
########################
## Begin onocoMerging ##
########################
# Cutoff somatic mutations based on the minimum mutation frequency (mf)
freq1 = somMuts.sum(axis=1)/len(list(somMuts.columns))
somMutPoint = freq1[freq1>=params['min_mut_freq']].index
tmp1 = [i for i in summaryMatrix.index if i in freq1.index]
summaryMatrix.loc[tmp1, 'PAM_freq'] = freq1.loc[tmp1]
# Precompute positive and negative dichotomized matrices
print('Precomputing dichotomized matrices...')
posdicot = (lambda x: 1 if x>=2 else 0)
posD1 = d1.applymap(posdicot)
posFreq = posD1.mean(axis=1)
ampGenes = {j:i for i in ampLoci for j in ampLoci[i]}
negdicot = (lambda x: 1 if x<=(-2) else 0)
negD1 = d1.applymap(negdicot)
negFreq = negD1.mean(axis=1)
delGenes = {j:i for i in delLoci for j in delLoci[i]}
print('Finished precomputing dichotomized matrices.')
# Add back genes that are high frequency amplification or deletion (20%)
for gene1 in list([i for i in posD1.index if posFreq[i]>=0.2]):
if (not gene1 in ampGenes) and isinstance(lociThresh[gene1], str) and (lociThresh[gene1] in ampLoci_qv) and (gene1 in posD1.index):
ampGenes[gene1] = lociThresh[gene1]
ampLoci[lociThresh[gene1]].append(gene1)
#print('Pos', gene1, posFreq[gene1], lociThresh[gene1])
for gene1 in list([i for i in negD1.index if negFreq[i]>=0.2]):
if (not gene1 in delGenes) and isinstance(lociThresh[gene1], str) and (lociThresh[gene1] in delLoci_qv) and (gene1 in negD1.index):
delGenes[gene1] = lociThresh[gene1]
delLoci[lociThresh[gene1]].append(gene1)
#print('Neg', gene1, negFreq[gene1], lociThresh[gene1])
print('Generating CNA summaryMatrix data...')
somMuts1 = list(set(list(ampGenes.keys())+list(delGenes.keys())).intersection(list(d1.index)))
delGenesSet = set(delGenes.keys())
ampGenesSet = set(ampGenes.keys())
bothGenesSet = (ampGenesSet.intersection(delGenesSet)).intersection(somMuts1)
onlyDel = delGenesSet.difference(ampGenesSet).intersection(somMuts1)
onlyAmp = ampGenesSet.difference(delGenesSet).intersection(somMuts1)
with tqdm(total=len(bothGenesSet)+len(onlyDel)+len(onlyAmp)) as pbar:
for both1 in bothGenesSet:
# Conflict!
# Choose amplification if is higher frequency
if posFreq[both1]>negFreq[both1]:
summaryMatrix.loc[both1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Amp', ampGenes[both1], ampLoci_qv[ampGenes[both1]], posFreq[both1]]
# Otherwise choose deletion (deletion becomse default for equal frequency; unlikely to happen)
else:
summaryMatrix.loc[both1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Del', delGenes[both1], delLoci_qv[delGenes[both1]], negFreq[both1]]
pbar.update(1)
# Straight up deletion
for del1 in onlyDel:
summaryMatrix.loc[del1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Del', delGenes[del1], delLoci_qv[delGenes[del1]], negFreq[del1]]
pbar.update(1)
# Straight up amplification
for amp1 in onlyAmp:
summaryMatrix.loc[amp1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Amp', ampGenes[amp1], ampLoci_qv[ampGenes[amp1]], posFreq[amp1]]
pbar.update(1)
print('Done.')
#intersect gene lists
#pos_intersect = pd.merge(posD1, somMuts, how='inner', right_index=True, left_index=True indicator=True])
#neg_intersect = pd.merge(negD1, somMuts, how='inner', on=['user_id'])
#see negD1
newgodict= {}
for x in godict_int.keys():
tmp1=list(set(godict_int[x]).intersection(negD1.index))
if len(tmp1) > 2:
newgodict[x]=tmp1
aggnegD1=pd.DataFrame(columns=negD1.columns)
for x in newgodict.keys():
aggnegD1.loc[x] = negD1.loc[newgodict[x]].max(axis=0)
negD1 = pd.concat([negD1,aggnegD1])
#somMuts
newgodict_sm= {}
for x in godict_int.keys():
tmp1=list(set(godict_int[x]).intersection(somMuts.index))
if len(tmp1) > 2:
newgodict_sm[x]=tmp1
aggsomMuts = pd.DataFrame(columns=somMuts.columns)
for x in newgodict_sm.keys():
aggsomMuts.loc[x] = somMuts.loc[newgodict_sm[x]].max(axis=0)
somMuts = pd.concat([somMuts,aggsomMuts])
#posD1
newgodictpos= {}
for x in godict_int.keys():
tmp1=list(set(godict_int[x]).intersection(posD1.index))
if len(tmp1) > 2:
newgodictpos[x]=tmp1
aggposD1 = pd.DataFrame(columns=posD1.columns)
for x in newgodictpos.keys():
aggposD1.loc[x] = posD1.loc[newgodictpos[x]].max(axis=0)
posD1 = pd.concat([posD1,aggposD1])
# Merge loci for deletions and amplifications
lociCNAgenes = {}
lociCNA = pd.DataFrame(columns=d1.columns)
print('Bundling amplification loci...')
for loci1 in ampLoci:
# Get matrix of CNAs for genes in loci
dt = posD1.loc[ampLoci[loci1]]
# Get unique rows
dedup = dt.drop_duplicates(keep='first')
# Get genes which match and add to output dictionaries
for i in range(len(dedup.index)):
cnaName = loci1+'_'+str(i)+'_CNAamp'
lociCNA.loc[cnaName] = dedup.iloc[i]
lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]
print('Bundling deletion loci...')
for loci1 in delLoci:
# Get matrix of CNAs for genes in loci
dt = negD1.loc[delLoci[loci1]]
# Get unique rows
dedup = dt.drop_duplicates(keep='first')
# Get genes which match and add to output dictionaries
for i in range(len(dedup.index)):
cnaName = loci1+'_'+str(i)+'_CNAdel'
lociCNA.loc[cnaName] = dedup.iloc[i]
lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]
# Make combined matrix
# LoF = deletions + somatic point mutations
# Act = amplifications + somatic point mutations
print('Starting somatic mutations...')
pamLofAct = {}
freq = {}
for s1 in somMutPoint:
if s1>0:
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
tmpSom = somMuts.loc[s1]
# If potential PAM, store PAM
if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
if (s1 in negD1.index and s1 in posD1.index):
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
else:
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}
###
print('Starting pathways...')
for s1 in set(godict_int.keys()).intersection(somMuts.index):
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
tmpSom = somMuts.loc[s1]
# If potential PAM, store PAM
if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
if (s1 in negD1.index and s1 in posD1.index):
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
else:
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}
print('Starting amplifications...')
for loci1 in ampLoci:
for s1 in list(set(ampLoci[loci1]).intersection(somMuts.index)):
if s1>0:
# If potential Act
if s1 in somMuts.index and s1 in posD1.index:
tmpSom = somMuts.loc[s1]
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
# Store Act
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not (str(s1)+'_Act' in pamLofAct[str(s1)] or tmpAct.equals(tmpSom) or tmpAct.equals(tmpPos)):
pamLofAct[str(s1)][str(s1)+'_Act'] = tmpAct
pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos
for s1 in set(ampLoci[loci1]).difference(somMuts.index):
if s1>0:
tmpNeg = negD1.loc[s1]
tmpPos = posD1.loc[s1]
if not s1 in freq:
freq[str(s1)] = {'PAM':np.nan,'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':np.nan,'Act':np.nan}
# Store Amp
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not str(s1)+'_CNAamp' in pamLofAct[str(s1)]:
pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos
print('Starting amplifications (pathway)...')
for s1 in list(set(godict_int.keys()).intersection(somMuts.index)):
# If potential Act
if s1 in somMuts.index and s1 in posD1.index:
tmpSom = somMuts.loc[s1]
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
# Store Act
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not (str(s1)+'_Act' in pamLofAct[str(s1)] or tmpAct.equals(tmpSom) or tmpAct.equals(tmpPos)):
pamLofAct[str(s1)][str(s1)+'_Act'] = tmpAct
pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos
print('Starting deletions...')
for loci1 in delLoci:
for s1 in set(delLoci[loci1]).intersection(somMuts.index):
if s1>0:
# If potential Lof
if s1 in somMuts.index and s1 in negD1.index:
tmpSom = somMuts.loc[s1]
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
# Store LoF
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not (str(s1)+'_LoF' in pamLofAct[str(s1)] or tmpLoF.equals(tmpSom) or tmpLoF.equals(tmpNeg)):
pamLofAct[str(s1)][str(s1)+'_LoF'] = tmpLoF
pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg
for s1 in set(delLoci[loci1]).difference(somMuts.index):
if s1>0:
tmpNeg = negD1.loc[s1]
tmpPos = posD1.loc[s1]
if not s1 in freq:
freq[str(s1)] = {'PAM':np.nan,'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':np.nan,'Act':np.nan}
# Store Del
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not str(s1)+'_CNAdel' in pamLofAct[str(s1)]:
pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg
print('Starting deletions (pathways)...')
for s1 in list(set(godict_int.keys()).intersection(somMuts.index)):
# If potential Lof
if s1 in somMuts.index and s1 in negD1.index:
tmpSom = somMuts.loc[s1]
tmpNeg = negD1.loc[s1]
tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
tmpLoF[tmpLoF==2] = 1
tmpPos = posD1.loc[s1]
tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
tmpAct[tmpAct==2] = 1
if not s1 in freq:
freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
# Store LoF
if not str(s1) in pamLofAct:
pamLofAct[str(s1)] = {}
if not (str(s1)+'_LoF' in pamLofAct[str(s1)] or tmpLoF.equals(tmpSom) or tmpLoF.equals(tmpNeg)):
pamLofAct[str(s1)][str(s1)+'_LoF'] = tmpLoF
pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg
# Decide which mutations will be tested in permutation analysis
print('Screening for frequency...')
keepPAM = []
keepDel = []
keepAmp = []
keepers = {}
calcSig = []
for s1 in pamLofAct:
if s1 in freq:
freqPAM = freq[s1]['PAM']
freqPos = freq[s1]['CNAamp']
freqNeg = freq[s1]['CNAdel']
#if summaryMatrix.loc[s1,'CNA_type']=='Del':
# summaryMatrix.loc[s1, 'CNA_freq'] = freq[s1]['CNAdel']
#elif summaryMatrix.loc[s1,'CNA_type']=='Amp':
# summaryMatrix.loc[s1, 'CNA_freq'] = freq[s1]['CNAamp']
freqAct = freq[s1]['Act']
summaryMatrix.loc[s1, 'Act_freq'] = freq[s1]['Act']
freqLoF = freq[s1]['LoF']
summaryMatrix.loc[s1, 'LoF_freq'] = freq[s1]['LoF']
if freqLoF>=0.05 or freqAct>=0.05 or freqPAM>=0.05 or freqPos>=0.05 or freqNeg>=0.05:
name1 = 'Unkown'
if sum(n1.isin([s1]))==1:
name1 = n1.index[n1==int(s1)][0]
print('\t'+''.join([str(i) for i in [name1+' ('+str(s1),') - FreqPAM: ', round(freqPAM,3), ' | FreqNeg: ', round(freqNeg,3), ' | FreqLoF: ', round(freqLoF,3), ' | FreqPos: ', round(freqPos,3),' | FreqAct: ', round(freqAct,3)]]))
# Add PAM
if freqPAM>0 and freqPAM>=params['min_mut_freq'] and s1 in somMutPoint and s1 in sigPAMs:
keepers[str(s1)+'_PAM'] = pamLofAct[str(s1)][str(s1)+'_PAM']
keepPAM.append(str(s1)+'_PAM')
summaryMatrix.loc[s1,'OM_type_selected'] = 'PAM'
# Add Act
if str(s1)+'_Act' in pamLofAct[str(s1)] and freqAct>freqPAM and freqAct>=params['min_mut_freq']:
if freqPAM>=params['min_pam_freq']:
keepers[str(s1)+'_Act'] = pamLofAct[str(s1)][str(s1)+'_Act']
calcSig.append(str(s1)+'_Act')
summaryMatrix.loc[s1,'OM_type_selected'] = 'Act'
# Add CNAamp
if (str(s1)+'_CNAamp' in pamLofAct[str(s1)]) and (freqPos>=params['min_mut_freq']) and not ((summaryMatrix.loc[s1,'OM_type_selected']=='Act') or (summaryMatrix.loc[s1,'OM_type_selected']=='LoF')):
#print(str(s1)+'_CNAamp')
keepers[str(s1)+'_CNAamp'] = pamLofAct[str(s1)][str(s1)+'_CNAamp']
keepAmp.append(str(s1)+'_CNAamp')
summaryMatrix.loc[s1,'OM_type_selected'] = 'CNAamp'
# Add LoF
if str(s1)+'_LoF' in pamLofAct[str(s1)] and freqLoF>freqPAM and freqLoF>=params['min_mut_freq']:
if freqPAM>=params['min_pam_freq']:
keepers[str(s1)+'_LoF'] = pamLofAct[str(s1)][str(s1)+'_LoF']
calcSig.append(str(s1)+'_LoF')
summaryMatrix.loc[s1,'OM_type_selected'] = 'LoF'
# Add CNAdel
if ((str(s1)+'_CNAdel' in pamLofAct[str(s1)]) and (freqNeg>=params['min_mut_freq'])) and not ((summaryMatrix.loc[s1,'OM_type_selected']=='Act') or (summaryMatrix.loc[s1,'OM_type_selected']=='LoF')):
#print(str(s1)+'_CNAdel')
keepers[str(s1)+'_CNAdel'] = pamLofAct[str(s1)][str(s1)+'_CNAdel']
keepDel.append(str(s1)+'_CNAdel')
summaryMatrix.loc[s1,'OM_type_selected'] = 'CNAdel'
##################################
## Conduct permutation analysis ##
##################################
print('Permutation anlaysis...')
numPermutes = 1000
# Permute to get frequency
def singlePermute(somMutsMF, somCNAsMF):
tmp1 = pd.Series(np.random.permutation(somMutsMF), index=somMutsMF.index)
tmp2 = pd.Series(np.random.permutation(somCNAsMF), index=somCNAsMF.index)
subset1 = set(somMutsMF.index).intersection(somCNAsMF.index)
return list(tmp1.loc[subset1]+tmp2.loc[subset1])
permMF_neg = []
permMF_pos = []
if params['load_permutation']==None:
## Compute permutations if not loading from previous run
# Deletions
print('\tPermuting deletions...')
somMutsMF = somMuts.transpose().mean()
somCNAsMF = negD1.transpose().mean()
with tqdm(total=numPermutes) as pbar:
for i in range(numPermutes):
permMF_neg += singlePermute(somMutsMF, somCNAsMF)
pbar.update(1)
# Amplifications
print('\tPermuting amplifications...')
somMutsMF = somMuts.transpose().mean()
somCNAsMF = posD1.transpose().mean()
with tqdm(total=numPermutes) as pbar:
for i in range(numPermutes):
permMF_pos += singlePermute(somMutsMF, somCNAsMF)
pbar.update(1)
# Change precision so that is the same as what will be written out,
# - Fixes bug where precision change leads to different behavior from freshly run (then saved) and loaded permutation data
permMF_neg = [float(str(i)) for i in permMF_neg]
permMF_pos = [float(str(i)) for i in permMF_pos]
# If requested to save out permutations
if params['save_permutation']==True:
print('\tSaving permutations...')
np.save(params['output_path']+'/oncomerge_delPerm', permMF_neg)
np.save(params['output_path']+'/oncomerge_ampPerm', permMF_pos)
elif not params['load_permutation']==None:
## Load up permutations from previous run
print('\tLoading previous permutations...')
permMF_neg = np.load(params['load_permutation']+'/oncomerge_delPerm.npy')
permMF_pos = np.load(params['load_permutation']+'/oncomerge_ampPerm.npy')
# Write Permutation Analysis file Lof_Act_sig
lofActSig = pd.DataFrame(columns = ['Symbol', 'Type','Freq','Emp.p_value'], index = calcSig)
for sig1 in calcSig:
if sig1.find('LoF')>0:
if not re.search('^[A-Za-z].*$', sig1):
lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_LoF'))][0]
else:
lofActSig['Symbol'].loc[sig1] = sig1.rstrip('_LoF')
lofActSig['Type'].loc[sig1] = 'LoF'
lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_LoF')]['LoF']
elif sig1.find('Act')>0:
if not re.search('^[A-Za-z].*$',sig1):
lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_Act'))][0]
else:
lofActSig['Symbol'].loc[sig1] = sig1.rstrip('_Act')
lofActSig['Type'].loc[sig1] = 'Act'
lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_Act')]['Act']
# Precalculate the permuted p-values for each frequency
permdict1 = {}
permdict1['LoF'] = {}
permdict1['Act'] = {}
oMfreqs = [freq[sig1.split('_')[0]][sig1.split('_')[1]] for sig1 in calcSig]
for f in set(oMfreqs):
permdict1['LoF'][f] = float((permMF_neg >= f).sum())/len(permMF_neg)
permdict1['Act'][f] = float((permMF_pos >= f).sum())/len(permMF_pos)
# Add permuted p-values to summary matrix and permutation summary
for sig1 in calcSig:
if sig1.find('LoF')>0:
lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
summaryMatrix.loc[int(sig1.rstrip('_LoF')),'OM_empirical_p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
elif sig1.find('Act')>0:
lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]
summaryMatrix.loc[int(sig1.rstrip('_Act')),'OM_empirical_p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]
# Filter LoF and Act based on permuted p-values
if len(lofActSig)>0:
lofActSig['q_value'] = multipletests(lofActSig['Emp.p_value'], 0.05, method='fdr_bh')[1]
summaryMatrix.loc[[int(i.split('_')[0]) for i in lofActSig.index],'OM_empirical_q_value'] = list(lofActSig['q_value'])
lofActSig.sort_values('q_value').to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
# Screen out LoF and Act that don't meet significance cutoffs
keepLofAct0 = list(lofActSig.index[lofActSig['q_value']<=params['perm_qv']])
else:
# No LoF or Act to filter
lofActSig.to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
keepLofAct0 = []
# Function to map mutation to locus
def findLoci(mutation, ampLoci, delLoci):
gene, mutType = mutation.split('_')
if (mutType == 'Act') or (mutType == 'CNAamp'):
loci = ampLoci
if (mutType == 'LoF') or (mutType == 'CNAdel'):
loci = delLoci
return [locus1 for locus1 in loci.keys() if int(gene) in loci[locus1]]
# Tabulate the number of genes per locus
lofActLoci = {}
for mut in keepLofAct0:
mutLoci = findLoci(mut, ampLoci, delLoci)
for locus1 in mutLoci:
if locus1 not in lofActLoci.keys():
lofActLoci[locus1] = []
lofActLoci[locus1].append(mut)
# Tabulate the number of genes per locus
combinedLoci = lofActLoci
for mut in keepDel+keepAmp:
mutLoci = findLoci(mut, ampLoci, delLoci)
for locus1 in mutLoci:
if not locus1 in lofActLoci:
if locus1 not in combinedLoci.keys():
combinedLoci[locus1] = []
combinedLoci[locus1].append(mut)
## Decide whether to apply the maximum final frequency filter
keepLofAct1 = []
for locus in combinedLoci.keys():
# Don't filter further with maximum final frequency filter
if len(combinedLoci[locus]) < params['min_loci_genes']:
keepLofAct1 += combinedLoci[locus]
for mut in combinedLoci[locus]:
gene, mutType = mut.split('_')
summaryMatrix.loc[int(gene), 'Final_mutation_type'] = mutType
if mutType=='Act' or mutType=='LoF':
summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), mutType+'_freq']
summaryMatrix.loc[int(gene), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene), mutType+'_freq'] - summaryMatrix.loc[int(gene), 'PAM_freq']
else:
summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), 'CNA_freq']
summaryMatrix.loc[int(gene), 'Genes_in_locus'] = len(combinedLoci[locus])
# Filter with maximum final frequency filter
else:
maxFF = max([summaryMatrix.loc[int(mut.split('_')[0]), mut.split('_')[1]+'_freq'] for mut in combinedLoci[locus]])
gl1 = len([mut for mut in combinedLoci[locus] if summaryMatrix.loc[int(mut.split('_')[0]), mut.split('_')[1]+'_freq']==maxFF])
for mut in combinedLoci[locus]:
gene, mutType = mut.split('_')
if summaryMatrix.loc[int(gene), mutType+'_freq']==maxFF:
keepLofAct1.append(mut)
summaryMatrix.loc[int(gene), 'Genes_in_locus'] = gl1
summaryMatrix.loc[int(gene), 'Final_mutation_type'] = mutType
if mutType=='Act' or mutType=='LoF':
summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), mutType+'_freq']
summaryMatrix.loc[int(gene), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene), mutType+'_freq'] - summaryMatrix.loc[int(gene), 'PAM_freq']
else:
summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), 'CNA_freq']
keepLofAct = list(set(keepLofAct1))
# Screen out PAMs that are LoF/Act
newKeepPAM = []
for pam1 in keepPAM:
found = 0
tmp1 = pam1.split('_')[0]
for lofAct in keepLofAct:
if tmp1==lofAct.split('_')[0]:
found = 1
if found==0:
newKeepPAM.append(pam1)
summaryMatrix.loc[int(tmp1), 'Final_mutation_type'] = 'PAM'
summaryMatrix.loc[int(tmp1), 'Final_freq'] = summaryMatrix.loc[int(tmp1), 'PAM_freq']
summaryMatrix.loc[int(tmp1), 'Delta_over_PAM'] = float(0)
## Screen out loci that have a representative gene
# Mutations that are at or above minimum mutation frequency cutoff
highFreqLoci = lociCNA.loc[lociCNA.mean(axis=1)>=params['min_mut_freq']]
# Figure out what loci are explained by current Act or LoF genes
explainedLoc = []
for locus1 in highFreqLoci.index:
genesInLocus = [i for i in lociCNAgenes[locus1] if (str(i)+'_Act' in keepLofAct or str(i)+'_LoF' in keepLofAct)]
if len(genesInLocus)>0:
explainedLoc.append(locus1.split('_')[0])
# Screen out all other loci in that region
keepLoc = []
for locus1 in highFreqLoci.index:
if not locus1.split('_')[0] in explainedLoc:
keepLoc.append(locus1)
keepLoc_dict = {}
for locus1 in set(['_'.join([i.split('_')[0],i.split('_')[-1]]) for i in keepLoc]):
locus2 = locus1.split('_')[0]
if locus2 in ampLoci.keys():
keepLoc_dict[locus1] = posD1.loc[ampLoci[locus2]]
if locus2 in delLoci.keys():
keepLoc_dict[locus1] = negD1.loc[delLoci[locus2]]
def mode2(pat1col):
tmpser = pat1col.mode()
if len(tmpser)>1:
tmp10 = 1
else:
tmp10 = tmpser.iloc[0]
return tmp10
# Use most common value to determine the mutational profile of a loci
keepLoc_df = pd.DataFrame(columns = d1.columns)
tmpMode = pd.Series(index = d1.columns)
for locus1 in keepLoc_dict.keys():
for pat1 in keepLoc_dict[locus1].columns:
tmpMode.loc[pat1] = mode2(keepLoc_dict[locus1][pat1])
if tmpMode.mean()>=0.05:
keepLoc_df.loc[locus1] = tmpMode
keepLoc_df = keepLoc_df.applymap(int)
####################################
## Compile OncoMerge output files ##
####################################
finalMutFile = pd.concat([pd.DataFrame(keepers).transpose().loc[newKeepPAM].sort_index(), pd.DataFrame(keepers).transpose().loc[keepLofAct].sort_index(), keepLoc_df.sort_index()], sort=True)
# Rename all loci with only one gene
ind_list = finalMutFile.index.tolist()
for locus1 in keepLoc_df.index:
splitUp = locus1.split('_')
if splitUp[-1]=='CNAamp' and len(ampLoci[splitUp[0]])==1:
idx = ind_list.index(locus1)
ind_list[idx] = str(ampLoci[splitUp[0]][0]) + '_CNAamp'
summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAamp'
summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']
summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']
if splitUp[-1]=='CNAdel' and len(delLoci[splitUp[0]])==1:
idx = ind_list.index(locus1)
ind_list[idx] = str(delLoci[splitUp[0]][0]) + '_CNAdel'
summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAdel'
summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']
summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']
finalMutFile.index = ind_list
# Write out final mutation matrix to csv file
finalMutFile.to_csv(params['output_path']+'/oncoMerge_mergedMuts.csv')
# Write out summary matrix to csv file
summaryMatrix.to_csv(params['output_path']+'/oncoMerge_summaryMatrix.csv')
## Write out loci information
# Prepare for writing out
writeLoci = ['Locus_name,Genes']
for locus1 in keepLoc_df.index:
splitUp = locus1.split('_')
if splitUp[-1]=='CNAamp':
writeLoci.append(locus1+','+' '.join([str(i) for i in ampLoci[splitUp[0]]]))
if splitUp[-1]=='CNAdel':
writeLoci.append(locus1+','+' '.join([str(i) for i in delLoci[splitUp[0]]]))
# Write out loci file
with open(params['output_path']+'/oncoMerge_CNA_loci.csv','w') as outFile:
outFile.write('\n'.join(writeLoci))
## Write out information for hypergeometric analysis
# Can be used to load in gene information from final mutation file:
# [[int(j.split('_')[0]),j.split('_')[1]] for j in finalMutFile.index[[not i for i in list(finalMutFile.index.str.contains('p|q'))]]]
# Write out background information for hypergeometric analysis
backgrounds = {i:[int(j) for j in backgrounds[i]] for i in backgrounds}
with open(params['output_path']+'/background.json', 'w') as outFile:
json.dump(backgrounds, outFile)
print('Done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment