cplaisier/oncoMerge_path.py

## oncoMerge_path.py
##########################################################
## OncoMerge:  oncoMerge.py                             ##
##  ______     ______     __  __                        ##
## /\  __ \   /\  ___\   /\ \/\ \                       ##
## \ \  __ \  \ \___  \  \ \ \_\ \                      ##
##  \ \_\ \_\  \/\_____\  \ \_____\                     ##
##   \/_/\/_/   \/_____/   \/_____/                     ##
## @Developed by: Plaisier Lab                          ##
##   (https://plaisierlab.engineering.asu.edu/)         ##
##   Arizona State University                           ##
##   242 ISTB1, 550 E Orange St                         ##
##   Tempe, AZ  85281                                   ##
## @github: https://github.com/plaisier-lab/OncoMerge   ##
## @Author:  Chris Plaisier, Rob Schultz                ##
## @License:  GNU GPLv3                                 ##
##                                                      ##
## If this program is used in your analysis please      ##
## mention who built it. Thanks. :-)                    ##
##########################################################


#####################
## Import packages ##
#####################

import re
import json
import argparse
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests # pip install statsmodels==0.10.0
import itertools
import sys
import os
from scipy.stats import hypergeom
from tqdm import tqdm
#import mygene
#mg = mygene.MyGeneInfo() # pull in function to map genes

##################################
## Read in command line options ##
##################################

parser = argparse.ArgumentParser(description='OncoMerge merges patient Protein Affecting Mutations (PAMs) and Copy Number Alterations (CNAs) into a unified mutation matrix.')
parser.add_argument('-cf', '--config_path', help='Path to JSON encoded configuration file, overrides command line parameters', type = str)
parser.add_argument('-gp', '--gistic_path', help='Path to GISTIC output folder', type = str)
parser.add_argument('-dp', '--del_path', help='Path to GISTIC deletion file (default = del_genes.conf_99.txt)', type = str, default = 'del_genes.conf_99.txt')
parser.add_argument('-ap', '--amp_path', help='Path to the GISTIC amplification file (default = amp_genes.conf_99.txt)', type = str, default = 'amp_genes.conf_99.txt')
parser.add_argument('-gdp', '--gene_data_path', help='Path to the GISTIC gene data file (default = all_data_by_genes.txt)', type = str, default = 'all_data_by_genes.txt')
parser.add_argument('-cfp', '--conversion_file_path', help='Supply hand annotated file to convert gene symbols to Entrez IDs (default does not import hand annotated conversion file and instead uses conversion embedded in GISTIC output files).', type = str)
parser.add_argument('-ln', '--label_name', help='Label for Entrez ID column in GISTIC gene data file (default = \'Gene ID\')', type = str, default = 'Gene ID')
parser.add_argument('-tp', '--thresh_path', help='Path to the GISTIC all_thresholded file (default = all_thresholded.by_genes.txt)', type = str, default = 'all_thresholded.by_genes.txt')
parser.add_argument('-smp', '--som_mut_path', help='Path to the somatic mutations file (CSV matrix where columns are patients and genes are rows) [0 = not mutated, and 1 = mutated]', type = str)
parser.add_argument('-mscv', '--mutsig2cv_path', help='Path to a MutSig2CV output file', type = str)
parser.add_argument('-op', '--output_path', help='Path you would like to output OncoMerged files (default = current directory)', type = str, default = '.')
parser.add_argument('-mmf', '--min_mut_freq', help='Minimum frequency of mutation (range = 0-1; default = 0.05)', type = float, default = 0.05)
parser.add_argument('-pq', '--perm_qv', help='Permuted p-value FDR BH corrected cutoff (default = 0.1)', type = float, default = 0.1)
parser.add_argument('-sp', '--save_permutation', help='Run and save out permutation analysis to be used for comparability in another OncoMerge run (default off)', action='store_true')
parser.add_argument('-lp', '--load_permutation', help='Do not run permutation anlaysis and load permutation anlaysis from previous run (default off)', type = str, default = None)
parser.add_argument('-mlg', '--min_loci_genes', help='Minimum number of genes in loci to apply maximum final frequency filter (default = 10)', type = int, default = 10)
parser.add_argument('-mpf', '--min_pam_freq', help='Minimum PAM frequency (default = 0.01)', type = float, default = 0.01)
args = parser.parse_args()


#######################
## Define parameters ##
#######################

params = args.__dict__
if args.config_path:
    with open(args.config_path, "r") as cfg:
        tmp = json.loads(cfg.read())
        for i in tmp:
            params[i] = tmp[i]
if (not params['gistic_path']) or (not params['som_mut_path']) or (not params['mutsig2cv_path']) or (params['save_permutation'] and not params['load_permutation']==None):
    parser.print_help()
    sys.exit(1)


##################
## Load up data ##
##################

print('Loading data...')

# Create conversion series for gene symbol to Entrez ID
if not params['conversion_file_path']:
    n1 = pd.read_csv(params['gistic_path']+'/'+params['gene_data_path'],index_col=0,sep='\t',usecols=[0,1])
    n1.index = [i.split('|')[0] for i in n1.index]
    n1 = n1.drop_duplicates()
    n1 = n1[params['label_name']]
else:
    n1 = pd.read_csv(params['conversion_file_path'],index_col=0)[params['label_name']].apply(int)

# load up significantly mutated genes
mutSig2CV = pd.read_csv(params['mutsig2cv_path'],index_col=1)
mutSig2CV = mutSig2CV.loc[mutSig2CV.index.map(lambda x: x in n1.index)]
mutSig2CV.index = mutSig2CV.index.map(lambda x: n1.loc[x])
mutSig2CV = mutSig2CV.loc[~mutSig2CV.index.duplicated(keep='first')]
sigPAMs = list(mutSig2CV.index[mutSig2CV['q']<=0.1]) # Changed to 0.1 based on Lawerence et al., Nature 2013 (PMID = 23770567)

# Get list of significantly CNA amplified genes
ampLoci = {}
ampLoci_qv = {}
amp1 = pd.read_csv(params['gistic_path']+'/'+params['amp_path'],index_col=0,sep='\t')
for col1 in amp1.columns:
    if float(amp1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1=='Y'):
        ampLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(amp1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
        ampLoci_qv[col1] = float(amp1[col1]['residual q value'])

# Get list of significantly CNA deleted genes
delLoci = {}
delLoci_qv = {}
del1 = pd.read_csv(params['gistic_path']+'/'+params['del_path'],index_col=0,sep='\t')
for col1 in del1.columns:
    if float(del1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1=='Y'):
        delLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(del1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
        delLoci_qv[col1] = float(del1[col1]['residual q value'])

# Set up background gene numbers for gold-standard enrichment analysis
backgrounds = {'Activating':[], 'LossOfFunction':[], 'Aggregate':[]}
actTmp = [gene for locus in ampLoci for gene in ampLoci[locus]]
backgrounds['Activating'] += actTmp
backgrounds['Aggregate'] += actTmp
delTmp = [gene for locus in delLoci for gene in delLoci[locus]]
backgrounds['LossOfFunction'] += delTmp
backgrounds['Aggregate'] += delTmp
backgrounds['Aggregate'] += sigPAMs

#Load go pathway
godf=pd.read_csv(params['gopath'],sep=',', index_col=0, dtype=object, header=None)
keggdf=pd.read_csv(params['keggpath'],sep=',', index_col=0, dtype=object, header=None)
keggdf = keggdf.iloc[1:]
print("go pathway")
print(godf)
print("go pathway done1")

godict_int=dict(zip(godf.index,[[int(j) for j in godf.loc[i][2].split(' ')] for i in godf.index]))
kegg_int=dict(zip(keggdf.index,[[int(j) for j in keggdf.loc[i][2].split(' ')] for i in keggdf.index]))
#godict_int = {key: value for key, value in godict_int.items() if key.isnumeric()}
#godict_string=(zip(godf.index,[godf.loc[i][2].split(' ') for i in godf.index]))

# Load up enrichment sets
"""enrichmentSets = {}
if 'enrichment_sets' in params:
    for set1 in params['enrichment_sets']:
        enSet1 = pd.read_csv(params['enrichment_sets'][set1], header=0)
        enrichmentSets[set1] = {}
        enrichmentSets[set1]['Aggregate'] = {'Activating': [], 'LossOfFunction': []}
        for cancer in set(enSet1['Cancer']):
            enrichmentSets[set1][cancer] = {}
            for type1 in ['Activating', 'LossOfFunction']:
                enrichmentSets[set1][cancer][type1] = list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
                enrichmentSets[set1]['Aggregate'][type1] += list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
"""

# Load up somatically mutated genes
somMuts = pd.read_csv(params['som_mut_path'],index_col=0,header=0)
if not somMuts.index.dtype=='int64':
    somMuts = somMuts.loc[somMuts.index.map(lambda x: x in n1.index)]
    somMuts.index = somMuts.index.map(lambda x: n1.loc[x])
somMuts = somMuts.loc[~somMuts.index.duplicated(keep='first')]

# Read in gistic all_data_by_genes file
with open(params['gistic_path']+'/'+params['thresh_path'],'r') as inFile:
    tmp = inFile.readline().strip().split('\t')
    numCols1 = len(tmp)
d1 = pd.read_csv(params['gistic_path']+'/'+params['thresh_path'],index_col=0,sep='\t').drop(tmp[1], axis = 1)
d1.columns = [i[:12] for i in d1.columns]
d1 = d1.loc[d1.index.map(lambda x: x.split('|')[0] in n1.index)]
d1.index = d1.index.map(lambda x: n1.loc[x.split('|')[0]])
d1.index.name = 'Locus ID'

# Removing sex chromosomes (issues in CNA analysis) from d1
lociThresh = d1['Cytoband']
include = []
for i in lociThresh:
    if not(i[0]=='X' or i[0]=='Y'):
        include.append(True)
    else:
        include.append(False)

d1 = d1.loc[lociThresh[include].index].drop('Cytoband', axis = 1)

# Make sure somMuts and gistic have same samples
somMuts = somMuts[list(set(d1.columns).intersection(somMuts.columns))]
d1 = d1[list(set(d1.columns).intersection(somMuts.columns))]

# Get rid of duplicated rows
d1 = d1[~d1.index.duplicated(keep='first')]

## Fill out summary matrix
summaryMatrix = pd.DataFrame(index= list(set([gene for locus in ampLoci.values() for gene in locus] + [gene for locus in delLoci.values() for gene in locus] + list(somMuts.index))), columns = ['Symbol', 'PAM_freq', 'MutSig2CV_qvalue', 'CNA_freq', 'CNA_locus', 'CNA_type', 'GISTIC_residual_q_value', 'Act_freq', 'LoF_freq', 'OM_type_selected', 'OM_empirical_p_value', 'OM_empirical_q_value', 'Genes_in_locus',  'Final_mutation_type', 'Final_freq', 'Delta_over_PAM'])

# Add gene symbols
toMatch = [i for i in summaryMatrix.index if i in n1.values]
summaryMatrix.loc[toMatch,'Symbol'] = [n1.index[n1==i][0] for i in toMatch]

# Add MutSig2CV q-values
toMatch = [i for i in summaryMatrix.index if i in mutSig2CV.index]
summaryMatrix.loc[toMatch,'MutSig2CV_qvalue'] = mutSig2CV.loc[toMatch,'q']

# Print out some useful information
print('\tSize of CNA matrix: '+str(d1.shape))
print('\tSize of somatic mutation matrix: '+str(somMuts.shape))
print('Finished loading data.')

# Make the output directory if it doesn't exists already
if not os.path.exists(params['output_path']):
    os.mkdir(params['output_path'])


########################
## Begin onocoMerging ##
########################

# Cutoff somatic mutations based on the minimum mutation frequency (mf)
freq1 = somMuts.sum(axis=1)/len(list(somMuts.columns))
somMutPoint = freq1[freq1>=params['min_mut_freq']].index
tmp1 = [i for i in summaryMatrix.index if i in freq1.index]
summaryMatrix.loc[tmp1, 'PAM_freq'] = freq1.loc[tmp1]

# Precompute positive and negative dichotomized matrices
print('Precomputing dichotomized matrices...')
posdicot = (lambda x: 1 if x>=2 else 0)
posD1 = d1.applymap(posdicot)
posFreq = posD1.mean(axis=1)
ampGenes = {j:i for i in ampLoci for j in ampLoci[i]}
negdicot = (lambda x: 1 if x<=(-2) else 0)
negD1 = d1.applymap(negdicot)
negFreq = negD1.mean(axis=1)
delGenes = {j:i for i in delLoci for j in delLoci[i]}
print('Finished precomputing dichotomized matrices.')

# Add back genes that are high frequency amplification or deletion (20%)
for gene1 in list([i for i in posD1.index if posFreq[i]>=0.2]):
    if (not gene1 in ampGenes) and isinstance(lociThresh[gene1], str) and (lociThresh[gene1] in ampLoci_qv) and (gene1 in posD1.index):
        ampGenes[gene1] = lociThresh[gene1]
        ampLoci[lociThresh[gene1]].append(gene1)
        #print('Pos', gene1, posFreq[gene1], lociThresh[gene1])

for gene1 in list([i for i in negD1.index if negFreq[i]>=0.2]):
    if (not gene1 in delGenes) and isinstance(lociThresh[gene1], str) and (lociThresh[gene1] in delLoci_qv) and (gene1 in negD1.index):
        delGenes[gene1] = lociThresh[gene1]
        delLoci[lociThresh[gene1]].append(gene1)
        #print('Neg', gene1, negFreq[gene1], lociThresh[gene1])

print('Generating CNA summaryMatrix data...')
somMuts1 = list(set(list(ampGenes.keys())+list(delGenes.keys())).intersection(list(d1.index)))
delGenesSet = set(delGenes.keys())
ampGenesSet = set(ampGenes.keys())
bothGenesSet = (ampGenesSet.intersection(delGenesSet)).intersection(somMuts1)
onlyDel = delGenesSet.difference(ampGenesSet).intersection(somMuts1)
onlyAmp = ampGenesSet.difference(delGenesSet).intersection(somMuts1)
with tqdm(total=len(bothGenesSet)+len(onlyDel)+len(onlyAmp)) as pbar:
    for both1 in bothGenesSet:
        # Conflict!
        # Choose amplification if is higher frequency
        if posFreq[both1]>negFreq[both1]:
            summaryMatrix.loc[both1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Amp', ampGenes[both1], ampLoci_qv[ampGenes[both1]], posFreq[both1]]
        # Otherwise choose deletion (deletion becomse default for equal frequency; unlikely to happen)
        else:
            summaryMatrix.loc[both1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Del', delGenes[both1], delLoci_qv[delGenes[both1]], negFreq[both1]]
        pbar.update(1)
    # Straight up deletion
    for del1 in onlyDel:
        summaryMatrix.loc[del1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Del', delGenes[del1], delLoci_qv[delGenes[del1]], negFreq[del1]]
        pbar.update(1)
    # Straight up amplification
    for amp1 in onlyAmp:
        summaryMatrix.loc[amp1,['CNA_type','CNA_locus','GISTIC_residual_q_value','CNA_freq']] = ['Amp', ampGenes[amp1], ampLoci_qv[ampGenes[amp1]], posFreq[amp1]]
        pbar.update(1)
print('Done.')


#intersect gene lists
#pos_intersect = pd.merge(posD1, somMuts, how='inner', right_index=True, left_index=True indicator=True])
#neg_intersect = pd.merge(negD1, somMuts, how='inner', on=['user_id'])
#see negD1
newgodict= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(negD1.index))
    if len(tmp1) > 2:
        newgodict[x]=tmp1

aggnegD1=pd.DataFrame(columns=negD1.columns)
for x in newgodict.keys():
    aggnegD1.loc[x] = negD1.loc[newgodict[x]].max(axis=0)

negD1 = pd.concat([negD1,aggnegD1])

#somMuts
newgodict_sm= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(somMuts.index))
    if len(tmp1) > 2:
        newgodict_sm[x]=tmp1

aggsomMuts = pd.DataFrame(columns=somMuts.columns)
for x in newgodict_sm.keys():
    aggsomMuts.loc[x] = somMuts.loc[newgodict_sm[x]].max(axis=0)

somMuts = pd.concat([somMuts,aggsomMuts])

#posD1
newgodictpos= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(posD1.index))
    if len(tmp1) > 2:
        newgodictpos[x]=tmp1

aggposD1 = pd.DataFrame(columns=posD1.columns)
for x in newgodictpos.keys():
    aggposD1.loc[x] = posD1.loc[newgodictpos[x]].max(axis=0)

posD1 = pd.concat([posD1,aggposD1])

# Merge loci for deletions and amplifications
lociCNAgenes = {}
lociCNA = pd.DataFrame(columns=d1.columns)
print('Bundling amplification loci...')
for loci1 in ampLoci:
    # Get matrix of CNAs for genes in loci
    dt = posD1.loc[ampLoci[loci1]]
    # Get unique rows
    dedup = dt.drop_duplicates(keep='first')
    # Get genes which match and add to output dictionaries
    for i in range(len(dedup.index)):
        cnaName = loci1+'_'+str(i)+'_CNAamp'
        lociCNA.loc[cnaName] = dedup.iloc[i]
        lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]

print('Bundling deletion loci...')
for loci1 in delLoci:
    # Get matrix of CNAs for genes in loci
    dt = negD1.loc[delLoci[loci1]]
    # Get unique rows
    dedup = dt.drop_duplicates(keep='first')
    # Get genes which match and add to output dictionaries
    for i in range(len(dedup.index)):
        cnaName = loci1+'_'+str(i)+'_CNAdel'
        lociCNA.loc[cnaName] = dedup.iloc[i]
        lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]

# Make combined matrix
# LoF = deletions + somatic point mutations
# Act = amplifications + somatic point mutations
print('Starting somatic mutations...')
pamLofAct = {}
freq = {}
for s1 in somMutPoint:
    if s1>0:
        if not str(s1) in pamLofAct:
            pamLofAct[str(s1)] = {}
        tmpSom = somMuts.loc[s1]
        # If potential PAM, store PAM
        if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
            pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
        if (s1 in negD1.index and s1 in posD1.index):
            tmpNeg = negD1.loc[s1]
            tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
            tmpLoF[tmpLoF==2] = 1
            tmpPos = posD1.loc[s1]
            tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
            tmpAct[tmpAct==2] = 1
            if not s1 in freq:
                freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
        else:
            if not s1 in freq:
                freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}
###
print('Starting pathways...')
for s1 in set(godict_int.keys()).intersection(somMuts.index):
    if not str(s1) in pamLofAct:
        pamLofAct[str(s1)] = {}
    tmpSom = somMuts.loc[s1]
    # If potential PAM, store PAM
    if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
        pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
    if (s1 in negD1.index and s1 in posD1.index):
        tmpNeg = negD1.loc[s1]
        tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
        tmpLoF[tmpLoF==2] = 1
        tmpPos = posD1.loc[s1]
        tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
        tmpAct[tmpAct==2] = 1
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
    else:
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}


print('Starting amplifications...')
for loci1 in ampLoci:
    for s1 in list(set(ampLoci[loci1]).intersection(somMuts.index)):
        if s1>0:
            # If potential Act
            if s1 in somMuts.index and s1 in posD1.index:
                tmpSom = somMuts.loc[s1]
                tmpNeg = negD1.loc[s1]
                tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
                tmpLoF[tmpLoF==2] = 1
                tmpPos = posD1.loc[s1]
                tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
                tmpAct[tmpAct==2] = 1
                if not s1 in freq:
                    freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
                # Store Act
                if not str(s1) in pamLofAct:
                    pamLofAct[str(s1)] = {}
                if not (str(s1)+'_Act' in pamLofAct[str(s1)] or tmpAct.equals(tmpSom) or tmpAct.equals(tmpPos)):
                    pamLofAct[str(s1)][str(s1)+'_Act'] = tmpAct
                    pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos
    for s1 in set(ampLoci[loci1]).difference(somMuts.index):
        if s1>0:
            tmpNeg = negD1.loc[s1]
            tmpPos = posD1.loc[s1]
            if not s1 in freq:
                freq[str(s1)] = {'PAM':np.nan,'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':np.nan,'Act':np.nan}
            # Store Amp
            if not str(s1) in pamLofAct:
                pamLofAct[str(s1)] = {}
            if not str(s1)+'_CNAamp' in pamLofAct[str(s1)]:
                pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos


print('Starting amplifications (pathway)...')
for s1 in list(set(godict_int.keys()).intersection(somMuts.index)):
    # If potential Act
    if s1 in somMuts.index and s1 in posD1.index:
        tmpSom = somMuts.loc[s1]
        tmpNeg = negD1.loc[s1]
        tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
        tmpLoF[tmpLoF==2] = 1
        tmpPos = posD1.loc[s1]
        tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
        tmpAct[tmpAct==2] = 1
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
        # Store Act
        if not str(s1) in pamLofAct:
            pamLofAct[str(s1)] = {}
        if not (str(s1)+'_Act' in pamLofAct[str(s1)] or tmpAct.equals(tmpSom) or tmpAct.equals(tmpPos)):
            pamLofAct[str(s1)][str(s1)+'_Act'] = tmpAct
            pamLofAct[str(s1)][str(s1)+'_CNAamp'] = tmpPos


print('Starting deletions...')
for loci1 in delLoci:
    for s1 in set(delLoci[loci1]).intersection(somMuts.index):
        if s1>0:
            # If potential Lof
            if s1 in somMuts.index and s1 in negD1.index:
                tmpSom = somMuts.loc[s1]
                tmpNeg = negD1.loc[s1]
                tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
                tmpLoF[tmpLoF==2] = 1
                tmpPos = posD1.loc[s1]
                tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
                tmpAct[tmpAct==2] = 1
                if not s1 in freq:
                    freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
                # Store LoF
                if not str(s1) in pamLofAct:
                    pamLofAct[str(s1)] = {}
                if not (str(s1)+'_LoF' in pamLofAct[str(s1)] or tmpLoF.equals(tmpSom) or tmpLoF.equals(tmpNeg)):
                    pamLofAct[str(s1)][str(s1)+'_LoF'] = tmpLoF
                    pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg
    for s1 in set(delLoci[loci1]).difference(somMuts.index):
        if s1>0:
            tmpNeg = negD1.loc[s1]
            tmpPos = posD1.loc[s1]
            if not s1 in freq:
                freq[str(s1)] = {'PAM':np.nan,'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':np.nan,'Act':np.nan}
            # Store Del
            if not str(s1) in pamLofAct:
                pamLofAct[str(s1)] = {}
            if not str(s1)+'_CNAdel' in pamLofAct[str(s1)]:
                pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg


print('Starting deletions (pathways)...')
for s1 in list(set(godict_int.keys()).intersection(somMuts.index)):
    # If potential Lof
    if s1 in somMuts.index and s1 in negD1.index:
        tmpSom = somMuts.loc[s1]
        tmpNeg = negD1.loc[s1]
        tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
        tmpLoF[tmpLoF==2] = 1
        tmpPos = posD1.loc[s1]
        tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
        tmpAct[tmpAct==2] = 1
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
        # Store LoF
        if not str(s1) in pamLofAct:
            pamLofAct[str(s1)] = {}
        if not (str(s1)+'_LoF' in pamLofAct[str(s1)] or tmpLoF.equals(tmpSom) or tmpLoF.equals(tmpNeg)):
            pamLofAct[str(s1)][str(s1)+'_LoF'] = tmpLoF
            pamLofAct[str(s1)][str(s1)+'_CNAdel'] = tmpNeg


# Decide which mutations will be tested in permutation analysis
print('Screening for frequency...')
keepPAM = []
keepDel = []
keepAmp = []
keepers = {}
calcSig = []
for s1 in pamLofAct:
    if s1 in freq:
        freqPAM = freq[s1]['PAM']
        freqPos = freq[s1]['CNAamp']
        freqNeg = freq[s1]['CNAdel']
        #if summaryMatrix.loc[s1,'CNA_type']=='Del':
        #    summaryMatrix.loc[s1, 'CNA_freq'] = freq[s1]['CNAdel']
        #elif summaryMatrix.loc[s1,'CNA_type']=='Amp':
        #    summaryMatrix.loc[s1, 'CNA_freq'] = freq[s1]['CNAamp']
        freqAct = freq[s1]['Act']
        summaryMatrix.loc[s1, 'Act_freq'] = freq[s1]['Act']
        freqLoF = freq[s1]['LoF']
        summaryMatrix.loc[s1, 'LoF_freq'] = freq[s1]['LoF']
        if freqLoF>=0.05 or freqAct>=0.05 or freqPAM>=0.05 or freqPos>=0.05 or freqNeg>=0.05:
            name1 = 'Unkown'
            if sum(n1.isin([s1]))==1:
                name1 = n1.index[n1==int(s1)][0]
            print('\t'+''.join([str(i) for i in [name1+' ('+str(s1),') - FreqPAM: ', round(freqPAM,3), ' | FreqNeg: ', round(freqNeg,3), ' | FreqLoF: ', round(freqLoF,3), ' | FreqPos: ', round(freqPos,3),' | FreqAct: ', round(freqAct,3)]]))
        # Add PAM
        if freqPAM>0 and freqPAM>=params['min_mut_freq'] and s1 in somMutPoint and s1 in sigPAMs:
            keepers[str(s1)+'_PAM'] = pamLofAct[str(s1)][str(s1)+'_PAM']
            keepPAM.append(str(s1)+'_PAM')
            summaryMatrix.loc[s1,'OM_type_selected'] = 'PAM'
        # Add Act
        if str(s1)+'_Act' in pamLofAct[str(s1)] and freqAct>freqPAM and freqAct>=params['min_mut_freq']:
            if freqPAM>=params['min_pam_freq']:
                keepers[str(s1)+'_Act'] = pamLofAct[str(s1)][str(s1)+'_Act']
                calcSig.append(str(s1)+'_Act')
                summaryMatrix.loc[s1,'OM_type_selected'] = 'Act'
        # Add CNAamp
        if (str(s1)+'_CNAamp' in pamLofAct[str(s1)]) and (freqPos>=params['min_mut_freq']) and not ((summaryMatrix.loc[s1,'OM_type_selected']=='Act') or (summaryMatrix.loc[s1,'OM_type_selected']=='LoF')):
            #print(str(s1)+'_CNAamp')
            keepers[str(s1)+'_CNAamp'] = pamLofAct[str(s1)][str(s1)+'_CNAamp']
            keepAmp.append(str(s1)+'_CNAamp')
            summaryMatrix.loc[s1,'OM_type_selected'] = 'CNAamp'
        # Add LoF
        if str(s1)+'_LoF' in pamLofAct[str(s1)] and freqLoF>freqPAM and freqLoF>=params['min_mut_freq']:
            if freqPAM>=params['min_pam_freq']:
                keepers[str(s1)+'_LoF'] = pamLofAct[str(s1)][str(s1)+'_LoF']
                calcSig.append(str(s1)+'_LoF')
                summaryMatrix.loc[s1,'OM_type_selected'] = 'LoF'
        # Add CNAdel
        if ((str(s1)+'_CNAdel' in pamLofAct[str(s1)]) and (freqNeg>=params['min_mut_freq'])) and not ((summaryMatrix.loc[s1,'OM_type_selected']=='Act') or (summaryMatrix.loc[s1,'OM_type_selected']=='LoF')):
            #print(str(s1)+'_CNAdel')
            keepers[str(s1)+'_CNAdel'] = pamLofAct[str(s1)][str(s1)+'_CNAdel']
            keepDel.append(str(s1)+'_CNAdel')
            summaryMatrix.loc[s1,'OM_type_selected'] = 'CNAdel'

##################################
## Conduct permutation analysis ##
##################################
print('Permutation anlaysis...')
numPermutes = 1000

# Permute to get frequency
def singlePermute(somMutsMF, somCNAsMF):
    tmp1 = pd.Series(np.random.permutation(somMutsMF), index=somMutsMF.index)
    tmp2 = pd.Series(np.random.permutation(somCNAsMF), index=somCNAsMF.index)
    subset1 = set(somMutsMF.index).intersection(somCNAsMF.index)
    return list(tmp1.loc[subset1]+tmp2.loc[subset1])

permMF_neg = []
permMF_pos = []
if params['load_permutation']==None:
    ## Compute permutations if not loading from previous run
    # Deletions
    print('\tPermuting deletions...')
    somMutsMF = somMuts.transpose().mean()
    somCNAsMF = negD1.transpose().mean()
    with tqdm(total=numPermutes) as pbar:
        for i in range(numPermutes):
            permMF_neg += singlePermute(somMutsMF, somCNAsMF)
            pbar.update(1)

    # Amplifications
    print('\tPermuting amplifications...')
    somMutsMF = somMuts.transpose().mean()
    somCNAsMF = posD1.transpose().mean()
    with tqdm(total=numPermutes) as pbar:
        for i in range(numPermutes):
            permMF_pos += singlePermute(somMutsMF, somCNAsMF)
            pbar.update(1)

    # Change precision so that is the same as what will be written out,
    # - Fixes bug where precision change leads to different behavior from freshly run (then saved) and loaded permutation data
    permMF_neg = [float(str(i)) for i in permMF_neg]
    permMF_pos = [float(str(i)) for i in permMF_pos]

    # If requested to save out permutations
    if params['save_permutation']==True:
        print('\tSaving permutations...')
        np.save(params['output_path']+'/oncomerge_delPerm', permMF_neg)
        np.save(params['output_path']+'/oncomerge_ampPerm', permMF_pos)
elif not params['load_permutation']==None:
    ## Load up permutations from previous run
    print('\tLoading previous permutations...')
    permMF_neg = np.load(params['load_permutation']+'/oncomerge_delPerm.npy')
    permMF_pos = np.load(params['load_permutation']+'/oncomerge_ampPerm.npy')

# Write Permutation Analysis file Lof_Act_sig
lofActSig = pd.DataFrame(columns = ['Symbol', 'Type','Freq','Emp.p_value'], index = calcSig)
for sig1 in calcSig:
    if sig1.find('LoF')>0:
        if not re.search('^[A-Za-z].*$', sig1):
            lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_LoF'))][0]
        else:
            lofActSig['Symbol'].loc[sig1] = sig1.rstrip('_LoF')
        lofActSig['Type'].loc[sig1] = 'LoF'
        lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_LoF')]['LoF']
    elif sig1.find('Act')>0:
        if not re.search('^[A-Za-z].*$',sig1):
            lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_Act'))][0]
        else:
            lofActSig['Symbol'].loc[sig1] = sig1.rstrip('_Act')
        lofActSig['Type'].loc[sig1] = 'Act'
        lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_Act')]['Act']

# Precalculate the permuted p-values for each frequency
permdict1 = {}
permdict1['LoF'] = {}
permdict1['Act'] = {}
oMfreqs = [freq[sig1.split('_')[0]][sig1.split('_')[1]] for sig1 in calcSig]
for f in set(oMfreqs):
    permdict1['LoF'][f] = float((permMF_neg >= f).sum())/len(permMF_neg)
    permdict1['Act'][f] = float((permMF_pos >= f).sum())/len(permMF_pos)

# Add permuted p-values to summary matrix and permutation summary
for sig1 in calcSig:
    if sig1.find('LoF')>0:
        lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
        summaryMatrix.loc[int(sig1.rstrip('_LoF')),'OM_empirical_p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
    elif sig1.find('Act')>0:
        lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]
        summaryMatrix.loc[int(sig1.rstrip('_Act')),'OM_empirical_p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]

# Filter LoF and Act based on permuted p-values
if len(lofActSig)>0:
    lofActSig['q_value'] = multipletests(lofActSig['Emp.p_value'], 0.05, method='fdr_bh')[1]
    summaryMatrix.loc[[int(i.split('_')[0]) for i in lofActSig.index],'OM_empirical_q_value'] = list(lofActSig['q_value'])
    lofActSig.sort_values('q_value').to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
    # Screen out LoF and Act that don't meet significance cutoffs
    keepLofAct0 = list(lofActSig.index[lofActSig['q_value']<=params['perm_qv']])
else:
    # No LoF or Act to filter
    lofActSig.to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
    keepLofAct0 = []

# Function to map mutation to locus
def findLoci(mutation, ampLoci, delLoci):
    gene, mutType = mutation.split('_')
    if (mutType == 'Act') or (mutType == 'CNAamp'):
        loci = ampLoci
    if (mutType == 'LoF') or (mutType == 'CNAdel'):
        loci = delLoci
    return [locus1 for locus1 in loci.keys() if int(gene) in loci[locus1]]

# Tabulate the number of genes per locus
lofActLoci = {}
for mut in keepLofAct0:
    mutLoci = findLoci(mut, ampLoci, delLoci)
    for locus1 in mutLoci:
        if locus1 not in lofActLoci.keys():
            lofActLoci[locus1] = []
        lofActLoci[locus1].append(mut)

# Tabulate the number of genes per locus
combinedLoci = lofActLoci
for mut in keepDel+keepAmp:
    mutLoci = findLoci(mut, ampLoci, delLoci)
    for locus1 in mutLoci:
        if not locus1 in lofActLoci:
            if locus1 not in combinedLoci.keys():
                combinedLoci[locus1] = []
            combinedLoci[locus1].append(mut)

## Decide whether to apply the maximum final frequency filter
keepLofAct1 = []
for locus in combinedLoci.keys():
    # Don't filter further with maximum final frequency filter
    if len(combinedLoci[locus]) < params['min_loci_genes']:
        keepLofAct1 += combinedLoci[locus]
        for mut in combinedLoci[locus]:
            gene, mutType = mut.split('_')
            summaryMatrix.loc[int(gene), 'Final_mutation_type'] = mutType
            if mutType=='Act' or mutType=='LoF':
                summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), mutType+'_freq']
                summaryMatrix.loc[int(gene), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene), mutType+'_freq'] - summaryMatrix.loc[int(gene), 'PAM_freq']
            else:
                summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), 'CNA_freq']
            summaryMatrix.loc[int(gene), 'Genes_in_locus'] = len(combinedLoci[locus])
    # Filter with maximum final frequency filter
    else:
        maxFF = max([summaryMatrix.loc[int(mut.split('_')[0]), mut.split('_')[1]+'_freq'] for mut in combinedLoci[locus]])
        gl1 = len([mut for mut in combinedLoci[locus] if summaryMatrix.loc[int(mut.split('_')[0]), mut.split('_')[1]+'_freq']==maxFF])
        for mut in combinedLoci[locus]:
            gene, mutType = mut.split('_')
            if summaryMatrix.loc[int(gene), mutType+'_freq']==maxFF:
                keepLofAct1.append(mut)
                summaryMatrix.loc[int(gene), 'Genes_in_locus'] = gl1
                summaryMatrix.loc[int(gene), 'Final_mutation_type'] = mutType
                if mutType=='Act' or mutType=='LoF':
                    summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), mutType+'_freq']
                    summaryMatrix.loc[int(gene), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene), mutType+'_freq'] - summaryMatrix.loc[int(gene), 'PAM_freq']
                else:
                    summaryMatrix.loc[int(gene), 'Final_freq'] = summaryMatrix.loc[int(gene), 'CNA_freq']

keepLofAct = list(set(keepLofAct1))

# Screen out PAMs that are LoF/Act
newKeepPAM = []
for pam1 in keepPAM:
    found = 0
    tmp1 = pam1.split('_')[0]
    for lofAct in keepLofAct:
        if tmp1==lofAct.split('_')[0]:
            found = 1
    if found==0:
        newKeepPAM.append(pam1)
        summaryMatrix.loc[int(tmp1), 'Final_mutation_type'] = 'PAM'
        summaryMatrix.loc[int(tmp1), 'Final_freq'] = summaryMatrix.loc[int(tmp1), 'PAM_freq']
        summaryMatrix.loc[int(tmp1), 'Delta_over_PAM'] = float(0)

## Screen out loci that have a representative gene
# Mutations that are at or above minimum mutation frequency cutoff
highFreqLoci = lociCNA.loc[lociCNA.mean(axis=1)>=params['min_mut_freq']]

# Figure out what loci are explained by current Act or LoF genes
explainedLoc = []
for locus1 in highFreqLoci.index:
    genesInLocus = [i for i in lociCNAgenes[locus1] if (str(i)+'_Act' in keepLofAct or str(i)+'_LoF' in keepLofAct)]
    if len(genesInLocus)>0:
        explainedLoc.append(locus1.split('_')[0])

# Screen out all other loci in that region
keepLoc = []
for locus1 in highFreqLoci.index:
    if not locus1.split('_')[0] in explainedLoc:
        keepLoc.append(locus1)

keepLoc_dict = {}
for locus1 in set(['_'.join([i.split('_')[0],i.split('_')[-1]]) for i in keepLoc]):
    locus2 = locus1.split('_')[0]
    if locus2 in ampLoci.keys():
        keepLoc_dict[locus1] = posD1.loc[ampLoci[locus2]]
    if locus2 in delLoci.keys():
        keepLoc_dict[locus1] = negD1.loc[delLoci[locus2]]

def mode2(pat1col):
    tmpser = pat1col.mode()
    if len(tmpser)>1:
        tmp10 = 1
    else:
        tmp10 = tmpser.iloc[0]
    return tmp10

# Use most common value to determine the mutational profile of a loci
keepLoc_df = pd.DataFrame(columns = d1.columns)
tmpMode = pd.Series(index = d1.columns)
for locus1 in keepLoc_dict.keys():
    for pat1 in keepLoc_dict[locus1].columns:
         tmpMode.loc[pat1] = mode2(keepLoc_dict[locus1][pat1])
    if tmpMode.mean()>=0.05:
        keepLoc_df.loc[locus1] = tmpMode

keepLoc_df = keepLoc_df.applymap(int)

####################################
## Compile OncoMerge output files ##
####################################
finalMutFile = pd.concat([pd.DataFrame(keepers).transpose().loc[newKeepPAM].sort_index(), pd.DataFrame(keepers).transpose().loc[keepLofAct].sort_index(), keepLoc_df.sort_index()], sort=True)

# Rename all loci with only one gene
ind_list = finalMutFile.index.tolist()
for locus1 in keepLoc_df.index:
    splitUp = locus1.split('_')
    if splitUp[-1]=='CNAamp' and len(ampLoci[splitUp[0]])==1:
        idx = ind_list.index(locus1)
        ind_list[idx] = str(ampLoci[splitUp[0]][0]) + '_CNAamp'
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAamp'
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']

    if splitUp[-1]=='CNAdel' and len(delLoci[splitUp[0]])==1:
        idx = ind_list.index(locus1)
        ind_list[idx] = str(delLoci[splitUp[0]][0]) + '_CNAdel'
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAdel'
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']

finalMutFile.index = ind_list

# Write out final mutation matrix to csv file
finalMutFile.to_csv(params['output_path']+'/oncoMerge_mergedMuts.csv')

# Write out summary matrix to csv file
summaryMatrix.to_csv(params['output_path']+'/oncoMerge_summaryMatrix.csv')

## Write out loci information
# Prepare for writing out
writeLoci = ['Locus_name,Genes']
for locus1 in keepLoc_df.index:
    splitUp = locus1.split('_')
    if splitUp[-1]=='CNAamp':
        writeLoci.append(locus1+','+' '.join([str(i) for i in ampLoci[splitUp[0]]]))
    if splitUp[-1]=='CNAdel':
        writeLoci.append(locus1+','+' '.join([str(i) for i in delLoci[splitUp[0]]]))

# Write out loci file
with open(params['output_path']+'/oncoMerge_CNA_loci.csv','w') as outFile:
    outFile.write('\n'.join(writeLoci))

## Write out information for hypergeometric analysis
# Can be used to load in gene information from final mutation file:
#    [[int(j.split('_')[0]),j.split('_')[1]] for j in finalMutFile.index[[not i for i in list(finalMutFile.index.str.contains('p|q'))]]]

# Write out background information for hypergeometric analysis
backgrounds = {i:[int(j) for j in backgrounds[i]] for i in backgrounds}
with open(params['output_path']+'/background.json', 'w') as outFile:
    json.dump(backgrounds, outFile)

print('Done.')