Skip to content

Instantly share code, notes, and snippets.

@fciamponi
Created May 31, 2021 13:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fciamponi/ef8aae813575a4947c1f9cb87a1c7330 to your computer and use it in GitHub Desktop.
Save fciamponi/ef8aae813575a4947c1f9cb87a1c7330 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
# In[768]:
import pandas as pd
import numpy as np
import glob
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import *
import scipy.stats as stats
# In[491]:
def tidy_split(df, column, sep='|', keep=False):
"""
Split the values of a column and expand so the new DataFrame has one split
value per row. Filters rows where the column is missing.
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as it's own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
"""
indexes = list()
new_values = list()
df = df.dropna(subset=[column])
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
# In[319]:
def show_values_on_bars(axs, h_v="v", space=0.4, fs=10, t=int, dc=2, r=0):
def _show_on_single_plot(ax):
if h_v == "v":
for p in ax.patches:
_x = p.get_x() + p.get_width() / 2
_y = p.get_y() + p.get_height()
if np.isnan(p.get_height()) == True:
value = 0
_y = 0
else:
if t == float:
value = np.round(t(p.get_height()),dc)
else:
value = t(p.get_height())
ax.text(_x, _y, value, ha="center", fontsize=fs, rotation=r)
elif h_v == "h":
for p in ax.patches:
_x = p.get_x() + p.get_width() + float(space)
_y = p.get_y() + p.get_height()
if t == float:
value = np.round(t(p.get_width()),dc)
else:
value = t(p.get_width())
ax.text(_x, _y, value, ha="left", rotation=r, fontsize=fs)
if isinstance(axs, np.ndarray):
for idx, ax in np.ndenumerate(axs):
_show_on_single_plot(ax)
else:
_show_on_single_plot(axs)
# ### Pre-processing
# In[778]:
samps = pd.read_csv('./sample.list.txt', names=['SAMPLES','LINE'], sep='\t')
samps['COND'] = samps['SAMPLES'].apply(lambda x: '-'.join(x.split('-')[1:]))
samps_groups = samps.groupby('COND')['SAMPLES'].agg(lambda x: ','.join(x)).reset_index()
samps_groups['COND'] = ['Hs578T_1uM_MS023_2d',
'Hs578T_1uM_MS023_5d',
'MDAMB468_1uM_MS023_2d',
'MDAMB468_1uM_MS023_5d',
'Hs578T_DMSO',
'MDAMB468_DMSO',
'Hs578T_shPRMT1_DOX',
'Hs578T_shPRMT1_NoDOX']
samps_pair = list(itertools.combinations(samps_groups['SAMPLES'],2))
conds_pair = list(itertools.combinations(samps_groups['COND'],2))
d1 = pd.DataFrame(samps_pair, columns=['SAMPLE_A','SAMPLE_B'])
d2 = pd.DataFrame(conds_pair, columns=['COND_A','COND_B'])
dd = pd.concat([d1,d2], axis=1)
dd['COMP'] = dd['COND_B']+'_vs_'+dd['COND_A']
dd['CELL_A'] = dd['COND_A'].apply(lambda x: x.split('_')[0])
dd['CELL_B'] = dd['COND_B'].apply(lambda x: x.split('_')[0])
dd_c = dd[dd['CELL_A'] == dd['CELL_B']]
# In[780]:
dd_pass = dd_c[dd_c['COND_A'].str.contains('DMSO')]
dd_fix = dd_c[dd_c['COND_B'].str.contains('DMSO')]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd.columns]
dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_f = pd.concat([dd2,dd3])
dd_f.to_csv('./comb.pair.treated.vs.control.txt',sep=';',header=False,index=False)
# ### Generate Hs vs MDAM comparissons
# In[781]:
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) &
(dd['COND_A'].str.contains('2d|5d')) &
(dd['COND_B'].str.contains('2d|5d'))]
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))]
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
# dd_fix = dd_fix[dd.columns]
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_fc = pd.concat([dd2])
dd_fc.to_csv('./comb.pair.hs578t.vs.mdamb468.txt',sep=';',header=False,index=False)
# In[1122]:
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) &
(dd['COND_A'].str.contains('DMSO')) &
(dd['COND_B'].str.contains('DMSO'))]
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))]
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
# dd_fix = dd_fix[dd.columns]
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_fd = pd.concat([dd2])
dd_fd.to_csv('./comb.pair.hs578t.vs.mdamb468.DMSO.txt',sep=';',header=False,index=False)
# In[1123]:
dd_fd
# ### Vast-diff parsing
# In[323]:
annot = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_EVENTS_Hsa108_hg38_ALL.tab.gz',
sep='\t')
pcr = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PCR_VALIDATIONS_Hsa108_hg19.tab.gz',
sep='\t')
prt = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_IMPACT_Hsa108_hg19_ALL.tab.gz',
sep='\t').rename(columns={'EventID':'EVENT'})
pfam = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_PFAM_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EventID':'EVENT'})
# In[324]:
pdb = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PDB_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EVENT_ID':'EVENT'})
phy = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PHYRE_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EVENT_ID':'EVENT'})
# In[403]:
print(annot.shape)
print(pcr.shape)
print(prt.shape)
# In[1124]:
for i in range(len(dd_f)):
comp = str(dd_f.iloc[i]['COMP'])
ca = str(dd_f.iloc[i]['COND_A'])
cb = str(dd_f.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[1125]:
for i in range(len(dd_fc)):
comp = str(dd_fc.iloc[i]['COMP'])
ca = str(dd_fc.iloc[i]['COND_A'])
cb = str(dd_fc.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[ ]:
# In[1157]:
for i in range(len(dd_fd)):
comp = str(dd_fd.iloc[i]['COMP'])
ca = str(dd_fd.iloc[i]['COND_A'])
cb = str(dd_fd.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[ ]:
# In[ ]:
# In[1129]:
my_evs = ['HsaINT0022568','HsaEX0011350','HsaINT0030497','HsaEX0051139',
'HsaEX0021497','HsaINT0013004','HsaINT0077022','HsaEX0055840']
dd = []
for i in range(len(dd_f[0:4])):
comp = str(dd_f.iloc[i]['COMP'])
ca = str(dd_f.iloc[i]['COND_A'])
cb = str(dd_f.iloc[i]['COND_B'])
df = pd.read_excel('./vast_diff_results/'+comp+'.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
sl = df[df['EVENT'].isin(my_evs)]
sl = sl[['GENE','EVENT','E[dPsi]']]
sl['ID'] = sl['GENE']+' - '+sl['EVENT']
sl['LINE'] = cb.split('_')[0]
sl['TIME'] = cb.split('_')[-1]
dd.append(sl)
# In[692]:
plt.figure(figsize=(6,4))
g = sns.catplot(x='TIME', y='E[dPsi]',
hue='LINE',col='ID',
kind='bar',
palette=['white','lightgrey'],
linewidth=1,
edgecolor='black',
dodge=True,
data=pd.concat(dd),
height=4, aspect=0.7,
col_wrap=4)
plt.savefig('./vast_diff_results/venn4.catplot.pdf', dpi=300, bbox_inches='tight')
plt.savefig('./vast_diff_results/venn4.catplot.svg', dpi=300, bbox_inches='tight')
plt.show()
# In[337]:
plt.figure(figsize=(6,4))
g = sns.barplot(x='ID',
y='E[dPsi]',
hue='TIME',
ci='sd',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=pd.concat(dd).sort_values('E[dPsi]', ascending=True))
#show_values_on_bars(g, t=float)
g.set_ylabel('Mean dPSI', size=12)
g.set_yticks([-0.4,-0.2,0,0.2,0.4])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Alternative splicing events', size=12)
g.set_xticklabels(pd.concat(dd).sort_values('E[dPsi]', ascending=True)['ID'].unique(),
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.show()
# In[160]:
sl = df[df['EVENT'].isin(my_evs)][['GENE','EVENT','ONTO']]
# In[161]:
sl['org'] = 'org'
# In[169]:
slp = sl.merge(pfam, how='outer')#.merge(phy, how='outer')#.merge(phy, how='outer')
slp = slp[slp['org'] == 'org']
slp.drop('org', axis=1, inplace=True)
# In[170]:
slp
# ### Find overlapping events in HS line
# In[710]:
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_hs578 = set(d2_sig['EVENT'])
d5_hs578 = set(d5_sig['EVENT'])
plt.figure(figsize=(3,4))
v = venn2_unweighted([d2_hs578,d5_hs578], set_labels=['2 days of\ntreatment','5 days of\ntreatment'])
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(12)
for text in v.subset_labels:
text.set_fontsize(10)
for i in ['11','10','01']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
plt.title('HS578T - MS023 1uM\nAlternative splicing events',size=12,y=1.0)
plt.savefig('./hs587t_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./hs587t_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[711]:
hs_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index())
hs_ov['MEAN'] = hs_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
hs_ov['STD'] = hs_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
hs_ov = hs_ov[abs(hs_ov['MEAN']) >= hs_ov['STD']]
hs_ov = hs_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
hs_ov['ONTO'] = hs_ov['ONTO'].astype(str)
hs_ov['ONTO_SHORT'] = hs_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
hs_ov['INC'] = hs_ov.apply(lambda x:
"Exclusion" if x['MEAN'] < 0 else
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown')
hs_ov['TYPE'] = hs_ov.apply(lambda x:
"Exon" if 'HsaEX' in x['EVENT'] else
"Intron" if 'HsaINT' in x['EVENT'] else
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else
'Not ASE', axis=1).replace('nan','Unknown')
hs_ov['LABEL'] = hs_ov['TYPE']+';'+hs_ov['INC']
hs_res = pd.DataFrame(hs_ov['ONTO_SHORT'].value_counts()).reset_index()
hs_res.columns = ['Protein impact','Counts']
hs_res['%Impact'] = hs_res.apply(lambda x: np.round(x['Counts']/np.sum(hs_res['Counts']),3)*100, axis=1)
hs_res['Line'] = 'HS578T'
hs_res2 = pd.DataFrame(hs_ov['LABEL'].value_counts()).reset_index()
hs_res2.columns = ['Event label','Counts']
hs_res2['%Event'] = hs_res2.apply(lambda x: np.round(x['Counts']/np.sum(hs_res2['Counts']),3)*100, axis=1)
hs_res2['Line'] = 'HS578T'
# In[712]:
hs_ov['GENE'].tolist()
# In[741]:
hs_ov[hs_ov['GENE'].str.contains("PDK")]
# In[742]:
hs_ov[hs_ov['GENE'].isin(['COL1A1','IL6','ITGA6','PDK1','PPP2R1B','RAF1','BCL2L1','SGK1','MAGI1'])]
# In[714]:
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_mdamb = set(d2_sig['EVENT'])
d5_mdamb = set(d5_sig['EVENT'])
plt.figure(figsize=(3,4))
v = venn2_unweighted([d2_mdamb,d5_mdamb], set_labels=['2 days of\ntreatment','5 days of\ntreatment'])
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(12)
for text in v.subset_labels:
text.set_fontsize(10)
for i in ['11','10','01']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
plt.title('MDAMB468 - MS023 1uM\nAlternative splicing events',size=12,y=1.0)
plt.savefig('./mdamb468_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./mdamb468_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[715]:
md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index())
md_ov['MEAN'] = md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
md_ov['STD'] = md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
md_ov = md_ov[abs(md_ov['MEAN']) >= md_ov['STD']]
md_ov = md_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
md_ov['ONTO'] = md_ov['ONTO'].astype(str)
md_ov['ONTO_SHORT'] = md_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
"ncRNA" if ('NonCoding' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
md_ov['INC'] = md_ov.apply(lambda x:
"Exclusion" if x['MEAN'] < 0 else
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown')
md_ov['TYPE'] = md_ov.apply(lambda x:
"Exon" if 'HsaEX' in x['EVENT'] else
"Intron" if 'HsaINT' in x['EVENT'] else
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else
'Not ASE', axis=1).replace('nan','Unknown')
md_ov['LABEL'] = md_ov['TYPE']+';'+md_ov['INC']
md_res = pd.DataFrame(md_ov['ONTO_SHORT'].value_counts()).reset_index()
md_res.columns = ['Protein impact','Counts']
md_res['%Impact'] = md_res.apply(lambda x: np.round(x['Counts']/np.sum(md_res['Counts']),3)*100, axis=1)
md_res['Line'] = 'MDAMB468'
md_res2 = pd.DataFrame(md_ov['LABEL'].value_counts()).reset_index()
md_res2.columns = ['Event label','Counts']
md_res2['%Event'] = md_res2.apply(lambda x: np.round(x['Counts']/np.sum(md_res2['Counts']),3)*100, axis=1)
md_res2['Line'] = 'MDAMB468'
# In[716]:
md_ov['GENE'].tolist()
# In[717]:
md_ov[md_ov['GENE'].isin(['ATF2','EFNA1','KITLG','SYK'])]
# In[ ]:
# In[718]:
res = pd.concat([hs_res,md_res])
plt.figure(figsize=(6,4))
g = sns.barplot(x='Protein impact',
y='%Impact',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res,
order=['Alternative protein isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'])
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Predicted protein impact', size=12)
g.set_xticklabels(['Alt. isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight')
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[719]:
res2 = pd.concat([hs_res2,md_res2])
res2['Region'] = res2['Event label'].apply(lambda x: x.split(";")[0])
res2['Level'] = res2['Event label'].apply(lambda x: x.split(";")[1])
# In[727]:
res2['Event label'].unique()
# In[739]:
plt.figure(figsize=(6,4))
g = sns.barplot(x='Event label',
y='%Event',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res2,
order=['Intron;Exclusion', 'Intron;Inclusion',
'Exon;Exclusion', 'Exon;Inclusion',
"Alt. 5'SS;Exclusion", "Alt. 5'SS;Inclusion",
"Alt. 3'SS;Exclusion", "Alt. 3'SS;Inclusion"]
)
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50,60,70])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Type of ASE', size=12)
g.set_xticklabels(['Intron exclusion', 'Intron inclusion',
'Exon exclusion', 'Exon inclusion',
"Alt. 5'SS exclusion", "Alt. 5'SS inclusion",
"Alt. 3'SS exclusion", "Alt. 3'SS inclusion"],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.legend(loc='upper right')
plt.savefig('./inclusion_profile.svg', dpi=300, bbox_inches='tight')
plt.savefig('./inclusion_profile.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[657]:
res = pd.concat([hs_res,md_res])
plt.figure(figsize=(6,4))
g = sns.barplot(x='Protein impact',
y='%Impact',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res,
order=['Alternative protein isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'])
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Predicted protein impact', size=12)
g.set_xticklabels(['Alt. isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight')
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[ ]:
writer = pd.ExcelWriter('./overlap.day2.day5.ASEs.xlsx',
engine='xlsxwriter')
hs_ov.to_excel(writer, sheet_name='HS578T', index=False)
md_ov.to_excel(writer, sheet_name='MDAMB468', index=False)
writer.save()
# ### Hs578T vs MDAMB468
# In[1167]:
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.txt',sep='\t')
dm = pd.read_excel('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
dm_sig = dm[(dm['EVENT'].isin(psi['EVENT'])) &
(abs(dm['E[dPsi]']) >= dm['MV[dPsi]_at_0.95']) &
(dm['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(dm['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_hs578 = set(d2_sig['EVENT'])
d5_hs578 = set(d5_sig['EVENT'])
dm_hs578 = set(dm_sig['EVENT'])
plt.figure(figsize=(6,6))
v = venn3_unweighted([d2_hs578,d5_hs578, dm_hs578],
set_labels=['2 days of\ntreatment','5 days of\ntreatment','DMSO control'])
c = venn3_circles(subsets=(1, 1, 1, 1 , 1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(20)
for text in v.subset_labels:
text.set_fontsize(18)
for i in ['111','110','101','011','010','001','100']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
c[1].set_ls('dotted')
plt.title('HS578T vs. MDAMB568\nAlternative splicing events',size=20,y=1.1)
plt.savefig('./hs587t_vs_mdamb_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./hs587t_vs_mdamb_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[1248]:
hs_md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()).merge(
dm_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_dm').reset_index())
hs_md_ov['MEAN'] = hs_md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1)
hs_md_ov['STD'] = hs_md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1)
hs_md_ov = hs_md_ov[abs(hs_md_ov['MEAN']) >= hs_md_ov['STD']]
hs_md_ov = hs_md_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
hs_md_ov['ONTO'] = hs_md_ov['ONTO'].astype(str)
hs_md_ov['ONTO_SHORT'] = hs_md_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
"ncRNA" if ('NonCoding' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
# In[1362]:
sl4 = hs_md_ov[abs(hs_md_ov['MEAN']) >= 0.5]
sl4 = sl4[sl4['GENE'] != 'DST']
sl4['ID'] = sl4['GENE']+'='+sl4['EVENT']
# In[1275]:
hs_md_ov.to_excel('./overlap.HS578_vs_MDAMB468.ASEs.xlsx')
# In[1363]:
samps2 = tidy_split(samps_groups, 'SAMPLES', sep=',')
samps2['REP'] = ['_R1','_R2','_R3']*len(samps2['COND'].unique())
samps2['SAMP_ID'] = samps2['COND']+samps2['REP']
# In[1255]:
vast_psi = pd.read_csv('./INCLUSION_LEVELS_FULL-Hsa24-hg38.tab',sep='\t')
# In[1364]:
cols = pd.DataFrame(vast_psi.columns)
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))]
myEvs = sl4.sort_values("MEAN", ascending=False)['EVENT']
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'})
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T
sl3 = myPsi_ID[[
'Hs578T_1uM_MS023_2d_R1',
'Hs578T_1uM_MS023_2d_R2',
'Hs578T_1uM_MS023_2d_R3',
'Hs578T_1uM_MS023_5d_R1',
'Hs578T_1uM_MS023_5d_R2',
'Hs578T_1uM_MS023_5d_R3',
'Hs578T_shPRMT1_DOX_R1',
'Hs578T_shPRMT1_DOX_R2',
'Hs578T_shPRMT1_DOX_R3',
'Hs578T_shPRMT1_NoDOX_R1',
'Hs578T_shPRMT1_NoDOX_R2',
'Hs578T_shPRMT1_NoDOX_R3',
'Hs578T_DMSO_R1',
'Hs578T_DMSO_R2',
'Hs578T_DMSO_R3',
'MDAMB468_1uM_MS023_2d_R1',
'MDAMB468_1uM_MS023_2d_R2',
'MDAMB468_1uM_MS023_2d_R3',
'MDAMB468_1uM_MS023_5d_R1',
'MDAMB468_1uM_MS023_5d_R2',
'MDAMB468_1uM_MS023_5d_R3',
'MDAMB468_DMSO_R1',
'MDAMB468_DMSO_R2',
'MDAMB468_DMSO_R3',
]]
sl5 = sl3.reset_index().merge(sl4[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1)
# In[1365]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (12,12))
sns.heatmap(sl5,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_MDAMB.h.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_MDAMB.h.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[1366]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,12))
sns.heatmap(sl5.T,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_MDAMB.v.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_MDAMB.v.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[1367]:
df_merged = hs_ov[['EVENT','GENE','MEAN']].rename(columns={'MEAN':'dPSI[Hs578T_MS023_1uM_vs_Hs578T_DMSO]'}
).merge(hs_md_ov[['EVENT','MEAN','ONTO']]
).rename(columns={'MEAN':'dPSI[Hs578T_vs_MDAMB468]'})
df_merged['ID'] = df_merged['GENE']+'='+df_merged['EVENT']
# In[1377]:
cols = pd.DataFrame(vast_psi.columns)
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))]
myEvs = df_merged['EVENT']#.sort_values("MEAN", ascending=False)['EVENT']
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'})
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T
sl3 = myPsi_ID[[
'Hs578T_1uM_MS023_2d_R1',
'Hs578T_1uM_MS023_2d_R2',
'Hs578T_1uM_MS023_2d_R3',
'Hs578T_1uM_MS023_5d_R1',
'Hs578T_1uM_MS023_5d_R2',
'Hs578T_1uM_MS023_5d_R3',
#'Hs578T_shPRMT1_DOX_R1',
#'Hs578T_shPRMT1_DOX_R2',
#'Hs578T_shPRMT1_DOX_R3',
#'Hs578T_shPRMT1_NoDOX_R1',
#'Hs578T_shPRMT1_NoDOX_R2',
#'Hs578T_shPRMT1_NoDOX_R3',
'Hs578T_DMSO_R1',
'Hs578T_DMSO_R2',
'Hs578T_DMSO_R3',
'MDAMB468_1uM_MS023_2d_R1',
'MDAMB468_1uM_MS023_2d_R2',
'MDAMB468_1uM_MS023_2d_R3',
'MDAMB468_1uM_MS023_5d_R1',
'MDAMB468_1uM_MS023_5d_R2',
'MDAMB468_1uM_MS023_5d_R3',
'MDAMB468_DMSO_R1',
'MDAMB468_DMSO_R2',
'MDAMB468_DMSO_R3',
]]
sl5 = sl3.reset_index().merge(df_merged[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1)
# In[1378]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (4,8))
sns.heatmap(sl5.T,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[ ]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (6,4))
sns.heatmap(sl5,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.pdf', dpi=300, bbox_inches='tight')
plt.show()
# ### HS578 vs MDAMB468 - VAST TIDY
# In[1318]:
psi_table = pd.read_table("INCLUSION_LEVELS_FULL-Hsa24-hg38.TIDY.MS023.tab")#.fillna(0)
cols_to_drop = psi_table.filter(regex='-Q').columns.tolist()
psi_table.drop(columns=cols_to_drop, inplace=True)
psi_table = psi_table.set_index('EVENT').T.reset_index().rename(columns={'index':'SAMPLES'}
).merge(samps2[['SAMPLES','SAMP_ID']]
).set_index('SAMP_ID').drop("SAMPLES",1).T.reset_index().rename(columns={'index':'EVENT'})
hs_treated = psi_table.filter(regex='Hs578').filter(regex='MS023').columns.tolist()
hs_dmso = psi_table.filter(regex='Hs578').filter(regex='DMSO').columns.tolist()
md_treated = psi_table.filter(regex='MDAMB').filter(regex='MS023').columns.tolist()
md_dmso = psi_table.filter(regex='MDAMB').filter(regex='DMSO').columns.tolist()
psi_table = psi_table[['EVENT']+hs_treated+hs_dmso+md_treated+md_dmso]
# #.drop(psi_table.filter(regex='-Q').columns.tolist(),1)
# In[1320]:
psi_table['Hs578_vs_MDAMB468_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated+hs_dmso].values,
x[md_treated+md_dmso].values)[1],
axis=1)
psi_table['Hs578_vs_MDAMB468_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[hs_treated+hs_dmso])-np.mean(x[md_treated+md_dmso]),2),
axis=1)
psi_table['Hs578_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated].values,
x[hs_dmso].values)[1],
axis=1)
psi_table['Hs578_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[hs_treated])-np.mean(x[hs_dmso]),2),
axis=1)
psi_table['MDAMB468_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[md_treated].values,
x[md_dmso].values)[1],
axis=1)
psi_table['MDAMB468_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[md_treated])-np.mean(x[md_dmso]),2),
axis=1)
# In[ ]:
# In[1322]:
hs578_vs_md468_hs = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 40) #&
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20)
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False)
hs578_vs_md468_all = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 20) #&
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20)
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False)
# In[1341]:
mytest = psi_table[((psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 10)) |
((psi_table['Hs578_MS023_vs_DMSO_pVal'] <= 0.05) &
(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) >= 10)) |
((psi_table['MDAMB468_MS023_vs_DMSO_pVal'] <= 0.05) &
(abs(psi_table['MDAMB468_MS023_vs_DMSO_dPSI']) >= 10))
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False).set_index('EVENT')
# In[1342]:
mytest.shape
# In[1343]:
mytest
# In[1338]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,10))
sns.heatmap(mytest.drop(['Hs578_vs_MDAMB468_pVal','Hs578_vs_MDAMB468_dPSI',
'Hs578_MS023_vs_DMSO_pVal','Hs578_MS023_vs_DMSO_dPSI',
'MDAMB468_MS023_vs_DMSO_pVal','MDAMB468_MS023_vs_DMSO_dPSI',],1),
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.show()
# In[ ]:
# In[ ]:
#!/usr/bin/env python
# coding: utf-8
# In[768]:
import pandas as pd
import numpy as np
import glob
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import *
import scipy.stats as stats
# In[491]:
def tidy_split(df, column, sep='|', keep=False):
"""
Split the values of a column and expand so the new DataFrame has one split
value per row. Filters rows where the column is missing.
Params
------
df : pandas.DataFrame
dataframe with the column to split and expand
column : str
the column to split and expand
sep : str
the string used to split the column's values
keep : bool
whether to retain the presplit value as it's own row
Returns
-------
pandas.DataFrame
Returns a dataframe with the same columns as `df`.
"""
indexes = list()
new_values = list()
df = df.dropna(subset=[column])
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
# In[319]:
def show_values_on_bars(axs, h_v="v", space=0.4, fs=10, t=int, dc=2, r=0):
def _show_on_single_plot(ax):
if h_v == "v":
for p in ax.patches:
_x = p.get_x() + p.get_width() / 2
_y = p.get_y() + p.get_height()
if np.isnan(p.get_height()) == True:
value = 0
_y = 0
else:
if t == float:
value = np.round(t(p.get_height()),dc)
else:
value = t(p.get_height())
ax.text(_x, _y, value, ha="center", fontsize=fs, rotation=r)
elif h_v == "h":
for p in ax.patches:
_x = p.get_x() + p.get_width() + float(space)
_y = p.get_y() + p.get_height()
if t == float:
value = np.round(t(p.get_width()),dc)
else:
value = t(p.get_width())
ax.text(_x, _y, value, ha="left", rotation=r, fontsize=fs)
if isinstance(axs, np.ndarray):
for idx, ax in np.ndenumerate(axs):
_show_on_single_plot(ax)
else:
_show_on_single_plot(axs)
# ### Pre-processing
# In[778]:
samps = pd.read_csv('./sample.list.txt', names=['SAMPLES','LINE'], sep='\t')
samps['COND'] = samps['SAMPLES'].apply(lambda x: '-'.join(x.split('-')[1:]))
samps_groups = samps.groupby('COND')['SAMPLES'].agg(lambda x: ','.join(x)).reset_index()
samps_groups['COND'] = ['Hs578T_1uM_MS023_2d',
'Hs578T_1uM_MS023_5d',
'MDAMB468_1uM_MS023_2d',
'MDAMB468_1uM_MS023_5d',
'Hs578T_DMSO',
'MDAMB468_DMSO',
'Hs578T_shPRMT1_DOX',
'Hs578T_shPRMT1_NoDOX']
samps_pair = list(itertools.combinations(samps_groups['SAMPLES'],2))
conds_pair = list(itertools.combinations(samps_groups['COND'],2))
d1 = pd.DataFrame(samps_pair, columns=['SAMPLE_A','SAMPLE_B'])
d2 = pd.DataFrame(conds_pair, columns=['COND_A','COND_B'])
dd = pd.concat([d1,d2], axis=1)
dd['COMP'] = dd['COND_B']+'_vs_'+dd['COND_A']
dd['CELL_A'] = dd['COND_A'].apply(lambda x: x.split('_')[0])
dd['CELL_B'] = dd['COND_B'].apply(lambda x: x.split('_')[0])
dd_c = dd[dd['CELL_A'] == dd['CELL_B']]
# In[780]:
dd_pass = dd_c[dd_c['COND_A'].str.contains('DMSO')]
dd_fix = dd_c[dd_c['COND_B'].str.contains('DMSO')]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd.columns]
dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_f = pd.concat([dd2,dd3])
dd_f.to_csv('./comb.pair.treated.vs.control.txt',sep=';',header=False,index=False)
# ### Generate Hs vs MDAM comparissons
# In[781]:
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) &
(dd['COND_A'].str.contains('2d|5d')) &
(dd['COND_B'].str.contains('2d|5d'))]
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))]
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
# dd_fix = dd_fix[dd.columns]
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_fc = pd.concat([dd2])
dd_fc.to_csv('./comb.pair.hs578t.vs.mdamb468.txt',sep=';',header=False,index=False)
# In[1122]:
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) &
(dd['COND_A'].str.contains('DMSO')) &
(dd['COND_B'].str.contains('DMSO'))]
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))]
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))]
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
dd_fix = dd_fix[dd_c.columns]
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))]
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')]
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')]
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A']
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A']
# dd_fix = dd_fix[dd.columns]
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1)
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))]
dd_fd = pd.concat([dd2])
dd_fd.to_csv('./comb.pair.hs578t.vs.mdamb468.DMSO.txt',sep=';',header=False,index=False)
# In[1123]:
dd_fd
# ### Vast-diff parsing
# In[323]:
annot = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_EVENTS_Hsa108_hg38_ALL.tab.gz',
sep='\t')
pcr = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PCR_VALIDATIONS_Hsa108_hg19.tab.gz',
sep='\t')
prt = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_IMPACT_Hsa108_hg19_ALL.tab.gz',
sep='\t').rename(columns={'EventID':'EVENT'})
pfam = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_PFAM_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EventID':'EVENT'})
# In[324]:
pdb = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PDB_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EVENT_ID':'EVENT'})
phy = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PHYRE_Hsa108_hg19.tab.gz',
sep='\t').rename(columns={'EVENT_ID':'EVENT'})
# In[403]:
print(annot.shape)
print(pcr.shape)
print(prt.shape)
# In[1124]:
for i in range(len(dd_f)):
comp = str(dd_f.iloc[i]['COMP'])
ca = str(dd_f.iloc[i]['COND_A'])
cb = str(dd_f.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[1125]:
for i in range(len(dd_fc)):
comp = str(dd_fc.iloc[i]['COMP'])
ca = str(dd_fc.iloc[i]['COND_A'])
cb = str(dd_fc.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[ ]:
# In[1157]:
for i in range(len(dd_fd)):
comp = str(dd_fd.iloc[i]['COMP'])
ca = str(dd_fd.iloc[i]['COND_A'])
cb = str(dd_fd.iloc[i]['COND_B'])
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t')
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95']
df['org'] = 'org'
df = df.merge(annot[['EVENT','COORD_o']],
how='outer').merge(prt,
how='outer').merge(pcr,
how='outer')
df = df[df['org'] == 'org']
df.drop('org', 1 ,inplace=True)
df['E[dPsi]'] = df['E[dPsi]']*-1
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t')
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(df['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) &
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) &
(df['MV[dPsi]_at_0.95'] >= 0.05) &
(abs(df['E[dPsi]']) >= 0.2)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx',
engine='xlsxwriter')
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False)
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False)
writer.save()
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False)
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False)
# In[ ]:
# In[ ]:
# In[1129]:
my_evs = ['HsaINT0022568','HsaEX0011350','HsaINT0030497','HsaEX0051139',
'HsaEX0021497','HsaINT0013004','HsaINT0077022','HsaEX0055840']
dd = []
for i in range(len(dd_f[0:4])):
comp = str(dd_f.iloc[i]['COMP'])
ca = str(dd_f.iloc[i]['COND_A'])
cb = str(dd_f.iloc[i]['COND_B'])
df = pd.read_excel('./vast_diff_results/'+comp+'.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
sl = df[df['EVENT'].isin(my_evs)]
sl = sl[['GENE','EVENT','E[dPsi]']]
sl['ID'] = sl['GENE']+' - '+sl['EVENT']
sl['LINE'] = cb.split('_')[0]
sl['TIME'] = cb.split('_')[-1]
dd.append(sl)
# In[692]:
plt.figure(figsize=(6,4))
g = sns.catplot(x='TIME', y='E[dPsi]',
hue='LINE',col='ID',
kind='bar',
palette=['white','lightgrey'],
linewidth=1,
edgecolor='black',
dodge=True,
data=pd.concat(dd),
height=4, aspect=0.7,
col_wrap=4)
plt.savefig('./vast_diff_results/venn4.catplot.pdf', dpi=300, bbox_inches='tight')
plt.savefig('./vast_diff_results/venn4.catplot.svg', dpi=300, bbox_inches='tight')
plt.show()
# In[337]:
plt.figure(figsize=(6,4))
g = sns.barplot(x='ID',
y='E[dPsi]',
hue='TIME',
ci='sd',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=pd.concat(dd).sort_values('E[dPsi]', ascending=True))
#show_values_on_bars(g, t=float)
g.set_ylabel('Mean dPSI', size=12)
g.set_yticks([-0.4,-0.2,0,0.2,0.4])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Alternative splicing events', size=12)
g.set_xticklabels(pd.concat(dd).sort_values('E[dPsi]', ascending=True)['ID'].unique(),
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.show()
# In[160]:
sl = df[df['EVENT'].isin(my_evs)][['GENE','EVENT','ONTO']]
# In[161]:
sl['org'] = 'org'
# In[169]:
slp = sl.merge(pfam, how='outer')#.merge(phy, how='outer')#.merge(phy, how='outer')
slp = slp[slp['org'] == 'org']
slp.drop('org', axis=1, inplace=True)
# In[170]:
slp
# ### Find overlapping events in HS line
# In[710]:
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_hs578 = set(d2_sig['EVENT'])
d5_hs578 = set(d5_sig['EVENT'])
plt.figure(figsize=(3,4))
v = venn2_unweighted([d2_hs578,d5_hs578], set_labels=['2 days of\ntreatment','5 days of\ntreatment'])
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(12)
for text in v.subset_labels:
text.set_fontsize(10)
for i in ['11','10','01']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
plt.title('HS578T - MS023 1uM\nAlternative splicing events',size=12,y=1.0)
plt.savefig('./hs587t_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./hs587t_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[711]:
hs_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index())
hs_ov['MEAN'] = hs_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
hs_ov['STD'] = hs_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
hs_ov = hs_ov[abs(hs_ov['MEAN']) >= hs_ov['STD']]
hs_ov = hs_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
hs_ov['ONTO'] = hs_ov['ONTO'].astype(str)
hs_ov['ONTO_SHORT'] = hs_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
hs_ov['INC'] = hs_ov.apply(lambda x:
"Exclusion" if x['MEAN'] < 0 else
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown')
hs_ov['TYPE'] = hs_ov.apply(lambda x:
"Exon" if 'HsaEX' in x['EVENT'] else
"Intron" if 'HsaINT' in x['EVENT'] else
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else
'Not ASE', axis=1).replace('nan','Unknown')
hs_ov['LABEL'] = hs_ov['TYPE']+';'+hs_ov['INC']
hs_res = pd.DataFrame(hs_ov['ONTO_SHORT'].value_counts()).reset_index()
hs_res.columns = ['Protein impact','Counts']
hs_res['%Impact'] = hs_res.apply(lambda x: np.round(x['Counts']/np.sum(hs_res['Counts']),3)*100, axis=1)
hs_res['Line'] = 'HS578T'
hs_res2 = pd.DataFrame(hs_ov['LABEL'].value_counts()).reset_index()
hs_res2.columns = ['Event label','Counts']
hs_res2['%Event'] = hs_res2.apply(lambda x: np.round(x['Counts']/np.sum(hs_res2['Counts']),3)*100, axis=1)
hs_res2['Line'] = 'HS578T'
# In[712]:
hs_ov['GENE'].tolist()
# In[741]:
hs_ov[hs_ov['GENE'].str.contains("PDK")]
# In[742]:
hs_ov[hs_ov['GENE'].isin(['COL1A1','IL6','ITGA6','PDK1','PPP2R1B','RAF1','BCL2L1','SGK1','MAGI1'])]
# In[714]:
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_mdamb = set(d2_sig['EVENT'])
d5_mdamb = set(d5_sig['EVENT'])
plt.figure(figsize=(3,4))
v = venn2_unweighted([d2_mdamb,d5_mdamb], set_labels=['2 days of\ntreatment','5 days of\ntreatment'])
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(12)
for text in v.subset_labels:
text.set_fontsize(10)
for i in ['11','10','01']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
plt.title('MDAMB468 - MS023 1uM\nAlternative splicing events',size=12,y=1.0)
plt.savefig('./mdamb468_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./mdamb468_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[715]:
md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index())
md_ov['MEAN'] = md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
md_ov['STD'] = md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1)
md_ov = md_ov[abs(md_ov['MEAN']) >= md_ov['STD']]
md_ov = md_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
md_ov['ONTO'] = md_ov['ONTO'].astype(str)
md_ov['ONTO_SHORT'] = md_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
"ncRNA" if ('NonCoding' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
md_ov['INC'] = md_ov.apply(lambda x:
"Exclusion" if x['MEAN'] < 0 else
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown')
md_ov['TYPE'] = md_ov.apply(lambda x:
"Exon" if 'HsaEX' in x['EVENT'] else
"Intron" if 'HsaINT' in x['EVENT'] else
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else
'Not ASE', axis=1).replace('nan','Unknown')
md_ov['LABEL'] = md_ov['TYPE']+';'+md_ov['INC']
md_res = pd.DataFrame(md_ov['ONTO_SHORT'].value_counts()).reset_index()
md_res.columns = ['Protein impact','Counts']
md_res['%Impact'] = md_res.apply(lambda x: np.round(x['Counts']/np.sum(md_res['Counts']),3)*100, axis=1)
md_res['Line'] = 'MDAMB468'
md_res2 = pd.DataFrame(md_ov['LABEL'].value_counts()).reset_index()
md_res2.columns = ['Event label','Counts']
md_res2['%Event'] = md_res2.apply(lambda x: np.round(x['Counts']/np.sum(md_res2['Counts']),3)*100, axis=1)
md_res2['Line'] = 'MDAMB468'
# In[716]:
md_ov['GENE'].tolist()
# In[717]:
md_ov[md_ov['GENE'].isin(['ATF2','EFNA1','KITLG','SYK'])]
# In[ ]:
# In[718]:
res = pd.concat([hs_res,md_res])
plt.figure(figsize=(6,4))
g = sns.barplot(x='Protein impact',
y='%Impact',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res,
order=['Alternative protein isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'])
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Predicted protein impact', size=12)
g.set_xticklabels(['Alt. isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight')
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[719]:
res2 = pd.concat([hs_res2,md_res2])
res2['Region'] = res2['Event label'].apply(lambda x: x.split(";")[0])
res2['Level'] = res2['Event label'].apply(lambda x: x.split(";")[1])
# In[727]:
res2['Event label'].unique()
# In[739]:
plt.figure(figsize=(6,4))
g = sns.barplot(x='Event label',
y='%Event',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res2,
order=['Intron;Exclusion', 'Intron;Inclusion',
'Exon;Exclusion', 'Exon;Inclusion',
"Alt. 5'SS;Exclusion", "Alt. 5'SS;Inclusion",
"Alt. 3'SS;Exclusion", "Alt. 3'SS;Inclusion"]
)
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50,60,70])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Type of ASE', size=12)
g.set_xticklabels(['Intron exclusion', 'Intron inclusion',
'Exon exclusion', 'Exon inclusion',
"Alt. 5'SS exclusion", "Alt. 5'SS inclusion",
"Alt. 3'SS exclusion", "Alt. 3'SS inclusion"],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.legend(loc='upper right')
plt.savefig('./inclusion_profile.svg', dpi=300, bbox_inches='tight')
plt.savefig('./inclusion_profile.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[657]:
res = pd.concat([hs_res,md_res])
plt.figure(figsize=(6,4))
g = sns.barplot(x='Protein impact',
y='%Impact',
hue='Line',
palette=['white','lightgrey'],
dodge=True,
linewidth=1,
edgecolor='black',
data=res,
order=['Alternative protein isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'])
show_values_on_bars(g, t=float, dc=1, fs=8, r=0)
g.set_ylabel('Ratio of ASEs (%)', size=12)
g.set_yticks([0,10,20,30,40,50])
g.set_yticklabels(g.get_yticks().astype(float), size=12)
sns.despine(trim=True, offset=5)
g.set_xlabel('Predicted protein impact', size=12)
g.set_xticklabels(['Alt. isoform','Protective','Deleterious',
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'],
size=12, rotation=30, ha='right', rotation_mode="anchor")
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight')
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[ ]:
writer = pd.ExcelWriter('./overlap.day2.day5.ASEs.xlsx',
engine='xlsxwriter')
hs_ov.to_excel(writer, sheet_name='HS578T', index=False)
md_ov.to_excel(writer, sheet_name='MDAMB468', index=False)
writer.save()
# ### Hs578T vs MDAMB468
# In[1167]:
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.txt',sep='\t')
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.ASEs.xlsx',
sheet_name='All_significant_ASEs',sep='\t')
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) &
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) &
(d2['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d2['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.txt',sep='\t')
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) &
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) &
(d5['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(d5['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
psi = pd.read_csv('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.txt',sep='\t')
dm = pd.read_excel('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.ASEs.xlsx',
sheet_name='All_significant_ASEs', sep='\t')
dm_sig = dm[(dm['EVENT'].isin(psi['EVENT'])) &
(abs(dm['E[dPsi]']) >= dm['MV[dPsi]_at_0.95']) &
(dm['MV[dPsi]_at_0.95'] >= 0.01) &
(abs(dm['E[dPsi]']) >= 0.1)
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1)
d2_hs578 = set(d2_sig['EVENT'])
d5_hs578 = set(d5_sig['EVENT'])
dm_hs578 = set(dm_sig['EVENT'])
plt.figure(figsize=(6,6))
v = venn3_unweighted([d2_hs578,d5_hs578, dm_hs578],
set_labels=['2 days of\ntreatment','5 days of\ntreatment','DMSO control'])
c = venn3_circles(subsets=(1, 1, 1, 1 , 1, 1, 1), linestyle='solid')
for text in v.set_labels:
text.set_fontsize(20)
for text in v.subset_labels:
text.set_fontsize(18)
for i in ['111','110','101','011','010','001','100']:
v.get_patch_by_id(i).set_alpha(0.0)
c[0].set_ls('dashed')
c[1].set_ls('dotted')
plt.title('HS578T vs. MDAMB568\nAlternative splicing events',size=20,y=1.1)
plt.savefig('./hs587t_vs_mdamb_overlap.svg',dpi=300,bbox_inches='tight')
plt.savefig('./hs587t_vs_mdamb_overlap.pdf',dpi=300,bbox_inches='tight')
plt.show()
# In[1248]:
hs_md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge(
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()).merge(
dm_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_dm').reset_index())
hs_md_ov['MEAN'] = hs_md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1)
hs_md_ov['STD'] = hs_md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1)
hs_md_ov = hs_md_ov[abs(hs_md_ov['MEAN']) >= hs_md_ov['STD']]
hs_md_ov = hs_md_ov.merge(d2_sig[['EVENT','GENE','ONTO']])
hs_md_ov['ONTO'] = hs_md_ov['ONTO'].astype(str)
hs_md_ov['ONTO_SHORT'] = hs_md_ov.apply(lambda x:
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else
"Alternative protein isoform" if ('isoform' in x['ONTO']) else
"Unknown" if ('description available' in x['ONTO']) else
"ncRNA" if ('NonCoding' in x['ONTO']) else
x['ONTO'], axis=1).replace('nan','Unknown')
# In[1362]:
sl4 = hs_md_ov[abs(hs_md_ov['MEAN']) >= 0.5]
sl4 = sl4[sl4['GENE'] != 'DST']
sl4['ID'] = sl4['GENE']+'='+sl4['EVENT']
# In[1275]:
hs_md_ov.to_excel('./overlap.HS578_vs_MDAMB468.ASEs.xlsx')
# In[1363]:
samps2 = tidy_split(samps_groups, 'SAMPLES', sep=',')
samps2['REP'] = ['_R1','_R2','_R3']*len(samps2['COND'].unique())
samps2['SAMP_ID'] = samps2['COND']+samps2['REP']
# In[1255]:
vast_psi = pd.read_csv('./INCLUSION_LEVELS_FULL-Hsa24-hg38.tab',sep='\t')
# In[1364]:
cols = pd.DataFrame(vast_psi.columns)
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))]
myEvs = sl4.sort_values("MEAN", ascending=False)['EVENT']
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'})
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T
sl3 = myPsi_ID[[
'Hs578T_1uM_MS023_2d_R1',
'Hs578T_1uM_MS023_2d_R2',
'Hs578T_1uM_MS023_2d_R3',
'Hs578T_1uM_MS023_5d_R1',
'Hs578T_1uM_MS023_5d_R2',
'Hs578T_1uM_MS023_5d_R3',
'Hs578T_shPRMT1_DOX_R1',
'Hs578T_shPRMT1_DOX_R2',
'Hs578T_shPRMT1_DOX_R3',
'Hs578T_shPRMT1_NoDOX_R1',
'Hs578T_shPRMT1_NoDOX_R2',
'Hs578T_shPRMT1_NoDOX_R3',
'Hs578T_DMSO_R1',
'Hs578T_DMSO_R2',
'Hs578T_DMSO_R3',
'MDAMB468_1uM_MS023_2d_R1',
'MDAMB468_1uM_MS023_2d_R2',
'MDAMB468_1uM_MS023_2d_R3',
'MDAMB468_1uM_MS023_5d_R1',
'MDAMB468_1uM_MS023_5d_R2',
'MDAMB468_1uM_MS023_5d_R3',
'MDAMB468_DMSO_R1',
'MDAMB468_DMSO_R2',
'MDAMB468_DMSO_R3',
]]
sl5 = sl3.reset_index().merge(sl4[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1)
# In[1365]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (12,12))
sns.heatmap(sl5,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_MDAMB.h.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_MDAMB.h.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[1366]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,12))
sns.heatmap(sl5.T,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_MDAMB.v.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_MDAMB.v.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[1367]:
df_merged = hs_ov[['EVENT','GENE','MEAN']].rename(columns={'MEAN':'dPSI[Hs578T_MS023_1uM_vs_Hs578T_DMSO]'}
).merge(hs_md_ov[['EVENT','MEAN','ONTO']]
).rename(columns={'MEAN':'dPSI[Hs578T_vs_MDAMB468]'})
df_merged['ID'] = df_merged['GENE']+'='+df_merged['EVENT']
# In[1377]:
cols = pd.DataFrame(vast_psi.columns)
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))]
myEvs = df_merged['EVENT']#.sort_values("MEAN", ascending=False)['EVENT']
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'})
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T
sl3 = myPsi_ID[[
'Hs578T_1uM_MS023_2d_R1',
'Hs578T_1uM_MS023_2d_R2',
'Hs578T_1uM_MS023_2d_R3',
'Hs578T_1uM_MS023_5d_R1',
'Hs578T_1uM_MS023_5d_R2',
'Hs578T_1uM_MS023_5d_R3',
#'Hs578T_shPRMT1_DOX_R1',
#'Hs578T_shPRMT1_DOX_R2',
#'Hs578T_shPRMT1_DOX_R3',
#'Hs578T_shPRMT1_NoDOX_R1',
#'Hs578T_shPRMT1_NoDOX_R2',
#'Hs578T_shPRMT1_NoDOX_R3',
'Hs578T_DMSO_R1',
'Hs578T_DMSO_R2',
'Hs578T_DMSO_R3',
'MDAMB468_1uM_MS023_2d_R1',
'MDAMB468_1uM_MS023_2d_R2',
'MDAMB468_1uM_MS023_2d_R3',
'MDAMB468_1uM_MS023_5d_R1',
'MDAMB468_1uM_MS023_5d_R2',
'MDAMB468_1uM_MS023_5d_R3',
'MDAMB468_DMSO_R1',
'MDAMB468_DMSO_R2',
'MDAMB468_DMSO_R3',
]]
sl5 = sl3.reset_index().merge(df_merged[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1)
# In[1378]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (4,8))
sns.heatmap(sl5.T,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.pdf', dpi=300, bbox_inches='tight')
plt.show()
# In[ ]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (6,4))
sns.heatmap(sl5,
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.svg', dpi=300, bbox_inches='tight')
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.pdf', dpi=300, bbox_inches='tight')
plt.show()
# ### HS578 vs MDAMB468 - VAST TIDY
# In[1318]:
psi_table = pd.read_table("INCLUSION_LEVELS_FULL-Hsa24-hg38.TIDY.MS023.tab")#.fillna(0)
cols_to_drop = psi_table.filter(regex='-Q').columns.tolist()
psi_table.drop(columns=cols_to_drop, inplace=True)
psi_table = psi_table.set_index('EVENT').T.reset_index().rename(columns={'index':'SAMPLES'}
).merge(samps2[['SAMPLES','SAMP_ID']]
).set_index('SAMP_ID').drop("SAMPLES",1).T.reset_index().rename(columns={'index':'EVENT'})
hs_treated = psi_table.filter(regex='Hs578').filter(regex='MS023').columns.tolist()
hs_dmso = psi_table.filter(regex='Hs578').filter(regex='DMSO').columns.tolist()
md_treated = psi_table.filter(regex='MDAMB').filter(regex='MS023').columns.tolist()
md_dmso = psi_table.filter(regex='MDAMB').filter(regex='DMSO').columns.tolist()
psi_table = psi_table[['EVENT']+hs_treated+hs_dmso+md_treated+md_dmso]
# #.drop(psi_table.filter(regex='-Q').columns.tolist(),1)
# In[1320]:
psi_table['Hs578_vs_MDAMB468_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated+hs_dmso].values,
x[md_treated+md_dmso].values)[1],
axis=1)
psi_table['Hs578_vs_MDAMB468_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[hs_treated+hs_dmso])-np.mean(x[md_treated+md_dmso]),2),
axis=1)
psi_table['Hs578_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated].values,
x[hs_dmso].values)[1],
axis=1)
psi_table['Hs578_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[hs_treated])-np.mean(x[hs_dmso]),2),
axis=1)
psi_table['MDAMB468_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[md_treated].values,
x[md_dmso].values)[1],
axis=1)
psi_table['MDAMB468_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round(
np.mean(x[md_treated])-np.mean(x[md_dmso]),2),
axis=1)
# In[ ]:
# In[1322]:
hs578_vs_md468_hs = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 40) #&
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20)
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False)
hs578_vs_md468_all = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 20) #&
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20)
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False)
# In[1341]:
mytest = psi_table[((psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) &
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 10)) |
((psi_table['Hs578_MS023_vs_DMSO_pVal'] <= 0.05) &
(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) >= 10)) |
((psi_table['MDAMB468_MS023_vs_DMSO_pVal'] <= 0.05) &
(abs(psi_table['MDAMB468_MS023_vs_DMSO_dPSI']) >= 10))
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False).set_index('EVENT')
# In[1342]:
mytest.shape
# In[1343]:
mytest
# In[1338]:
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3}
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,10))
sns.heatmap(mytest.drop(['Hs578_vs_MDAMB468_pVal','Hs578_vs_MDAMB468_dPSI',
'Hs578_MS023_vs_DMSO_pVal','Hs578_MS023_vs_DMSO_dPSI',
'MDAMB468_MS023_vs_DMSO_pVal','MDAMB468_MS023_vs_DMSO_dPSI',],1),
cmap="YlGnBu",
linewidths=1,
ax=ax,
cbar_ax=cbar_ax,
annot=True,
fmt='.0f',
cbar_kws={"orientation": "horizontal"})
plt.show()
# In[ ]:
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment