Created
May 31, 2021 13:10
-
-
Save fciamponi/ef8aae813575a4947c1f9cb87a1c7330 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[768]: | |
import pandas as pd | |
import numpy as np | |
import glob | |
import itertools | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from matplotlib_venn import * | |
import scipy.stats as stats | |
# In[491]: | |
def tidy_split(df, column, sep='|', keep=False): | |
""" | |
Split the values of a column and expand so the new DataFrame has one split | |
value per row. Filters rows where the column is missing. | |
Params | |
------ | |
df : pandas.DataFrame | |
dataframe with the column to split and expand | |
column : str | |
the column to split and expand | |
sep : str | |
the string used to split the column's values | |
keep : bool | |
whether to retain the presplit value as it's own row | |
Returns | |
------- | |
pandas.DataFrame | |
Returns a dataframe with the same columns as `df`. | |
""" | |
indexes = list() | |
new_values = list() | |
df = df.dropna(subset=[column]) | |
for i, presplit in enumerate(df[column].astype(str)): | |
values = presplit.split(sep) | |
if keep and len(values) > 1: | |
indexes.append(i) | |
new_values.append(presplit) | |
for value in values: | |
indexes.append(i) | |
new_values.append(value) | |
new_df = df.iloc[indexes, :].copy() | |
new_df[column] = new_values | |
return new_df | |
# In[319]: | |
def show_values_on_bars(axs, h_v="v", space=0.4, fs=10, t=int, dc=2, r=0): | |
def _show_on_single_plot(ax): | |
if h_v == "v": | |
for p in ax.patches: | |
_x = p.get_x() + p.get_width() / 2 | |
_y = p.get_y() + p.get_height() | |
if np.isnan(p.get_height()) == True: | |
value = 0 | |
_y = 0 | |
else: | |
if t == float: | |
value = np.round(t(p.get_height()),dc) | |
else: | |
value = t(p.get_height()) | |
ax.text(_x, _y, value, ha="center", fontsize=fs, rotation=r) | |
elif h_v == "h": | |
for p in ax.patches: | |
_x = p.get_x() + p.get_width() + float(space) | |
_y = p.get_y() + p.get_height() | |
if t == float: | |
value = np.round(t(p.get_width()),dc) | |
else: | |
value = t(p.get_width()) | |
ax.text(_x, _y, value, ha="left", rotation=r, fontsize=fs) | |
if isinstance(axs, np.ndarray): | |
for idx, ax in np.ndenumerate(axs): | |
_show_on_single_plot(ax) | |
else: | |
_show_on_single_plot(axs) | |
# ### Pre-processing | |
# In[778]: | |
samps = pd.read_csv('./sample.list.txt', names=['SAMPLES','LINE'], sep='\t') | |
samps['COND'] = samps['SAMPLES'].apply(lambda x: '-'.join(x.split('-')[1:])) | |
samps_groups = samps.groupby('COND')['SAMPLES'].agg(lambda x: ','.join(x)).reset_index() | |
samps_groups['COND'] = ['Hs578T_1uM_MS023_2d', | |
'Hs578T_1uM_MS023_5d', | |
'MDAMB468_1uM_MS023_2d', | |
'MDAMB468_1uM_MS023_5d', | |
'Hs578T_DMSO', | |
'MDAMB468_DMSO', | |
'Hs578T_shPRMT1_DOX', | |
'Hs578T_shPRMT1_NoDOX'] | |
samps_pair = list(itertools.combinations(samps_groups['SAMPLES'],2)) | |
conds_pair = list(itertools.combinations(samps_groups['COND'],2)) | |
d1 = pd.DataFrame(samps_pair, columns=['SAMPLE_A','SAMPLE_B']) | |
d2 = pd.DataFrame(conds_pair, columns=['COND_A','COND_B']) | |
dd = pd.concat([d1,d2], axis=1) | |
dd['COMP'] = dd['COND_B']+'_vs_'+dd['COND_A'] | |
dd['CELL_A'] = dd['COND_A'].apply(lambda x: x.split('_')[0]) | |
dd['CELL_B'] = dd['COND_B'].apply(lambda x: x.split('_')[0]) | |
dd_c = dd[dd['CELL_A'] == dd['CELL_B']] | |
# In[780]: | |
dd_pass = dd_c[dd_c['COND_A'].str.contains('DMSO')] | |
dd_fix = dd_c[dd_c['COND_B'].str.contains('DMSO')] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd.columns] | |
dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_f = pd.concat([dd2,dd3]) | |
dd_f.to_csv('./comb.pair.treated.vs.control.txt',sep=';',header=False,index=False) | |
# ### Generate Hs vs MDAM comparissons | |
# In[781]: | |
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) & | |
(dd['COND_A'].str.contains('2d|5d')) & | |
(dd['COND_B'].str.contains('2d|5d'))] | |
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))] | |
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
# dd_fix = dd_fix[dd.columns] | |
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_fc = pd.concat([dd2]) | |
dd_fc.to_csv('./comb.pair.hs578t.vs.mdamb468.txt',sep=';',header=False,index=False) | |
# In[1122]: | |
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) & | |
(dd['COND_A'].str.contains('DMSO')) & | |
(dd['COND_B'].str.contains('DMSO'))] | |
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))] | |
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
# dd_fix = dd_fix[dd.columns] | |
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_fd = pd.concat([dd2]) | |
dd_fd.to_csv('./comb.pair.hs578t.vs.mdamb468.DMSO.txt',sep=';',header=False,index=False) | |
# In[1123]: | |
dd_fd | |
# ### Vast-diff parsing | |
# In[323]: | |
annot = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_EVENTS_Hsa108_hg38_ALL.tab.gz', | |
sep='\t') | |
pcr = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PCR_VALIDATIONS_Hsa108_hg19.tab.gz', | |
sep='\t') | |
prt = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_IMPACT_Hsa108_hg19_ALL.tab.gz', | |
sep='\t').rename(columns={'EventID':'EVENT'}) | |
pfam = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_PFAM_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EventID':'EVENT'}) | |
# In[324]: | |
pdb = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PDB_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EVENT_ID':'EVENT'}) | |
phy = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PHYRE_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EVENT_ID':'EVENT'}) | |
# In[403]: | |
print(annot.shape) | |
print(pcr.shape) | |
print(prt.shape) | |
# In[1124]: | |
for i in range(len(dd_f)): | |
comp = str(dd_f.iloc[i]['COMP']) | |
ca = str(dd_f.iloc[i]['COND_A']) | |
cb = str(dd_f.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[1125]: | |
for i in range(len(dd_fc)): | |
comp = str(dd_fc.iloc[i]['COMP']) | |
ca = str(dd_fc.iloc[i]['COND_A']) | |
cb = str(dd_fc.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[ ]: | |
# In[1157]: | |
for i in range(len(dd_fd)): | |
comp = str(dd_fd.iloc[i]['COMP']) | |
ca = str(dd_fd.iloc[i]['COND_A']) | |
cb = str(dd_fd.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[ ]: | |
# In[ ]: | |
# In[1129]: | |
my_evs = ['HsaINT0022568','HsaEX0011350','HsaINT0030497','HsaEX0051139', | |
'HsaEX0021497','HsaINT0013004','HsaINT0077022','HsaEX0055840'] | |
dd = [] | |
for i in range(len(dd_f[0:4])): | |
comp = str(dd_f.iloc[i]['COMP']) | |
ca = str(dd_f.iloc[i]['COND_A']) | |
cb = str(dd_f.iloc[i]['COND_B']) | |
df = pd.read_excel('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
sl = df[df['EVENT'].isin(my_evs)] | |
sl = sl[['GENE','EVENT','E[dPsi]']] | |
sl['ID'] = sl['GENE']+' - '+sl['EVENT'] | |
sl['LINE'] = cb.split('_')[0] | |
sl['TIME'] = cb.split('_')[-1] | |
dd.append(sl) | |
# In[692]: | |
plt.figure(figsize=(6,4)) | |
g = sns.catplot(x='TIME', y='E[dPsi]', | |
hue='LINE',col='ID', | |
kind='bar', | |
palette=['white','lightgrey'], | |
linewidth=1, | |
edgecolor='black', | |
dodge=True, | |
data=pd.concat(dd), | |
height=4, aspect=0.7, | |
col_wrap=4) | |
plt.savefig('./vast_diff_results/venn4.catplot.pdf', dpi=300, bbox_inches='tight') | |
plt.savefig('./vast_diff_results/venn4.catplot.svg', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[337]: | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='ID', | |
y='E[dPsi]', | |
hue='TIME', | |
ci='sd', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=pd.concat(dd).sort_values('E[dPsi]', ascending=True)) | |
#show_values_on_bars(g, t=float) | |
g.set_ylabel('Mean dPSI', size=12) | |
g.set_yticks([-0.4,-0.2,0,0.2,0.4]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Alternative splicing events', size=12) | |
g.set_xticklabels(pd.concat(dd).sort_values('E[dPsi]', ascending=True)['ID'].unique(), | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.show() | |
# In[160]: | |
sl = df[df['EVENT'].isin(my_evs)][['GENE','EVENT','ONTO']] | |
# In[161]: | |
sl['org'] = 'org' | |
# In[169]: | |
slp = sl.merge(pfam, how='outer')#.merge(phy, how='outer')#.merge(phy, how='outer') | |
slp = slp[slp['org'] == 'org'] | |
slp.drop('org', axis=1, inplace=True) | |
# In[170]: | |
slp | |
# ### Find overlapping events in HS line | |
# In[710]: | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_hs578 = set(d2_sig['EVENT']) | |
d5_hs578 = set(d5_sig['EVENT']) | |
plt.figure(figsize=(3,4)) | |
v = venn2_unweighted([d2_hs578,d5_hs578], set_labels=['2 days of\ntreatment','5 days of\ntreatment']) | |
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(12) | |
for text in v.subset_labels: | |
text.set_fontsize(10) | |
for i in ['11','10','01']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
plt.title('HS578T - MS023 1uM\nAlternative splicing events',size=12,y=1.0) | |
plt.savefig('./hs587t_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./hs587t_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[711]: | |
hs_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()) | |
hs_ov['MEAN'] = hs_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
hs_ov['STD'] = hs_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
hs_ov = hs_ov[abs(hs_ov['MEAN']) >= hs_ov['STD']] | |
hs_ov = hs_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
hs_ov['ONTO'] = hs_ov['ONTO'].astype(str) | |
hs_ov['ONTO_SHORT'] = hs_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
hs_ov['INC'] = hs_ov.apply(lambda x: | |
"Exclusion" if x['MEAN'] < 0 else | |
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown') | |
hs_ov['TYPE'] = hs_ov.apply(lambda x: | |
"Exon" if 'HsaEX' in x['EVENT'] else | |
"Intron" if 'HsaINT' in x['EVENT'] else | |
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else | |
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else | |
'Not ASE', axis=1).replace('nan','Unknown') | |
hs_ov['LABEL'] = hs_ov['TYPE']+';'+hs_ov['INC'] | |
hs_res = pd.DataFrame(hs_ov['ONTO_SHORT'].value_counts()).reset_index() | |
hs_res.columns = ['Protein impact','Counts'] | |
hs_res['%Impact'] = hs_res.apply(lambda x: np.round(x['Counts']/np.sum(hs_res['Counts']),3)*100, axis=1) | |
hs_res['Line'] = 'HS578T' | |
hs_res2 = pd.DataFrame(hs_ov['LABEL'].value_counts()).reset_index() | |
hs_res2.columns = ['Event label','Counts'] | |
hs_res2['%Event'] = hs_res2.apply(lambda x: np.round(x['Counts']/np.sum(hs_res2['Counts']),3)*100, axis=1) | |
hs_res2['Line'] = 'HS578T' | |
# In[712]: | |
hs_ov['GENE'].tolist() | |
# In[741]: | |
hs_ov[hs_ov['GENE'].str.contains("PDK")] | |
# In[742]: | |
hs_ov[hs_ov['GENE'].isin(['COL1A1','IL6','ITGA6','PDK1','PPP2R1B','RAF1','BCL2L1','SGK1','MAGI1'])] | |
# In[714]: | |
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_mdamb = set(d2_sig['EVENT']) | |
d5_mdamb = set(d5_sig['EVENT']) | |
plt.figure(figsize=(3,4)) | |
v = venn2_unweighted([d2_mdamb,d5_mdamb], set_labels=['2 days of\ntreatment','5 days of\ntreatment']) | |
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(12) | |
for text in v.subset_labels: | |
text.set_fontsize(10) | |
for i in ['11','10','01']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
plt.title('MDAMB468 - MS023 1uM\nAlternative splicing events',size=12,y=1.0) | |
plt.savefig('./mdamb468_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./mdamb468_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[715]: | |
md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()) | |
md_ov['MEAN'] = md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
md_ov['STD'] = md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
md_ov = md_ov[abs(md_ov['MEAN']) >= md_ov['STD']] | |
md_ov = md_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
md_ov['ONTO'] = md_ov['ONTO'].astype(str) | |
md_ov['ONTO_SHORT'] = md_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
"ncRNA" if ('NonCoding' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
md_ov['INC'] = md_ov.apply(lambda x: | |
"Exclusion" if x['MEAN'] < 0 else | |
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown') | |
md_ov['TYPE'] = md_ov.apply(lambda x: | |
"Exon" if 'HsaEX' in x['EVENT'] else | |
"Intron" if 'HsaINT' in x['EVENT'] else | |
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else | |
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else | |
'Not ASE', axis=1).replace('nan','Unknown') | |
md_ov['LABEL'] = md_ov['TYPE']+';'+md_ov['INC'] | |
md_res = pd.DataFrame(md_ov['ONTO_SHORT'].value_counts()).reset_index() | |
md_res.columns = ['Protein impact','Counts'] | |
md_res['%Impact'] = md_res.apply(lambda x: np.round(x['Counts']/np.sum(md_res['Counts']),3)*100, axis=1) | |
md_res['Line'] = 'MDAMB468' | |
md_res2 = pd.DataFrame(md_ov['LABEL'].value_counts()).reset_index() | |
md_res2.columns = ['Event label','Counts'] | |
md_res2['%Event'] = md_res2.apply(lambda x: np.round(x['Counts']/np.sum(md_res2['Counts']),3)*100, axis=1) | |
md_res2['Line'] = 'MDAMB468' | |
# In[716]: | |
md_ov['GENE'].tolist() | |
# In[717]: | |
md_ov[md_ov['GENE'].isin(['ATF2','EFNA1','KITLG','SYK'])] | |
# In[ ]: | |
# In[718]: | |
res = pd.concat([hs_res,md_res]) | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Protein impact', | |
y='%Impact', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res, | |
order=['Alternative protein isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown']) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Predicted protein impact', size=12) | |
g.set_xticklabels(['Alt. isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[719]: | |
res2 = pd.concat([hs_res2,md_res2]) | |
res2['Region'] = res2['Event label'].apply(lambda x: x.split(";")[0]) | |
res2['Level'] = res2['Event label'].apply(lambda x: x.split(";")[1]) | |
# In[727]: | |
res2['Event label'].unique() | |
# In[739]: | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Event label', | |
y='%Event', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res2, | |
order=['Intron;Exclusion', 'Intron;Inclusion', | |
'Exon;Exclusion', 'Exon;Inclusion', | |
"Alt. 5'SS;Exclusion", "Alt. 5'SS;Inclusion", | |
"Alt. 3'SS;Exclusion", "Alt. 3'SS;Inclusion"] | |
) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50,60,70]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Type of ASE', size=12) | |
g.set_xticklabels(['Intron exclusion', 'Intron inclusion', | |
'Exon exclusion', 'Exon inclusion', | |
"Alt. 5'SS exclusion", "Alt. 5'SS inclusion", | |
"Alt. 3'SS exclusion", "Alt. 3'SS inclusion"], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.legend(loc='upper right') | |
plt.savefig('./inclusion_profile.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./inclusion_profile.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[657]: | |
res = pd.concat([hs_res,md_res]) | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Protein impact', | |
y='%Impact', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res, | |
order=['Alternative protein isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown']) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Predicted protein impact', size=12) | |
g.set_xticklabels(['Alt. isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[ ]: | |
writer = pd.ExcelWriter('./overlap.day2.day5.ASEs.xlsx', | |
engine='xlsxwriter') | |
hs_ov.to_excel(writer, sheet_name='HS578T', index=False) | |
md_ov.to_excel(writer, sheet_name='MDAMB468', index=False) | |
writer.save() | |
# ### Hs578T vs MDAMB468 | |
# In[1167]: | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.txt',sep='\t') | |
dm = pd.read_excel('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
dm_sig = dm[(dm['EVENT'].isin(psi['EVENT'])) & | |
(abs(dm['E[dPsi]']) >= dm['MV[dPsi]_at_0.95']) & | |
(dm['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(dm['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_hs578 = set(d2_sig['EVENT']) | |
d5_hs578 = set(d5_sig['EVENT']) | |
dm_hs578 = set(dm_sig['EVENT']) | |
plt.figure(figsize=(6,6)) | |
v = venn3_unweighted([d2_hs578,d5_hs578, dm_hs578], | |
set_labels=['2 days of\ntreatment','5 days of\ntreatment','DMSO control']) | |
c = venn3_circles(subsets=(1, 1, 1, 1 , 1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(20) | |
for text in v.subset_labels: | |
text.set_fontsize(18) | |
for i in ['111','110','101','011','010','001','100']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
c[1].set_ls('dotted') | |
plt.title('HS578T vs. MDAMB568\nAlternative splicing events',size=20,y=1.1) | |
plt.savefig('./hs587t_vs_mdamb_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./hs587t_vs_mdamb_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[1248]: | |
hs_md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()).merge( | |
dm_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_dm').reset_index()) | |
hs_md_ov['MEAN'] = hs_md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1) | |
hs_md_ov['STD'] = hs_md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1) | |
hs_md_ov = hs_md_ov[abs(hs_md_ov['MEAN']) >= hs_md_ov['STD']] | |
hs_md_ov = hs_md_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
hs_md_ov['ONTO'] = hs_md_ov['ONTO'].astype(str) | |
hs_md_ov['ONTO_SHORT'] = hs_md_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
"ncRNA" if ('NonCoding' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
# In[1362]: | |
sl4 = hs_md_ov[abs(hs_md_ov['MEAN']) >= 0.5] | |
sl4 = sl4[sl4['GENE'] != 'DST'] | |
sl4['ID'] = sl4['GENE']+'='+sl4['EVENT'] | |
# In[1275]: | |
hs_md_ov.to_excel('./overlap.HS578_vs_MDAMB468.ASEs.xlsx') | |
# In[1363]: | |
samps2 = tidy_split(samps_groups, 'SAMPLES', sep=',') | |
samps2['REP'] = ['_R1','_R2','_R3']*len(samps2['COND'].unique()) | |
samps2['SAMP_ID'] = samps2['COND']+samps2['REP'] | |
# In[1255]: | |
vast_psi = pd.read_csv('./INCLUSION_LEVELS_FULL-Hsa24-hg38.tab',sep='\t') | |
# In[1364]: | |
cols = pd.DataFrame(vast_psi.columns) | |
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))] | |
myEvs = sl4.sort_values("MEAN", ascending=False)['EVENT'] | |
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T | |
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'}) | |
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T | |
sl3 = myPsi_ID[[ | |
'Hs578T_1uM_MS023_2d_R1', | |
'Hs578T_1uM_MS023_2d_R2', | |
'Hs578T_1uM_MS023_2d_R3', | |
'Hs578T_1uM_MS023_5d_R1', | |
'Hs578T_1uM_MS023_5d_R2', | |
'Hs578T_1uM_MS023_5d_R3', | |
'Hs578T_shPRMT1_DOX_R1', | |
'Hs578T_shPRMT1_DOX_R2', | |
'Hs578T_shPRMT1_DOX_R3', | |
'Hs578T_shPRMT1_NoDOX_R1', | |
'Hs578T_shPRMT1_NoDOX_R2', | |
'Hs578T_shPRMT1_NoDOX_R3', | |
'Hs578T_DMSO_R1', | |
'Hs578T_DMSO_R2', | |
'Hs578T_DMSO_R3', | |
'MDAMB468_1uM_MS023_2d_R1', | |
'MDAMB468_1uM_MS023_2d_R2', | |
'MDAMB468_1uM_MS023_2d_R3', | |
'MDAMB468_1uM_MS023_5d_R1', | |
'MDAMB468_1uM_MS023_5d_R2', | |
'MDAMB468_1uM_MS023_5d_R3', | |
'MDAMB468_DMSO_R1', | |
'MDAMB468_DMSO_R2', | |
'MDAMB468_DMSO_R3', | |
]] | |
sl5 = sl3.reset_index().merge(sl4[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1) | |
# In[1365]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (12,12)) | |
sns.heatmap(sl5, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_MDAMB.h.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_MDAMB.h.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[1366]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,12)) | |
sns.heatmap(sl5.T, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_MDAMB.v.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_MDAMB.v.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[1367]: | |
df_merged = hs_ov[['EVENT','GENE','MEAN']].rename(columns={'MEAN':'dPSI[Hs578T_MS023_1uM_vs_Hs578T_DMSO]'} | |
).merge(hs_md_ov[['EVENT','MEAN','ONTO']] | |
).rename(columns={'MEAN':'dPSI[Hs578T_vs_MDAMB468]'}) | |
df_merged['ID'] = df_merged['GENE']+'='+df_merged['EVENT'] | |
# In[1377]: | |
cols = pd.DataFrame(vast_psi.columns) | |
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))] | |
myEvs = df_merged['EVENT']#.sort_values("MEAN", ascending=False)['EVENT'] | |
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T | |
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'}) | |
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T | |
sl3 = myPsi_ID[[ | |
'Hs578T_1uM_MS023_2d_R1', | |
'Hs578T_1uM_MS023_2d_R2', | |
'Hs578T_1uM_MS023_2d_R3', | |
'Hs578T_1uM_MS023_5d_R1', | |
'Hs578T_1uM_MS023_5d_R2', | |
'Hs578T_1uM_MS023_5d_R3', | |
#'Hs578T_shPRMT1_DOX_R1', | |
#'Hs578T_shPRMT1_DOX_R2', | |
#'Hs578T_shPRMT1_DOX_R3', | |
#'Hs578T_shPRMT1_NoDOX_R1', | |
#'Hs578T_shPRMT1_NoDOX_R2', | |
#'Hs578T_shPRMT1_NoDOX_R3', | |
'Hs578T_DMSO_R1', | |
'Hs578T_DMSO_R2', | |
'Hs578T_DMSO_R3', | |
'MDAMB468_1uM_MS023_2d_R1', | |
'MDAMB468_1uM_MS023_2d_R2', | |
'MDAMB468_1uM_MS023_2d_R3', | |
'MDAMB468_1uM_MS023_5d_R1', | |
'MDAMB468_1uM_MS023_5d_R2', | |
'MDAMB468_1uM_MS023_5d_R3', | |
'MDAMB468_DMSO_R1', | |
'MDAMB468_DMSO_R2', | |
'MDAMB468_DMSO_R3', | |
]] | |
sl5 = sl3.reset_index().merge(df_merged[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1) | |
# In[1378]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (4,8)) | |
sns.heatmap(sl5.T, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[ ]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (6,4)) | |
sns.heatmap(sl5, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# ### HS578 vs MDAMB468 - VAST TIDY | |
# In[1318]: | |
psi_table = pd.read_table("INCLUSION_LEVELS_FULL-Hsa24-hg38.TIDY.MS023.tab")#.fillna(0) | |
cols_to_drop = psi_table.filter(regex='-Q').columns.tolist() | |
psi_table.drop(columns=cols_to_drop, inplace=True) | |
psi_table = psi_table.set_index('EVENT').T.reset_index().rename(columns={'index':'SAMPLES'} | |
).merge(samps2[['SAMPLES','SAMP_ID']] | |
).set_index('SAMP_ID').drop("SAMPLES",1).T.reset_index().rename(columns={'index':'EVENT'}) | |
hs_treated = psi_table.filter(regex='Hs578').filter(regex='MS023').columns.tolist() | |
hs_dmso = psi_table.filter(regex='Hs578').filter(regex='DMSO').columns.tolist() | |
md_treated = psi_table.filter(regex='MDAMB').filter(regex='MS023').columns.tolist() | |
md_dmso = psi_table.filter(regex='MDAMB').filter(regex='DMSO').columns.tolist() | |
psi_table = psi_table[['EVENT']+hs_treated+hs_dmso+md_treated+md_dmso] | |
# #.drop(psi_table.filter(regex='-Q').columns.tolist(),1) | |
# In[1320]: | |
psi_table['Hs578_vs_MDAMB468_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated+hs_dmso].values, | |
x[md_treated+md_dmso].values)[1], | |
axis=1) | |
psi_table['Hs578_vs_MDAMB468_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[hs_treated+hs_dmso])-np.mean(x[md_treated+md_dmso]),2), | |
axis=1) | |
psi_table['Hs578_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated].values, | |
x[hs_dmso].values)[1], | |
axis=1) | |
psi_table['Hs578_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[hs_treated])-np.mean(x[hs_dmso]),2), | |
axis=1) | |
psi_table['MDAMB468_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[md_treated].values, | |
x[md_dmso].values)[1], | |
axis=1) | |
psi_table['MDAMB468_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[md_treated])-np.mean(x[md_dmso]),2), | |
axis=1) | |
# In[ ]: | |
# In[1322]: | |
hs578_vs_md468_hs = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 40) #& | |
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False) | |
hs578_vs_md468_all = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 20) #& | |
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False) | |
# In[1341]: | |
mytest = psi_table[((psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 10)) | | |
((psi_table['Hs578_MS023_vs_DMSO_pVal'] <= 0.05) & | |
(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) >= 10)) | | |
((psi_table['MDAMB468_MS023_vs_DMSO_pVal'] <= 0.05) & | |
(abs(psi_table['MDAMB468_MS023_vs_DMSO_dPSI']) >= 10)) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False).set_index('EVENT') | |
# In[1342]: | |
mytest.shape | |
# In[1343]: | |
mytest | |
# In[1338]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,10)) | |
sns.heatmap(mytest.drop(['Hs578_vs_MDAMB468_pVal','Hs578_vs_MDAMB468_dPSI', | |
'Hs578_MS023_vs_DMSO_pVal','Hs578_MS023_vs_DMSO_dPSI', | |
'MDAMB468_MS023_vs_DMSO_pVal','MDAMB468_MS023_vs_DMSO_dPSI',],1), | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.show() | |
# In[ ]: | |
# In[ ]: | |
#!/usr/bin/env python | |
# coding: utf-8 | |
# In[768]: | |
import pandas as pd | |
import numpy as np | |
import glob | |
import itertools | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from matplotlib_venn import * | |
import scipy.stats as stats | |
# In[491]: | |
def tidy_split(df, column, sep='|', keep=False): | |
""" | |
Split the values of a column and expand so the new DataFrame has one split | |
value per row. Filters rows where the column is missing. | |
Params | |
------ | |
df : pandas.DataFrame | |
dataframe with the column to split and expand | |
column : str | |
the column to split and expand | |
sep : str | |
the string used to split the column's values | |
keep : bool | |
whether to retain the presplit value as it's own row | |
Returns | |
------- | |
pandas.DataFrame | |
Returns a dataframe with the same columns as `df`. | |
""" | |
indexes = list() | |
new_values = list() | |
df = df.dropna(subset=[column]) | |
for i, presplit in enumerate(df[column].astype(str)): | |
values = presplit.split(sep) | |
if keep and len(values) > 1: | |
indexes.append(i) | |
new_values.append(presplit) | |
for value in values: | |
indexes.append(i) | |
new_values.append(value) | |
new_df = df.iloc[indexes, :].copy() | |
new_df[column] = new_values | |
return new_df | |
# In[319]: | |
def show_values_on_bars(axs, h_v="v", space=0.4, fs=10, t=int, dc=2, r=0): | |
def _show_on_single_plot(ax): | |
if h_v == "v": | |
for p in ax.patches: | |
_x = p.get_x() + p.get_width() / 2 | |
_y = p.get_y() + p.get_height() | |
if np.isnan(p.get_height()) == True: | |
value = 0 | |
_y = 0 | |
else: | |
if t == float: | |
value = np.round(t(p.get_height()),dc) | |
else: | |
value = t(p.get_height()) | |
ax.text(_x, _y, value, ha="center", fontsize=fs, rotation=r) | |
elif h_v == "h": | |
for p in ax.patches: | |
_x = p.get_x() + p.get_width() + float(space) | |
_y = p.get_y() + p.get_height() | |
if t == float: | |
value = np.round(t(p.get_width()),dc) | |
else: | |
value = t(p.get_width()) | |
ax.text(_x, _y, value, ha="left", rotation=r, fontsize=fs) | |
if isinstance(axs, np.ndarray): | |
for idx, ax in np.ndenumerate(axs): | |
_show_on_single_plot(ax) | |
else: | |
_show_on_single_plot(axs) | |
# ### Pre-processing | |
# In[778]: | |
samps = pd.read_csv('./sample.list.txt', names=['SAMPLES','LINE'], sep='\t') | |
samps['COND'] = samps['SAMPLES'].apply(lambda x: '-'.join(x.split('-')[1:])) | |
samps_groups = samps.groupby('COND')['SAMPLES'].agg(lambda x: ','.join(x)).reset_index() | |
samps_groups['COND'] = ['Hs578T_1uM_MS023_2d', | |
'Hs578T_1uM_MS023_5d', | |
'MDAMB468_1uM_MS023_2d', | |
'MDAMB468_1uM_MS023_5d', | |
'Hs578T_DMSO', | |
'MDAMB468_DMSO', | |
'Hs578T_shPRMT1_DOX', | |
'Hs578T_shPRMT1_NoDOX'] | |
samps_pair = list(itertools.combinations(samps_groups['SAMPLES'],2)) | |
conds_pair = list(itertools.combinations(samps_groups['COND'],2)) | |
d1 = pd.DataFrame(samps_pair, columns=['SAMPLE_A','SAMPLE_B']) | |
d2 = pd.DataFrame(conds_pair, columns=['COND_A','COND_B']) | |
dd = pd.concat([d1,d2], axis=1) | |
dd['COMP'] = dd['COND_B']+'_vs_'+dd['COND_A'] | |
dd['CELL_A'] = dd['COND_A'].apply(lambda x: x.split('_')[0]) | |
dd['CELL_B'] = dd['COND_B'].apply(lambda x: x.split('_')[0]) | |
dd_c = dd[dd['CELL_A'] == dd['CELL_B']] | |
# In[780]: | |
dd_pass = dd_c[dd_c['COND_A'].str.contains('DMSO')] | |
dd_fix = dd_c[dd_c['COND_B'].str.contains('DMSO')] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd.columns] | |
dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_f = pd.concat([dd2,dd3]) | |
dd_f.to_csv('./comb.pair.treated.vs.control.txt',sep=';',header=False,index=False) | |
# ### Generate Hs vs MDAM comparissons | |
# In[781]: | |
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) & | |
(dd['COND_A'].str.contains('2d|5d')) & | |
(dd['COND_B'].str.contains('2d|5d'))] | |
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))] | |
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
# dd_fix = dd_fix[dd.columns] | |
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_fc = pd.concat([dd2]) | |
dd_fc.to_csv('./comb.pair.hs578t.vs.mdamb468.txt',sep=';',header=False,index=False) | |
# In[1122]: | |
dd_c = dd[(dd['CELL_A'] != dd['CELL_B']) & | |
(dd['COND_A'].str.contains('DMSO')) & | |
(dd['COND_B'].str.contains('DMSO'))] | |
dd_pass = dd_c[(dd_c['COND_A'].str.contains('MDAMB468')) & (dd_c['COND_B'].str.contains('Hs578T'))] | |
dd_fix = dd_c[(dd_c['COND_B'].str.contains('MDAMB468')) & (dd_c['COND_A'].str.contains('Hs578T'))] | |
dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
dd_fix = dd_fix[dd_c.columns] | |
dd2 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd2 = dd2[(dd2['COND_A'].str.contains('DMSO')) & ~(dd2['COND_B'].str.contains('DOX'))] | |
# dd_pass = dd[dd['COND_A'].str.contains('NoDOX')] | |
# dd_fix = dd[dd['COND_B'].str.contains('NoDOX')] | |
# dd_fix.columns = ['SAMPLE_B','SAMPLE_A','COND_B','COND_A','COMP','CELL_B','CELL_A'] | |
# dd_fix['COMP'] = dd_fix['COND_B']+'_vs_'+dd_fix['COND_A'] | |
# dd_fix = dd_fix[dd.columns] | |
# dd3 = pd.concat([dd_pass,dd_fix]).drop_duplicates().reset_index(drop=1) | |
# #dd3 = dd3[(dd3['COND_A'].str.contains('DOX')) & (dd3['COND_B'].str.contains('DOX'))] | |
dd_fd = pd.concat([dd2]) | |
dd_fd.to_csv('./comb.pair.hs578t.vs.mdamb468.DMSO.txt',sep=';',header=False,index=False) | |
# In[1123]: | |
dd_fd | |
# ### Vast-diff parsing | |
# In[323]: | |
annot = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_EVENTS_Hsa108_hg38_ALL.tab.gz', | |
sep='\t') | |
pcr = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PCR_VALIDATIONS_Hsa108_hg19.tab.gz', | |
sep='\t') | |
prt = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_IMPACT_Hsa108_hg19_ALL.tab.gz', | |
sep='\t').rename(columns={'EventID':'EVENT'}) | |
pfam = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_PFAM_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EventID':'EVENT'}) | |
# In[324]: | |
pdb = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PDB_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EVENT_ID':'EVENT'}) | |
phy = pd.read_csv('/home/fciamponi/Programs/vast-tools/VASTDB/Hsa/ANNOT_DATA/VASTDB_PROT_STRUCT_PHYRE_Hsa108_hg19.tab.gz', | |
sep='\t').rename(columns={'EVENT_ID':'EVENT'}) | |
# In[403]: | |
print(annot.shape) | |
print(pcr.shape) | |
print(prt.shape) | |
# In[1124]: | |
for i in range(len(dd_f)): | |
comp = str(dd_f.iloc[i]['COMP']) | |
ca = str(dd_f.iloc[i]['COND_A']) | |
cb = str(dd_f.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[1125]: | |
for i in range(len(dd_fc)): | |
comp = str(dd_fc.iloc[i]['COMP']) | |
ca = str(dd_fc.iloc[i]['COND_A']) | |
cb = str(dd_fc.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[ ]: | |
# In[1157]: | |
for i in range(len(dd_fd)): | |
comp = str(dd_fd.iloc[i]['COMP']) | |
ca = str(dd_fd.iloc[i]['COND_A']) | |
cb = str(dd_fd.iloc[i]['COND_B']) | |
df = pd.read_csv('./vast_diff_results/'+comp+'.vast.diff.txt',sep='\t') | |
df.columns = ['GENE','EVENT',ca,cb,'E[dPsi]','MV[dPsi]_at_0.95'] | |
df['org'] = 'org' | |
df = df.merge(annot[['EVENT','COORD_o']], | |
how='outer').merge(prt, | |
how='outer').merge(pcr, | |
how='outer') | |
df = df[df['org'] == 'org'] | |
df.drop('org', 1 ,inplace=True) | |
df['E[dPsi]'] = df['E[dPsi]']*-1 | |
psi = pd.read_csv('./vast_diff_results/'+comp+'.txt',sep='\t') | |
df_sig = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(df['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
df_ex = df[(df['EVENT'].isin(psi['EVENT'])) & | |
(abs(df['E[dPsi]']) >= df['MV[dPsi]_at_0.95']) & | |
(df['MV[dPsi]_at_0.95'] >= 0.05) & | |
(abs(df['E[dPsi]']) >= 0.2) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
writer = pd.ExcelWriter('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
engine='xlsxwriter') | |
df_ex.to_excel(writer, sheet_name='Experimental_Validation', index=False) | |
df_sig.to_excel(writer, sheet_name='All_significant_ASEs', index=False) | |
writer.save() | |
df_sig[['EVENT']].to_csv('./vast_diff_results/'+comp+'.SIG.TGTs.txt',sep='\t',index=False,header=False) | |
df_ex[['EVENT']].to_csv('./vast_diff_results/'+comp+'.EXP.TGTs.txt',sep='\t',index=False,header=False) | |
# In[ ]: | |
# In[ ]: | |
# In[1129]: | |
my_evs = ['HsaINT0022568','HsaEX0011350','HsaINT0030497','HsaEX0051139', | |
'HsaEX0021497','HsaINT0013004','HsaINT0077022','HsaEX0055840'] | |
dd = [] | |
for i in range(len(dd_f[0:4])): | |
comp = str(dd_f.iloc[i]['COMP']) | |
ca = str(dd_f.iloc[i]['COND_A']) | |
cb = str(dd_f.iloc[i]['COND_B']) | |
df = pd.read_excel('./vast_diff_results/'+comp+'.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
sl = df[df['EVENT'].isin(my_evs)] | |
sl = sl[['GENE','EVENT','E[dPsi]']] | |
sl['ID'] = sl['GENE']+' - '+sl['EVENT'] | |
sl['LINE'] = cb.split('_')[0] | |
sl['TIME'] = cb.split('_')[-1] | |
dd.append(sl) | |
# In[692]: | |
plt.figure(figsize=(6,4)) | |
g = sns.catplot(x='TIME', y='E[dPsi]', | |
hue='LINE',col='ID', | |
kind='bar', | |
palette=['white','lightgrey'], | |
linewidth=1, | |
edgecolor='black', | |
dodge=True, | |
data=pd.concat(dd), | |
height=4, aspect=0.7, | |
col_wrap=4) | |
plt.savefig('./vast_diff_results/venn4.catplot.pdf', dpi=300, bbox_inches='tight') | |
plt.savefig('./vast_diff_results/venn4.catplot.svg', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[337]: | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='ID', | |
y='E[dPsi]', | |
hue='TIME', | |
ci='sd', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=pd.concat(dd).sort_values('E[dPsi]', ascending=True)) | |
#show_values_on_bars(g, t=float) | |
g.set_ylabel('Mean dPSI', size=12) | |
g.set_yticks([-0.4,-0.2,0,0.2,0.4]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Alternative splicing events', size=12) | |
g.set_xticklabels(pd.concat(dd).sort_values('E[dPsi]', ascending=True)['ID'].unique(), | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.show() | |
# In[160]: | |
sl = df[df['EVENT'].isin(my_evs)][['GENE','EVENT','ONTO']] | |
# In[161]: | |
sl['org'] = 'org' | |
# In[169]: | |
slp = sl.merge(pfam, how='outer')#.merge(phy, how='outer')#.merge(phy, how='outer') | |
slp = slp[slp['org'] == 'org'] | |
slp.drop('org', axis=1, inplace=True) | |
# In[170]: | |
slp | |
# ### Find overlapping events in HS line | |
# In[710]: | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_Hs578T_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_Hs578T_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_hs578 = set(d2_sig['EVENT']) | |
d5_hs578 = set(d5_sig['EVENT']) | |
plt.figure(figsize=(3,4)) | |
v = venn2_unweighted([d2_hs578,d5_hs578], set_labels=['2 days of\ntreatment','5 days of\ntreatment']) | |
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(12) | |
for text in v.subset_labels: | |
text.set_fontsize(10) | |
for i in ['11','10','01']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
plt.title('HS578T - MS023 1uM\nAlternative splicing events',size=12,y=1.0) | |
plt.savefig('./hs587t_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./hs587t_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[711]: | |
hs_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()) | |
hs_ov['MEAN'] = hs_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
hs_ov['STD'] = hs_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
hs_ov = hs_ov[abs(hs_ov['MEAN']) >= hs_ov['STD']] | |
hs_ov = hs_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
hs_ov['ONTO'] = hs_ov['ONTO'].astype(str) | |
hs_ov['ONTO_SHORT'] = hs_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
hs_ov['INC'] = hs_ov.apply(lambda x: | |
"Exclusion" if x['MEAN'] < 0 else | |
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown') | |
hs_ov['TYPE'] = hs_ov.apply(lambda x: | |
"Exon" if 'HsaEX' in x['EVENT'] else | |
"Intron" if 'HsaINT' in x['EVENT'] else | |
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else | |
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else | |
'Not ASE', axis=1).replace('nan','Unknown') | |
hs_ov['LABEL'] = hs_ov['TYPE']+';'+hs_ov['INC'] | |
hs_res = pd.DataFrame(hs_ov['ONTO_SHORT'].value_counts()).reset_index() | |
hs_res.columns = ['Protein impact','Counts'] | |
hs_res['%Impact'] = hs_res.apply(lambda x: np.round(x['Counts']/np.sum(hs_res['Counts']),3)*100, axis=1) | |
hs_res['Line'] = 'HS578T' | |
hs_res2 = pd.DataFrame(hs_ov['LABEL'].value_counts()).reset_index() | |
hs_res2.columns = ['Event label','Counts'] | |
hs_res2['%Event'] = hs_res2.apply(lambda x: np.round(x['Counts']/np.sum(hs_res2['Counts']),3)*100, axis=1) | |
hs_res2['Line'] = 'HS578T' | |
# In[712]: | |
hs_ov['GENE'].tolist() | |
# In[741]: | |
hs_ov[hs_ov['GENE'].str.contains("PDK")] | |
# In[742]: | |
hs_ov[hs_ov['GENE'].isin(['COL1A1','IL6','ITGA6','PDK1','PPP2R1B','RAF1','BCL2L1','SGK1','MAGI1'])] | |
# In[714]: | |
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_2d_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/MDAMB468_1uM_MS023_5d_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_mdamb = set(d2_sig['EVENT']) | |
d5_mdamb = set(d5_sig['EVENT']) | |
plt.figure(figsize=(3,4)) | |
v = venn2_unweighted([d2_mdamb,d5_mdamb], set_labels=['2 days of\ntreatment','5 days of\ntreatment']) | |
c = venn2_circles(subsets=(1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(12) | |
for text in v.subset_labels: | |
text.set_fontsize(10) | |
for i in ['11','10','01']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
plt.title('MDAMB468 - MS023 1uM\nAlternative splicing events',size=12,y=1.0) | |
plt.savefig('./mdamb468_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./mdamb468_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[715]: | |
md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()) | |
md_ov['MEAN'] = md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
md_ov['STD'] = md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d']]),1) | |
md_ov = md_ov[abs(md_ov['MEAN']) >= md_ov['STD']] | |
md_ov = md_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
md_ov['ONTO'] = md_ov['ONTO'].astype(str) | |
md_ov['ONTO_SHORT'] = md_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
"ncRNA" if ('NonCoding' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
md_ov['INC'] = md_ov.apply(lambda x: | |
"Exclusion" if x['MEAN'] < 0 else | |
"Inclusion" if x['MEAN'] > 0 else 'Not ASE', axis=1).replace('nan','Unknown') | |
md_ov['TYPE'] = md_ov.apply(lambda x: | |
"Exon" if 'HsaEX' in x['EVENT'] else | |
"Intron" if 'HsaINT' in x['EVENT'] else | |
"Alt. 3'SS" if 'HsaALTA' in x['EVENT'] else | |
"Alt. 5'SS" if 'HsaALTD' in x['EVENT'] else | |
'Not ASE', axis=1).replace('nan','Unknown') | |
md_ov['LABEL'] = md_ov['TYPE']+';'+md_ov['INC'] | |
md_res = pd.DataFrame(md_ov['ONTO_SHORT'].value_counts()).reset_index() | |
md_res.columns = ['Protein impact','Counts'] | |
md_res['%Impact'] = md_res.apply(lambda x: np.round(x['Counts']/np.sum(md_res['Counts']),3)*100, axis=1) | |
md_res['Line'] = 'MDAMB468' | |
md_res2 = pd.DataFrame(md_ov['LABEL'].value_counts()).reset_index() | |
md_res2.columns = ['Event label','Counts'] | |
md_res2['%Event'] = md_res2.apply(lambda x: np.round(x['Counts']/np.sum(md_res2['Counts']),3)*100, axis=1) | |
md_res2['Line'] = 'MDAMB468' | |
# In[716]: | |
md_ov['GENE'].tolist() | |
# In[717]: | |
md_ov[md_ov['GENE'].isin(['ATF2','EFNA1','KITLG','SYK'])] | |
# In[ ]: | |
# In[718]: | |
res = pd.concat([hs_res,md_res]) | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Protein impact', | |
y='%Impact', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res, | |
order=['Alternative protein isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown']) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Predicted protein impact', size=12) | |
g.set_xticklabels(['Alt. isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[719]: | |
res2 = pd.concat([hs_res2,md_res2]) | |
res2['Region'] = res2['Event label'].apply(lambda x: x.split(";")[0]) | |
res2['Level'] = res2['Event label'].apply(lambda x: x.split(";")[1]) | |
# In[727]: | |
res2['Event label'].unique() | |
# In[739]: | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Event label', | |
y='%Event', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res2, | |
order=['Intron;Exclusion', 'Intron;Inclusion', | |
'Exon;Exclusion', 'Exon;Inclusion', | |
"Alt. 5'SS;Exclusion", "Alt. 5'SS;Inclusion", | |
"Alt. 3'SS;Exclusion", "Alt. 3'SS;Inclusion"] | |
) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50,60,70]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Type of ASE', size=12) | |
g.set_xticklabels(['Intron exclusion', 'Intron inclusion', | |
'Exon exclusion', 'Exon inclusion', | |
"Alt. 5'SS exclusion", "Alt. 5'SS inclusion", | |
"Alt. 3'SS exclusion", "Alt. 3'SS inclusion"], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.legend(loc='upper right') | |
plt.savefig('./inclusion_profile.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./inclusion_profile.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[657]: | |
res = pd.concat([hs_res,md_res]) | |
plt.figure(figsize=(6,4)) | |
g = sns.barplot(x='Protein impact', | |
y='%Impact', | |
hue='Line', | |
palette=['white','lightgrey'], | |
dodge=True, | |
linewidth=1, | |
edgecolor='black', | |
data=res, | |
order=['Alternative protein isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown']) | |
show_values_on_bars(g, t=float, dc=1, fs=8, r=0) | |
g.set_ylabel('Ratio of ASEs (%)', size=12) | |
g.set_yticks([0,10,20,30,40,50]) | |
g.set_yticklabels(g.get_yticks().astype(float), size=12) | |
sns.despine(trim=True, offset=5) | |
g.set_xlabel('Predicted protein impact', size=12) | |
g.set_xticklabels(['Alt. isoform','Protective','Deleterious', | |
"5' UTR/ncRNA","3' UTR",'ncRNA','Unknown'], | |
size=12, rotation=30, ha='right', rotation_mode="anchor") | |
plt.savefig('./prmt1i_protein_impact.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('./prmt1i_protein_impact.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[ ]: | |
writer = pd.ExcelWriter('./overlap.day2.day5.ASEs.xlsx', | |
engine='xlsxwriter') | |
hs_ov.to_excel(writer, sheet_name='HS578T', index=False) | |
md_ov.to_excel(writer, sheet_name='MDAMB468', index=False) | |
writer.save() | |
# ### Hs578T vs MDAMB468 | |
# In[1167]: | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.txt',sep='\t') | |
d2 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_2d_vs_MDAMB468_1uM_MS023_2d.ASEs.xlsx', | |
sheet_name='All_significant_ASEs',sep='\t') | |
d2_sig = d2[(d2['EVENT'].isin(psi['EVENT'])) & | |
(abs(d2['E[dPsi]']) >= d2['MV[dPsi]_at_0.95']) & | |
(d2['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d2['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.txt',sep='\t') | |
d5 = pd.read_excel('./vast_diff_results/Hs578T_1uM_MS023_5d_vs_MDAMB468_1uM_MS023_5d.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
d5_sig = d5[(d5['EVENT'].isin(psi['EVENT'])) & | |
(abs(d5['E[dPsi]']) >= d5['MV[dPsi]_at_0.95']) & | |
(d5['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(d5['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
psi = pd.read_csv('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.txt',sep='\t') | |
dm = pd.read_excel('./vast_diff_results/Hs578T_DMSO_vs_MDAMB468_DMSO.ASEs.xlsx', | |
sheet_name='All_significant_ASEs', sep='\t') | |
dm_sig = dm[(dm['EVENT'].isin(psi['EVENT'])) & | |
(abs(dm['E[dPsi]']) >= dm['MV[dPsi]_at_0.95']) & | |
(dm['MV[dPsi]_at_0.95'] >= 0.01) & | |
(abs(dm['E[dPsi]']) >= 0.1) | |
].sort_values('MV[dPsi]_at_0.95',ascending=False).reset_index(drop=1) | |
d2_hs578 = set(d2_sig['EVENT']) | |
d5_hs578 = set(d5_sig['EVENT']) | |
dm_hs578 = set(dm_sig['EVENT']) | |
plt.figure(figsize=(6,6)) | |
v = venn3_unweighted([d2_hs578,d5_hs578, dm_hs578], | |
set_labels=['2 days of\ntreatment','5 days of\ntreatment','DMSO control']) | |
c = venn3_circles(subsets=(1, 1, 1, 1 , 1, 1, 1), linestyle='solid') | |
for text in v.set_labels: | |
text.set_fontsize(20) | |
for text in v.subset_labels: | |
text.set_fontsize(18) | |
for i in ['111','110','101','011','010','001','100']: | |
v.get_patch_by_id(i).set_alpha(0.0) | |
c[0].set_ls('dashed') | |
c[1].set_ls('dotted') | |
plt.title('HS578T vs. MDAMB568\nAlternative splicing events',size=20,y=1.1) | |
plt.savefig('./hs587t_vs_mdamb_overlap.svg',dpi=300,bbox_inches='tight') | |
plt.savefig('./hs587t_vs_mdamb_overlap.pdf',dpi=300,bbox_inches='tight') | |
plt.show() | |
# In[1248]: | |
hs_md_ov = d2_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_2d').reset_index().merge( | |
d5_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_5d').reset_index()).merge( | |
dm_sig[['EVENT','E[dPsi]']].set_index('EVENT').add_suffix('_dm').reset_index()) | |
hs_md_ov['MEAN'] = hs_md_ov.apply(lambda x: np.mean([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1) | |
hs_md_ov['STD'] = hs_md_ov.apply(lambda x: np.std([x['E[dPsi]_2d'],x['E[dPsi]_5d'],x['E[dPsi]_dm']]),1) | |
hs_md_ov = hs_md_ov[abs(hs_md_ov['MEAN']) >= hs_md_ov['STD']] | |
hs_md_ov = hs_md_ov.merge(d2_sig[['EVENT','GENE','ONTO']]) | |
hs_md_ov['ONTO'] = hs_md_ov['ONTO'].astype(str) | |
hs_md_ov['ONTO_SHORT'] = hs_md_ov.apply(lambda x: | |
"Protective" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] < 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Protective" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('exclusion' in x['ONTO']) else | |
"Deleterious" if x['MEAN'] > 0 and ('ORF disruption' in x['ONTO']) and ('inclusion' in x['ONTO']) else | |
"Alternative protein isoform" if ('isoform' in x['ONTO']) else | |
"Unknown" if ('description available' in x['ONTO']) else | |
"ncRNA" if ('NonCoding' in x['ONTO']) else | |
x['ONTO'], axis=1).replace('nan','Unknown') | |
# In[1362]: | |
sl4 = hs_md_ov[abs(hs_md_ov['MEAN']) >= 0.5] | |
sl4 = sl4[sl4['GENE'] != 'DST'] | |
sl4['ID'] = sl4['GENE']+'='+sl4['EVENT'] | |
# In[1275]: | |
hs_md_ov.to_excel('./overlap.HS578_vs_MDAMB468.ASEs.xlsx') | |
# In[1363]: | |
samps2 = tidy_split(samps_groups, 'SAMPLES', sep=',') | |
samps2['REP'] = ['_R1','_R2','_R3']*len(samps2['COND'].unique()) | |
samps2['SAMP_ID'] = samps2['COND']+samps2['REP'] | |
# In[1255]: | |
vast_psi = pd.read_csv('./INCLUSION_LEVELS_FULL-Hsa24-hg38.tab',sep='\t') | |
# In[1364]: | |
cols = pd.DataFrame(vast_psi.columns) | |
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))] | |
myEvs = sl4.sort_values("MEAN", ascending=False)['EVENT'] | |
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T | |
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'}) | |
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T | |
sl3 = myPsi_ID[[ | |
'Hs578T_1uM_MS023_2d_R1', | |
'Hs578T_1uM_MS023_2d_R2', | |
'Hs578T_1uM_MS023_2d_R3', | |
'Hs578T_1uM_MS023_5d_R1', | |
'Hs578T_1uM_MS023_5d_R2', | |
'Hs578T_1uM_MS023_5d_R3', | |
'Hs578T_shPRMT1_DOX_R1', | |
'Hs578T_shPRMT1_DOX_R2', | |
'Hs578T_shPRMT1_DOX_R3', | |
'Hs578T_shPRMT1_NoDOX_R1', | |
'Hs578T_shPRMT1_NoDOX_R2', | |
'Hs578T_shPRMT1_NoDOX_R3', | |
'Hs578T_DMSO_R1', | |
'Hs578T_DMSO_R2', | |
'Hs578T_DMSO_R3', | |
'MDAMB468_1uM_MS023_2d_R1', | |
'MDAMB468_1uM_MS023_2d_R2', | |
'MDAMB468_1uM_MS023_2d_R3', | |
'MDAMB468_1uM_MS023_5d_R1', | |
'MDAMB468_1uM_MS023_5d_R2', | |
'MDAMB468_1uM_MS023_5d_R3', | |
'MDAMB468_DMSO_R1', | |
'MDAMB468_DMSO_R2', | |
'MDAMB468_DMSO_R3', | |
]] | |
sl5 = sl3.reset_index().merge(sl4[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1) | |
# In[1365]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (12,12)) | |
sns.heatmap(sl5, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_MDAMB.h.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_MDAMB.h.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[1366]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,12)) | |
sns.heatmap(sl5.T, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_MDAMB.v.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_MDAMB.v.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[1367]: | |
df_merged = hs_ov[['EVENT','GENE','MEAN']].rename(columns={'MEAN':'dPSI[Hs578T_MS023_1uM_vs_Hs578T_DMSO]'} | |
).merge(hs_md_ov[['EVENT','MEAN','ONTO']] | |
).rename(columns={'MEAN':'dPSI[Hs578T_vs_MDAMB468]'}) | |
df_merged['ID'] = df_merged['GENE']+'='+df_merged['EVENT'] | |
# In[1377]: | |
cols = pd.DataFrame(vast_psi.columns) | |
cols = cols[(cols[0].str.contains("MDAMB468|Hs578T|hs578t")) & ~(cols[0].str.contains("_R1-Q"))] | |
myEvs = df_merged['EVENT']#.sort_values("MEAN", ascending=False)['EVENT'] | |
vast_psi_values = vast_psi[cols[0].tolist()+['EVENT']].set_index('EVENT').T | |
myPsi = vast_psi_values[myEvs].reset_index().rename(columns={'index':'SAMPLES'}) | |
myPsi_ID = myPsi.merge(samps2[['SAMPLES','SAMP_ID']]).drop('SAMPLES',1).set_index('SAMP_ID').T | |
sl3 = myPsi_ID[[ | |
'Hs578T_1uM_MS023_2d_R1', | |
'Hs578T_1uM_MS023_2d_R2', | |
'Hs578T_1uM_MS023_2d_R3', | |
'Hs578T_1uM_MS023_5d_R1', | |
'Hs578T_1uM_MS023_5d_R2', | |
'Hs578T_1uM_MS023_5d_R3', | |
#'Hs578T_shPRMT1_DOX_R1', | |
#'Hs578T_shPRMT1_DOX_R2', | |
#'Hs578T_shPRMT1_DOX_R3', | |
#'Hs578T_shPRMT1_NoDOX_R1', | |
#'Hs578T_shPRMT1_NoDOX_R2', | |
#'Hs578T_shPRMT1_NoDOX_R3', | |
'Hs578T_DMSO_R1', | |
'Hs578T_DMSO_R2', | |
'Hs578T_DMSO_R3', | |
'MDAMB468_1uM_MS023_2d_R1', | |
'MDAMB468_1uM_MS023_2d_R2', | |
'MDAMB468_1uM_MS023_2d_R3', | |
'MDAMB468_1uM_MS023_5d_R1', | |
'MDAMB468_1uM_MS023_5d_R2', | |
'MDAMB468_1uM_MS023_5d_R3', | |
'MDAMB468_DMSO_R1', | |
'MDAMB468_DMSO_R2', | |
'MDAMB468_DMSO_R3', | |
]] | |
sl5 = sl3.reset_index().merge(df_merged[['EVENT','ID']].rename(columns={'EVENT':'index'})).set_index('ID').drop("index",1) | |
# In[1378]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (4,8)) | |
sns.heatmap(sl5.T, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.v.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# In[ ]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (6,4)) | |
sns.heatmap(sl5, | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.svg', dpi=300, bbox_inches='tight') | |
plt.savefig('top_targets_Hs578_vs_DMSO_and_MDAMB.h.pdf', dpi=300, bbox_inches='tight') | |
plt.show() | |
# ### HS578 vs MDAMB468 - VAST TIDY | |
# In[1318]: | |
psi_table = pd.read_table("INCLUSION_LEVELS_FULL-Hsa24-hg38.TIDY.MS023.tab")#.fillna(0) | |
cols_to_drop = psi_table.filter(regex='-Q').columns.tolist() | |
psi_table.drop(columns=cols_to_drop, inplace=True) | |
psi_table = psi_table.set_index('EVENT').T.reset_index().rename(columns={'index':'SAMPLES'} | |
).merge(samps2[['SAMPLES','SAMP_ID']] | |
).set_index('SAMP_ID').drop("SAMPLES",1).T.reset_index().rename(columns={'index':'EVENT'}) | |
hs_treated = psi_table.filter(regex='Hs578').filter(regex='MS023').columns.tolist() | |
hs_dmso = psi_table.filter(regex='Hs578').filter(regex='DMSO').columns.tolist() | |
md_treated = psi_table.filter(regex='MDAMB').filter(regex='MS023').columns.tolist() | |
md_dmso = psi_table.filter(regex='MDAMB').filter(regex='DMSO').columns.tolist() | |
psi_table = psi_table[['EVENT']+hs_treated+hs_dmso+md_treated+md_dmso] | |
# #.drop(psi_table.filter(regex='-Q').columns.tolist(),1) | |
# In[1320]: | |
psi_table['Hs578_vs_MDAMB468_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated+hs_dmso].values, | |
x[md_treated+md_dmso].values)[1], | |
axis=1) | |
psi_table['Hs578_vs_MDAMB468_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[hs_treated+hs_dmso])-np.mean(x[md_treated+md_dmso]),2), | |
axis=1) | |
psi_table['Hs578_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[hs_treated].values, | |
x[hs_dmso].values)[1], | |
axis=1) | |
psi_table['Hs578_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[hs_treated])-np.mean(x[hs_dmso]),2), | |
axis=1) | |
psi_table['MDAMB468_MS023_vs_DMSO_pVal'] = psi_table.apply(lambda x: stats.mstats.mannwhitneyu(x[md_treated].values, | |
x[md_dmso].values)[1], | |
axis=1) | |
psi_table['MDAMB468_MS023_vs_DMSO_dPSI'] = psi_table.apply(lambda x: np.round( | |
np.mean(x[md_treated])-np.mean(x[md_dmso]),2), | |
axis=1) | |
# In[ ]: | |
# In[1322]: | |
hs578_vs_md468_hs = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 40) #& | |
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False) | |
hs578_vs_md468_all = psi_table[(psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
#(psi_table['Hs578_MS023_vs_DMSO_pVal'] >= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 20) #& | |
#(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) <= 20) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False) | |
# In[1341]: | |
mytest = psi_table[((psi_table['Hs578_vs_MDAMB468_pVal'] <= 0.05) & | |
(abs(psi_table['Hs578_vs_MDAMB468_dPSI']) >= 10)) | | |
((psi_table['Hs578_MS023_vs_DMSO_pVal'] <= 0.05) & | |
(abs(psi_table['Hs578_MS023_vs_DMSO_dPSI']) >= 10)) | | |
((psi_table['MDAMB468_MS023_vs_DMSO_pVal'] <= 0.05) & | |
(abs(psi_table['MDAMB468_MS023_vs_DMSO_dPSI']) >= 10)) | |
].sort_values('Hs578_vs_MDAMB468_dPSI', ascending=False).set_index('EVENT') | |
# In[1342]: | |
mytest.shape | |
# In[1343]: | |
mytest | |
# In[1338]: | |
grid_kws = {"height_ratios": (0.9, .025), "hspace": 0.3} | |
f, (ax, cbar_ax) = plt.subplots(2, gridspec_kw=grid_kws, figsize = (10,10)) | |
sns.heatmap(mytest.drop(['Hs578_vs_MDAMB468_pVal','Hs578_vs_MDAMB468_dPSI', | |
'Hs578_MS023_vs_DMSO_pVal','Hs578_MS023_vs_DMSO_dPSI', | |
'MDAMB468_MS023_vs_DMSO_pVal','MDAMB468_MS023_vs_DMSO_dPSI',],1), | |
cmap="YlGnBu", | |
linewidths=1, | |
ax=ax, | |
cbar_ax=cbar_ax, | |
annot=True, | |
fmt='.0f', | |
cbar_kws={"orientation": "horizontal"}) | |
plt.show() | |
# In[ ]: | |
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment