Created
May 14, 2020 23:51
-
-
Save cplaisier/5dce1411fbc2846a5335698ff0847dfb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
cd /Users/baileyquinn/Desktop/BT145/ | |
docker run -it -v "/Users/baileyquinn/Desktop/BT145:/files" cplaisier/scrna_seq_velocity | |
| |
cd /files | |
pip3 install clusim | |
pip3 install statsmodels | |
python3 | |
import numpy as np | |
import pandas as pd | |
import scanpy as sc | |
import mygene as mg | |
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score | |
from scipy.stats import hypergeom | |
import matplotlib.pyplot as plt | |
plt.style.use('seaborn-whitegrid') | |
import clusim.sim as sim | |
from clusim.clustering import Clustering, print_clustering | |
import statsmodels | |
fileVAR = {'C':{'n_genes':2500, 'percent_mito':0.25}} | |
## Load BT145_C | |
BT145_adatas = {} | |
for i in fileVAR: | |
results_file = './write/BT145_' + i + '.h5ad' | |
adata = sc.read_10x_mtx( | |
'./BT145_' + i + '/filtered_feature_bc_matrix/', | |
var_names = 'gene_symbols', | |
cache=True) | |
adata.var_names_make_unique() | |
adata | |
M = adata.n_obs | |
# Keeping only ccAF genes studied here | |
ccAFfull = pd.read_csv("Table_S1.csv",header=1) | |
ccAFgenelist = ccAFfull['gene'].values.tolist() | |
ccAFset = set(ccAFgenelist) | |
fullgenelist = adata.var_names.values.tolist() | |
crossover = ccAFset.intersection(fullgenelist) | |
df0 = ccAFfull[ccAFfull['gene'].isin(crossover)] | |
df0.to_csv ('Table_S1_withinM.csv', header=True) | |
ccAFcounts = df0.groupby('cluster').count()['gene'] | |
## Preprocessing | |
# Filter | |
sc.pp.filter_cells(adata, min_genes=200) | |
sc.pp.filter_genes(adata, min_cells=3) | |
mito_genes = adata.var_names.str.startswith('MT-') | |
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 | |
adata.obs['n_counts'] = adata.X.sum(axis=1).A1 | |
adata.obs['n_counts'] = adata.X.sum(axis=1).A1 | |
adata.obs['n_genes'] = [i.count_nonzero() for i in adata.X] | |
# Violin plots | |
#sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save= '_' + i + '_scanpy.pdf') | |
# Scatter plots | |
#sc.pl.scatter(adata, x='n_counts', y='percent_mito', save= '_' + i + '_n_count_by_percMito.pdf') | |
#sc.pl.scatter(adata, x='n_counts', y='n_genes', save= '_' + i + '_n_count_by_n_genes.pdf') | |
# Filter | |
adata = adata[adata.obs.n_genes > 2500, :] | |
adata = adata[adata.obs.percent_mito < 0.25, :] | |
print(adata.shape) | |
adata.write(results_file) | |
BT145_adatas[i] = adata | |
# Load meta data for BT145_C | |
meta_C = pd.read_csv('BT145_C/BT145_C_meta.data.csv', header=0, index_col=0) | |
meta_C.index = [i+'-1' for i in meta_C.index] | |
adata = BT145_adatas['C'] | |
adata.obs['ccAF'] = meta_C.loc[adata.obs.index,'clusts_WT'] | |
# Normalize | |
sc.pp.normalize_total(adata, target_sum=1e6) | |
# Log normalize | |
sc.pp.log1p(adata) | |
# Save raw data | |
adata.raw = adata | |
# Identify highly variable genes | |
sc.pp.highly_variable_genes(adata, n_top_genes=4000) | |
sc.pl.highly_variable_genes(adata, save='_C_hvg.pdf') | |
adata = adata[:, adata.var.highly_variable] | |
# Regress out unwated noise and scale data | |
sc.pp.regress_out(adata, ['n_counts', 'percent_mito']) | |
sc.pp.scale(adata, max_value=10) | |
# PCA | |
sc.tl.pca(adata, svd_solver='arpack') | |
#sc.pl.pca(adata, color='CST3') | |
#sc.pl.pca_variance_ratio(adata, log=True) | |
adata.write(results_file) | |
adata | |
# Compute the neighborhood graph | |
sc.pp.neighbors(adata) #, n_neighbors=10, n_pcs=40) | |
# UMAP | |
sc.tl.umap(adata) | |
#sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution' + str(res1) + '.pdf') | |
# Clustering and finding marker genes | |
resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0] | |
adjRandScores = [] | |
amiScore = [] | |
nmiScore = [] | |
ecSim = [] | |
for res1 in resolutions: | |
sc.tl.leiden(adata, resolution = res1) | |
# Writing confusion matrix to CSV | |
adjRandScores.append(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF'])) | |
amiScore.append(adjusted_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF'])) | |
nmiScore.append(normalized_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF'])) | |
ecSim.append(sim.element_sim(Clustering().from_membership_list(list(adata.obs['leiden'])),Clustering().from_membership_list(list(adata.obs['ccAF'])))) | |
# print('res = '+str(res1)+'; Adj Rand Score = '+str(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF']))) | |
with open('markergenes/BT145_C/BT145_C_confusion_matrix_'+str(res1)+'.csv', 'w') as outFile: | |
pd.crosstab(adata.obs['leiden'],adata.obs['ccAF']).to_csv(outFile) | |
# Plot UMAP and cluster | |
sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution_'+str(res1)+'.pdf') | |
# Identifying Marker Genes | |
sc.tl.dendrogram(adata, 'leiden') | |
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', corr_method='benjamini-hochberg', n_genes=10000) | |
# sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_BT145_C_res_'+str(res1)+'.pdf') | |
sc.tl.filter_rank_genes_groups(adata, min_fold_change=2) | |
# visualize results | |
sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered') | |
# visualize results using dotplot and heatmap | |
sc.tl.dendrogram(adata, 'leiden') | |
sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf') | |
sc.pl.rank_genes_groups_heatmap(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf') | |
# Writing Marker Genes to CSV | |
with open('markergenes/BT145_C/BT145_C_markergenes_dataframe_'+str(res1)+'.csv', 'w') as outFile: | |
for x in range(max([int(i) for i in list(adata.obs['leiden'])])+1): | |
df = sc.get.rank_genes_groups_df(adata, str(x), pval_cutoff=0.05, log2fc_min=1).sort_values(by='logfoldchanges', ascending=False) | |
df['cluster'] = [x]*df.shape[0] | |
if x == 0: | |
df.to_csv(outFile, header = True) | |
else: | |
df.to_csv(outFile, header = False) | |
# Creating the ccAF dictionary, keeping an equal number of genes for each cluster | |
nfull = df0 | |
nfiltered1 = nfull.avg_logFC >= 0 | |
nfiltered2 = nfull.p_val_adj <= 0.05 | |
nfiltered = nfiltered1 #& nfiltered2 | |
df1 = nfull[nfiltered] | |
df1 = df1.sort_values("p_val_adj", axis=0, ascending=True) | |
ccAFbuilder = pd.DataFrame() | |
for c in ['Neural G0', 'Late G1', 'S', 'S/G2', 'G2/M', 'M/Early G1']: | |
df1subset = df1[df1.cluster == c] | |
df1subset = df1subset.head(n=28) | |
ccAFbuilder = ccAFbuilder.append(df1subset) | |
ccAFdict = dict(zip(ccAFbuilder.gene, ccAFbuilder.cluster)) | |
resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0] | |
# Finding intersection of cell cycle and clusters | |
totclust = [] | |
sigclust = [] | |
for res0 in resolutions: | |
df2 = pd.read_csv("markergenes/BT145_C/BT145_C_markergenes_dataframe_"+str(res0)+".csv",header=0) | |
# Mapping marker genes to the cell cycle genes | |
df2['cell cycle'] = df2['names'].map(ccAFdict) | |
df2.to_csv('markergenes/BT145_C/BT145_C_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = False) | |
# Calculating the % each leiden cluster is mapped to each cell cycle cluster | |
df4 = pd.crosstab(df2['cluster'],df2['cell cycle']) | |
df4 = df4.drop(columns=['Low RNA'], errors='ignore') | |
coltotal = pd.DataFrame(data=None) | |
leideni = max(df2['cluster']) | |
#print(leideni) | |
for i in range(leideni+1): | |
coltotal[i] = df2[df2.cluster == i].count() | |
coltotal = coltotal.drop(['Unnamed: 0','scores','logfoldchanges','pvals','pvals_adj','cluster','cell cycle']).T | |
df5 = df4.div(coltotal['names'],axis='rows').mul(100).fillna(0) | |
df4 = df5.div(100).mul(coltotal['names'],axis='rows').fillna(0) | |
# Calculating p-value for each leiden cluster and cell cycle cluster combination | |
pvals = [] | |
for j in df4.columns: | |
n = ccAFcounts[j] | |
for i in range(leideni+1): | |
if i not in df2.groupby('cluster').count()['names']: | |
N = 0 | |
else: | |
N = df2.groupby('cluster').count()['names'][i] | |
x = df4.at[i,j] | |
prb = hypergeom.cdf(x, M, n, N) | |
hyperpval = 1-prb | |
#print(hyperpval) | |
pvals.append(hyperpval) | |
df5.to_csv('markergenes/BT145_C/BT145_C_percent_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = True) | |
#print(df5) | |
pvals_bh_corrected = statsmodels.stats.multitest.multipletests(pvals, alpha=0.05, method='fdr_bh')[1] | |
np.savetxt("markergenes/BT145_C/BT145_C_corrected_pvalues_res_"+str(res0)+".csv", pvals_bh_corrected, delimiter=",",fmt='%s') | |
# Adding p-values to the cluster overlap percentage data | |
i=0 | |
new_list=[] | |
while i<len(pvals_bh_corrected): | |
new_list.append(pvals_bh_corrected[i:(i+leideni+1)]) | |
i+=(leideni+1) | |
df_pvals = pd.DataFrame(data=new_list, index=[df4.columns]).T | |
for j in df4.columns: | |
for i in range(leideni+1): | |
z = df4.at[i,j] | |
if z == 0: | |
df_pvals.at[i,j] = np.nan | |
df_pvals.columns = ['p_' + str(col) for col in df4.columns] | |
df_pvals.to_csv('markergenes/BT145_C/BT145_C_pvalues_res_'+str(res0)+'.csv', index = True) | |
#print(df_pvals) | |
df_end = df5.join(df_pvals) | |
df_end.to_csv('markergenes/BT145_C/BT145_C_a_pvalues_percentoverlap_res_'+str(res0)+'.csv', index = True) | |
#print(df_end) | |
# Finding how many clusters map to ccAF clusters | |
totclust = np.append(totclust, len(df_pvals.index)) | |
totclust = totclust[totclust is not None] | |
tmp = [] | |
tmp = df_pvals[df_pvals <= 0.05].count(axis=1) | |
sigclust = np.append(sigclust, tmp[tmp >= 1].count()) | |
sigclust = sigclust[sigclust is not None] | |
# Computing the ratio of significant clusters to total clusters at each resolution | |
ratioclust = [] | |
ratioclust = np.append(ratioclust, np.divide(sigclust, totclust)) | |
ratioclust = ratioclust[ratioclust is not None] | |
# Graphing the element centric and cluster mapping graph | |
fig = plt.figure() | |
plt.figure(figsize=(8,6)) | |
ax = plt.axes() | |
#ax.plot(resolutions, adjRandScores, label='ARS') | |
#ax.plot(resolutions, nmiScore, color='red', label='AMI') | |
#ax.plot(resolutions, amiScore, color='green', label='NMI') | |
plt.subplot(211) | |
plt.plot(resolutions, ecSim, color='orange', label='Element-centric') | |
plt.xlabel('Leiden resolution') | |
plt.ylabel('Score') | |
legend = plt.legend(loc='upper right') | |
plt.subplot(212) # plot of clusters | |
plt.plot(resolutions, sigclust.T, color='red', label='Significant Clusters') | |
plt.plot(resolutions, totclust.T, color='green', label='Total Clusters') | |
plt.xlabel('Leiden resolution') | |
plt.ylabel('Number of Clusters') | |
plt.grid(b=None) | |
legend = plt.legend(loc='upper right', bbox_to_anchor=(0.5, 1.05)) | |
plt2 = plt.twinx() | |
plt2.plot(resolutions, ratioclust.T, marker='.', label='Ratio Significant:Total') | |
plt2.set_ylabel('Ratio') | |
plt2.grid(b=None) | |
legend = plt.legend(loc='upper left', bbox_to_anchor=(0.5, 1.05)) | |
plt.savefig('figures/BT145_C/BT145_C_adjRandScores.pdf') | |
# UMAP of identified genes | |
resf = 0.45 | |
sc.tl.leiden(adata, resolution = resf) | |
from matplotlib.colors import LinearSegmentedColormap | |
sc.pl.umap(adata, color=['TIMP3', 'GFAP', 'FAM198B', 'PLAT', 'LONRF2', 'PTPRE', 'CRYAB', 'TIMP4'], color_map= LinearSegmentedColormap.from_list('abc',['gainsboro','mistyrose','firebrick'],N=100), save='_C_GFAP_res_'+str(resf)+'.pdf') #, vmin=2.5,vmax=6 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment