Skip to content

Instantly share code, notes, and snippets.

@cplaisier
Created May 14, 2020 23:51
Show Gist options
  • Save cplaisier/5dce1411fbc2846a5335698ff0847dfb to your computer and use it in GitHub Desktop.
Save cplaisier/5dce1411fbc2846a5335698ff0847dfb to your computer and use it in GitHub Desktop.
cd /Users/baileyquinn/Desktop/BT145/
docker run -it -v "/Users/baileyquinn/Desktop/BT145:/files" cplaisier/scrna_seq_velocity
cd /files
pip3 install clusim
pip3 install statsmodels
python3
import numpy as np
import pandas as pd
import scanpy as sc
import mygene as mg
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import clusim.sim as sim
from clusim.clustering import Clustering, print_clustering
import statsmodels
fileVAR = {'C':{'n_genes':2500, 'percent_mito':0.25}}
## Load BT145_C
BT145_adatas = {}
for i in fileVAR:
results_file = './write/BT145_' + i + '.h5ad'
adata = sc.read_10x_mtx(
'./BT145_' + i + '/filtered_feature_bc_matrix/',
var_names = 'gene_symbols',
cache=True)
adata.var_names_make_unique()
adata
M = adata.n_obs
# Keeping only ccAF genes studied here
ccAFfull = pd.read_csv("Table_S1.csv",header=1)
ccAFgenelist = ccAFfull['gene'].values.tolist()
ccAFset = set(ccAFgenelist)
fullgenelist = adata.var_names.values.tolist()
crossover = ccAFset.intersection(fullgenelist)
df0 = ccAFfull[ccAFfull['gene'].isin(crossover)]
df0.to_csv ('Table_S1_withinM.csv', header=True)
ccAFcounts = df0.groupby('cluster').count()['gene']
## Preprocessing
# Filter
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
mito_genes = adata.var_names.str.startswith('MT-')
adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
adata.obs['n_genes'] = [i.count_nonzero() for i in adata.X]
# Violin plots
#sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save= '_' + i + '_scanpy.pdf')
# Scatter plots
#sc.pl.scatter(adata, x='n_counts', y='percent_mito', save= '_' + i + '_n_count_by_percMito.pdf')
#sc.pl.scatter(adata, x='n_counts', y='n_genes', save= '_' + i + '_n_count_by_n_genes.pdf')
# Filter
adata = adata[adata.obs.n_genes > 2500, :]
adata = adata[adata.obs.percent_mito < 0.25, :]
print(adata.shape)
adata.write(results_file)
BT145_adatas[i] = adata
# Load meta data for BT145_C
meta_C = pd.read_csv('BT145_C/BT145_C_meta.data.csv', header=0, index_col=0)
meta_C.index = [i+'-1' for i in meta_C.index]
adata = BT145_adatas['C']
adata.obs['ccAF'] = meta_C.loc[adata.obs.index,'clusts_WT']
# Normalize
sc.pp.normalize_total(adata, target_sum=1e6)
# Log normalize
sc.pp.log1p(adata)
# Save raw data
adata.raw = adata
# Identify highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=4000)
sc.pl.highly_variable_genes(adata, save='_C_hvg.pdf')
adata = adata[:, adata.var.highly_variable]
# Regress out unwated noise and scale data
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
sc.pp.scale(adata, max_value=10)
# PCA
sc.tl.pca(adata, svd_solver='arpack')
#sc.pl.pca(adata, color='CST3')
#sc.pl.pca_variance_ratio(adata, log=True)
adata.write(results_file)
adata
# Compute the neighborhood graph
sc.pp.neighbors(adata) #, n_neighbors=10, n_pcs=40)
# UMAP
sc.tl.umap(adata)
#sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution' + str(res1) + '.pdf')
# Clustering and finding marker genes
resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]
adjRandScores = []
amiScore = []
nmiScore = []
ecSim = []
for res1 in resolutions:
sc.tl.leiden(adata, resolution = res1)
# Writing confusion matrix to CSV
adjRandScores.append(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF']))
amiScore.append(adjusted_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
nmiScore.append(normalized_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
ecSim.append(sim.element_sim(Clustering().from_membership_list(list(adata.obs['leiden'])),Clustering().from_membership_list(list(adata.obs['ccAF']))))
# print('res = '+str(res1)+'; Adj Rand Score = '+str(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF'])))
with open('markergenes/BT145_C/BT145_C_confusion_matrix_'+str(res1)+'.csv', 'w') as outFile:
pd.crosstab(adata.obs['leiden'],adata.obs['ccAF']).to_csv(outFile)
# Plot UMAP and cluster
sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution_'+str(res1)+'.pdf')
# Identifying Marker Genes
sc.tl.dendrogram(adata, 'leiden')
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', corr_method='benjamini-hochberg', n_genes=10000)
# sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_BT145_C_res_'+str(res1)+'.pdf')
sc.tl.filter_rank_genes_groups(adata, min_fold_change=2)
# visualize results
sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered')
# visualize results using dotplot and heatmap
sc.tl.dendrogram(adata, 'leiden')
sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
sc.pl.rank_genes_groups_heatmap(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
# Writing Marker Genes to CSV
with open('markergenes/BT145_C/BT145_C_markergenes_dataframe_'+str(res1)+'.csv', 'w') as outFile:
for x in range(max([int(i) for i in list(adata.obs['leiden'])])+1):
df = sc.get.rank_genes_groups_df(adata, str(x), pval_cutoff=0.05, log2fc_min=1).sort_values(by='logfoldchanges', ascending=False)
df['cluster'] = [x]*df.shape[0]
if x == 0:
df.to_csv(outFile, header = True)
else:
df.to_csv(outFile, header = False)
# Creating the ccAF dictionary, keeping an equal number of genes for each cluster
nfull = df0
nfiltered1 = nfull.avg_logFC >= 0
nfiltered2 = nfull.p_val_adj <= 0.05
nfiltered = nfiltered1 #& nfiltered2
df1 = nfull[nfiltered]
df1 = df1.sort_values("p_val_adj", axis=0, ascending=True)
ccAFbuilder = pd.DataFrame()
for c in ['Neural G0', 'Late G1', 'S', 'S/G2', 'G2/M', 'M/Early G1']:
df1subset = df1[df1.cluster == c]
df1subset = df1subset.head(n=28)
ccAFbuilder = ccAFbuilder.append(df1subset)
ccAFdict = dict(zip(ccAFbuilder.gene, ccAFbuilder.cluster))
resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]
# Finding intersection of cell cycle and clusters
totclust = []
sigclust = []
for res0 in resolutions:
df2 = pd.read_csv("markergenes/BT145_C/BT145_C_markergenes_dataframe_"+str(res0)+".csv",header=0)
# Mapping marker genes to the cell cycle genes
df2['cell cycle'] = df2['names'].map(ccAFdict)
df2.to_csv('markergenes/BT145_C/BT145_C_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = False)
# Calculating the % each leiden cluster is mapped to each cell cycle cluster
df4 = pd.crosstab(df2['cluster'],df2['cell cycle'])
df4 = df4.drop(columns=['Low RNA'], errors='ignore')
coltotal = pd.DataFrame(data=None)
leideni = max(df2['cluster'])
#print(leideni)
for i in range(leideni+1):
coltotal[i] = df2[df2.cluster == i].count()
coltotal = coltotal.drop(['Unnamed: 0','scores','logfoldchanges','pvals','pvals_adj','cluster','cell cycle']).T
df5 = df4.div(coltotal['names'],axis='rows').mul(100).fillna(0)
df4 = df5.div(100).mul(coltotal['names'],axis='rows').fillna(0)
# Calculating p-value for each leiden cluster and cell cycle cluster combination
pvals = []
for j in df4.columns:
n = ccAFcounts[j]
for i in range(leideni+1):
if i not in df2.groupby('cluster').count()['names']:
N = 0
else:
N = df2.groupby('cluster').count()['names'][i]
x = df4.at[i,j]
prb = hypergeom.cdf(x, M, n, N)
hyperpval = 1-prb
#print(hyperpval)
pvals.append(hyperpval)
df5.to_csv('markergenes/BT145_C/BT145_C_percent_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = True)
#print(df5)
pvals_bh_corrected = statsmodels.stats.multitest.multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
np.savetxt("markergenes/BT145_C/BT145_C_corrected_pvalues_res_"+str(res0)+".csv", pvals_bh_corrected, delimiter=",",fmt='%s')
# Adding p-values to the cluster overlap percentage data
i=0
new_list=[]
while i<len(pvals_bh_corrected):
new_list.append(pvals_bh_corrected[i:(i+leideni+1)])
i+=(leideni+1)
df_pvals = pd.DataFrame(data=new_list, index=[df4.columns]).T
for j in df4.columns:
for i in range(leideni+1):
z = df4.at[i,j]
if z == 0:
df_pvals.at[i,j] = np.nan
df_pvals.columns = ['p_' + str(col) for col in df4.columns]
df_pvals.to_csv('markergenes/BT145_C/BT145_C_pvalues_res_'+str(res0)+'.csv', index = True)
#print(df_pvals)
df_end = df5.join(df_pvals)
df_end.to_csv('markergenes/BT145_C/BT145_C_a_pvalues_percentoverlap_res_'+str(res0)+'.csv', index = True)
#print(df_end)
# Finding how many clusters map to ccAF clusters
totclust = np.append(totclust, len(df_pvals.index))
totclust = totclust[totclust is not None]
tmp = []
tmp = df_pvals[df_pvals <= 0.05].count(axis=1)
sigclust = np.append(sigclust, tmp[tmp >= 1].count())
sigclust = sigclust[sigclust is not None]
# Computing the ratio of significant clusters to total clusters at each resolution
ratioclust = []
ratioclust = np.append(ratioclust, np.divide(sigclust, totclust))
ratioclust = ratioclust[ratioclust is not None]
# Graphing the element centric and cluster mapping graph
fig = plt.figure()
plt.figure(figsize=(8,6))
ax = plt.axes()
#ax.plot(resolutions, adjRandScores, label='ARS')
#ax.plot(resolutions, nmiScore, color='red', label='AMI')
#ax.plot(resolutions, amiScore, color='green', label='NMI')
plt.subplot(211)
plt.plot(resolutions, ecSim, color='orange', label='Element-centric')
plt.xlabel('Leiden resolution')
plt.ylabel('Score')
legend = plt.legend(loc='upper right')
plt.subplot(212) # plot of clusters
plt.plot(resolutions, sigclust.T, color='red', label='Significant Clusters')
plt.plot(resolutions, totclust.T, color='green', label='Total Clusters')
plt.xlabel('Leiden resolution')
plt.ylabel('Number of Clusters')
plt.grid(b=None)
legend = plt.legend(loc='upper right', bbox_to_anchor=(0.5, 1.05))
plt2 = plt.twinx()
plt2.plot(resolutions, ratioclust.T, marker='.', label='Ratio Significant:Total')
plt2.set_ylabel('Ratio')
plt2.grid(b=None)
legend = plt.legend(loc='upper left', bbox_to_anchor=(0.5, 1.05))
plt.savefig('figures/BT145_C/BT145_C_adjRandScores.pdf')
# UMAP of identified genes
resf = 0.45
sc.tl.leiden(adata, resolution = resf)
from matplotlib.colors import LinearSegmentedColormap
sc.pl.umap(adata, color=['TIMP3', 'GFAP', 'FAM198B', 'PLAT', 'LONRF2', 'PTPRE', 'CRYAB', 'TIMP4'], color_map= LinearSegmentedColormap.from_list('abc',['gainsboro','mistyrose','firebrick'],N=100), save='_C_GFAP_res_'+str(resf)+'.pdf') #, vmin=2.5,vmax=6
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment