cplaisier/BT145_C_script.py

## BT145_C_script.py
cd /Users/baileyquinn/Desktop/BT145/

docker run -it -v "/Users/baileyquinn/Desktop/BT145:/files" cplaisier/scrna_seq_velocity

cd /files

pip3 install clusim
pip3 install statsmodels

python3

import numpy as np
import pandas as pd
import scanpy as sc
import mygene as mg
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import clusim.sim as sim
from clusim.clustering import Clustering, print_clustering
import statsmodels

fileVAR = {'C':{'n_genes':2500, 'percent_mito':0.25}}

## Load BT145_C
BT145_adatas = {}
for i in fileVAR:
    results_file = './write/BT145_' + i + '.h5ad'
    adata = sc.read_10x_mtx(
            './BT145_' + i + '/filtered_feature_bc_matrix/',
            var_names = 'gene_symbols',
            cache=True)
    adata.var_names_make_unique()
    adata
    M = adata.n_obs

    # Keeping only ccAF genes studied here
    ccAFfull = pd.read_csv("Table_S1.csv",header=1)
    ccAFgenelist = ccAFfull['gene'].values.tolist()
    ccAFset = set(ccAFgenelist)
    fullgenelist = adata.var_names.values.tolist()
    crossover = ccAFset.intersection(fullgenelist)
    df0 = ccAFfull[ccAFfull['gene'].isin(crossover)]
    df0.to_csv ('Table_S1_withinM.csv', header=True)
    ccAFcounts = df0.groupby('cluster').count()['gene']

    ## Preprocessing
    # Filter
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    adata.obs['n_counts'] = adata.X.sum(axis=1).A1
    adata.obs['n_genes'] = [i.count_nonzero() for i in adata.X]

    # Violin plots
    #sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save= '_' + i + '_scanpy.pdf')

    # Scatter plots
    #sc.pl.scatter(adata, x='n_counts', y='percent_mito', save= '_' + i + '_n_count_by_percMito.pdf')
    #sc.pl.scatter(adata, x='n_counts', y='n_genes', save= '_' + i + '_n_count_by_n_genes.pdf')

    # Filter
    adata = adata[adata.obs.n_genes > 2500, :]
    adata = adata[adata.obs.percent_mito < 0.25, :]
    print(adata.shape)
    adata.write(results_file)

    BT145_adatas[i] = adata


# Load meta data for BT145_C
meta_C = pd.read_csv('BT145_C/BT145_C_meta.data.csv', header=0, index_col=0)
meta_C.index = [i+'-1' for i in meta_C.index]

adata = BT145_adatas['C']
adata.obs['ccAF'] = meta_C.loc[adata.obs.index,'clusts_WT']

# Normalize
sc.pp.normalize_total(adata, target_sum=1e6)

# Log normalize
sc.pp.log1p(adata)

# Save raw data
adata.raw = adata

# Identify highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=4000)
sc.pl.highly_variable_genes(adata, save='_C_hvg.pdf')
adata = adata[:, adata.var.highly_variable]

# Regress out unwated noise and scale data
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
sc.pp.scale(adata, max_value=10)

# PCA
sc.tl.pca(adata, svd_solver='arpack')
#sc.pl.pca(adata, color='CST3')
#sc.pl.pca_variance_ratio(adata, log=True)
adata.write(results_file)
adata

# Compute the neighborhood graph
sc.pp.neighbors(adata) #, n_neighbors=10, n_pcs=40)

# UMAP
sc.tl.umap(adata)
#sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution' + str(res1) + '.pdf')

# Clustering and finding marker genes
resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]
adjRandScores = []
amiScore = []
nmiScore = []
ecSim = []
for res1 in resolutions:
    sc.tl.leiden(adata, resolution = res1)
    # Writing confusion matrix to CSV
    adjRandScores.append(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF']))
    amiScore.append(adjusted_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
    nmiScore.append(normalized_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
    ecSim.append(sim.element_sim(Clustering().from_membership_list(list(adata.obs['leiden'])),Clustering().from_membership_list(list(adata.obs['ccAF']))))
    # print('res = '+str(res1)+'; Adj Rand Score = '+str(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF'])))
    with open('markergenes/BT145_C/BT145_C_confusion_matrix_'+str(res1)+'.csv', 'w') as outFile:
        pd.crosstab(adata.obs['leiden'],adata.obs['ccAF']).to_csv(outFile)
    # Plot UMAP and cluster
    sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution_'+str(res1)+'.pdf')
    # Identifying Marker Genes
    sc.tl.dendrogram(adata, 'leiden')
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', corr_method='benjamini-hochberg', n_genes=10000)
    # sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_BT145_C_res_'+str(res1)+'.pdf')
    sc.tl.filter_rank_genes_groups(adata, min_fold_change=2)
    # visualize results
    sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered')
    # visualize results using dotplot and heatmap
    sc.tl.dendrogram(adata, 'leiden')
    sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
    sc.pl.rank_genes_groups_heatmap(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
    # Writing Marker Genes to CSV
    with open('markergenes/BT145_C/BT145_C_markergenes_dataframe_'+str(res1)+'.csv', 'w') as outFile:
        for x in range(max([int(i) for i in list(adata.obs['leiden'])])+1):
            df = sc.get.rank_genes_groups_df(adata, str(x), pval_cutoff=0.05, log2fc_min=1).sort_values(by='logfoldchanges', ascending=False)
            df['cluster'] = [x]*df.shape[0]
            if x == 0:
                df.to_csv(outFile, header = True)
            else:
                df.to_csv(outFile, header = False)


# Creating the ccAF dictionary, keeping an equal number of genes for each cluster
nfull = df0
nfiltered1 = nfull.avg_logFC >= 0
nfiltered2 = nfull.p_val_adj <= 0.05
nfiltered = nfiltered1 #& nfiltered2
df1 = nfull[nfiltered]
df1 = df1.sort_values("p_val_adj", axis=0, ascending=True)
ccAFbuilder = pd.DataFrame()
for c in ['Neural G0', 'Late G1', 'S', 'S/G2', 'G2/M', 'M/Early G1']:
    df1subset = df1[df1.cluster == c]
    df1subset = df1subset.head(n=28)
    ccAFbuilder = ccAFbuilder.append(df1subset)


ccAFdict = dict(zip(ccAFbuilder.gene, ccAFbuilder.cluster))


resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]

# Finding intersection of cell cycle and clusters
totclust = []
sigclust = []
for res0 in resolutions:
    df2 = pd.read_csv("markergenes/BT145_C/BT145_C_markergenes_dataframe_"+str(res0)+".csv",header=0)
    # Mapping marker genes to the cell cycle genes
    df2['cell cycle'] = df2['names'].map(ccAFdict)
    df2.to_csv('markergenes/BT145_C/BT145_C_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = False)
    # Calculating the % each leiden cluster is mapped to each cell cycle cluster
    df4 = pd.crosstab(df2['cluster'],df2['cell cycle'])
    df4 = df4.drop(columns=['Low RNA'], errors='ignore')
    coltotal = pd.DataFrame(data=None)
    leideni = max(df2['cluster'])
    #print(leideni)
    for i in range(leideni+1):
        coltotal[i] = df2[df2.cluster == i].count()
    coltotal = coltotal.drop(['Unnamed: 0','scores','logfoldchanges','pvals','pvals_adj','cluster','cell cycle']).T
    df5 = df4.div(coltotal['names'],axis='rows').mul(100).fillna(0)
    df4 = df5.div(100).mul(coltotal['names'],axis='rows').fillna(0)
    # Calculating p-value for each leiden cluster and cell cycle cluster combination
    pvals = []
    for j in df4.columns:
        n = ccAFcounts[j]
        for i in range(leideni+1):
            if i not in df2.groupby('cluster').count()['names']:
                N = 0
            else:
                N = df2.groupby('cluster').count()['names'][i]
            x = df4.at[i,j]
            prb = hypergeom.cdf(x, M, n, N)
            hyperpval = 1-prb
            #print(hyperpval)
            pvals.append(hyperpval)
    df5.to_csv('markergenes/BT145_C/BT145_C_percent_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = True)
    #print(df5)
    pvals_bh_corrected = statsmodels.stats.multitest.multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
    np.savetxt("markergenes/BT145_C/BT145_C_corrected_pvalues_res_"+str(res0)+".csv", pvals_bh_corrected, delimiter=",",fmt='%s')
    # Adding p-values to the cluster overlap percentage data
    i=0
    new_list=[]
    while i<len(pvals_bh_corrected):
        new_list.append(pvals_bh_corrected[i:(i+leideni+1)])
        i+=(leideni+1)
    df_pvals = pd.DataFrame(data=new_list, index=[df4.columns]).T
    for j in df4.columns:
        for i in range(leideni+1):
            z = df4.at[i,j]
            if z == 0:
                df_pvals.at[i,j] = np.nan
    df_pvals.columns = ['p_' + str(col) for col in df4.columns]
    df_pvals.to_csv('markergenes/BT145_C/BT145_C_pvalues_res_'+str(res0)+'.csv', index = True)
    #print(df_pvals)
    df_end = df5.join(df_pvals)
    df_end.to_csv('markergenes/BT145_C/BT145_C_a_pvalues_percentoverlap_res_'+str(res0)+'.csv', index = True)
    #print(df_end)
    # Finding how many clusters map to ccAF clusters
    totclust = np.append(totclust, len(df_pvals.index))
    totclust = totclust[totclust is not None]
    tmp = []
    tmp = df_pvals[df_pvals <= 0.05].count(axis=1)
    sigclust = np.append(sigclust, tmp[tmp >= 1].count())
    sigclust = sigclust[sigclust is not None]


# Computing the ratio of significant clusters to total clusters at each resolution
ratioclust = []
ratioclust = np.append(ratioclust, np.divide(sigclust, totclust))
ratioclust = ratioclust[ratioclust is not None]

# Graphing the element centric and cluster mapping graph
fig = plt.figure()
plt.figure(figsize=(8,6))
ax = plt.axes()
#ax.plot(resolutions, adjRandScores, label='ARS')
#ax.plot(resolutions, nmiScore, color='red', label='AMI')
#ax.plot(resolutions, amiScore, color='green', label='NMI')
plt.subplot(211)
plt.plot(resolutions, ecSim, color='orange', label='Element-centric')
plt.xlabel('Leiden resolution')
plt.ylabel('Score')
legend = plt.legend(loc='upper right')
plt.subplot(212) # plot of clusters
plt.plot(resolutions, sigclust.T, color='red', label='Significant Clusters')
plt.plot(resolutions, totclust.T, color='green', label='Total Clusters')
plt.xlabel('Leiden resolution')
plt.ylabel('Number of Clusters')
plt.grid(b=None)
legend = plt.legend(loc='upper right', bbox_to_anchor=(0.5, 1.05))
plt2 = plt.twinx()
plt2.plot(resolutions, ratioclust.T, marker='.', label='Ratio Significant:Total')
plt2.set_ylabel('Ratio')
plt2.grid(b=None)
legend = plt.legend(loc='upper left', bbox_to_anchor=(0.5, 1.05))
plt.savefig('figures/BT145_C/BT145_C_adjRandScores.pdf')

# UMAP of identified genes
resf = 0.45
sc.tl.leiden(adata, resolution = resf)

from matplotlib.colors import LinearSegmentedColormap
sc.pl.umap(adata, color=['TIMP3', 'GFAP', 'FAM198B', 'PLAT', 'LONRF2', 'PTPRE', 'CRYAB', 'TIMP4'], color_map= LinearSegmentedColormap.from_list('abc',['gainsboro','mistyrose','firebrick'],N=100), save='_C_GFAP_res_'+str(resf)+'.pdf') #, vmin=2.5,vmax=6
	cd /Users/baileyquinn/Desktop/BT145/

	docker run -it -v "/Users/baileyquinn/Desktop/BT145:/files" cplaisier/scrna_seq_velocity

	cd /files

	pip3 install clusim
	pip3 install statsmodels

	python3

	import numpy as np
	import pandas as pd
	import scanpy as sc
	import mygene as mg
	from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, adjusted_mutual_info_score
	from scipy.stats import hypergeom
	import matplotlib.pyplot as plt
	plt.style.use('seaborn-whitegrid')
	import clusim.sim as sim
	from clusim.clustering import Clustering, print_clustering
	import statsmodels

	fileVAR = {'C':{'n_genes':2500, 'percent_mito':0.25}}

	## Load BT145_C
	BT145_adatas = {}
	for i in fileVAR:
	results_file = './write/BT145_' + i + '.h5ad'
	adata = sc.read_10x_mtx(
	'./BT145_' + i + '/filtered_feature_bc_matrix/',
	var_names = 'gene_symbols',
	cache=True)
	adata.var_names_make_unique()
	adata
	M = adata.n_obs

	# Keeping only ccAF genes studied here
	ccAFfull = pd.read_csv("Table_S1.csv",header=1)
	ccAFgenelist = ccAFfull['gene'].values.tolist()
	ccAFset = set(ccAFgenelist)
	fullgenelist = adata.var_names.values.tolist()
	crossover = ccAFset.intersection(fullgenelist)
	df0 = ccAFfull[ccAFfull['gene'].isin(crossover)]
	df0.to_csv ('Table_S1_withinM.csv', header=True)
	ccAFcounts = df0.groupby('cluster').count()['gene']

	## Preprocessing
	# Filter
	sc.pp.filter_cells(adata, min_genes=200)
	sc.pp.filter_genes(adata, min_cells=3)

	mito_genes = adata.var_names.str.startswith('MT-')
	adata.obs['percent_mito'] = np.sum(adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
	adata.obs['n_counts'] = adata.X.sum(axis=1).A1
	adata.obs['n_counts'] = adata.X.sum(axis=1).A1
	adata.obs['n_genes'] = [i.count_nonzero() for i in adata.X]

	# Violin plots
	#sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'], jitter=0.4, multi_panel=True, save= '_' + i + '_scanpy.pdf')

	# Scatter plots
	#sc.pl.scatter(adata, x='n_counts', y='percent_mito', save= '_' + i + '_n_count_by_percMito.pdf')
	#sc.pl.scatter(adata, x='n_counts', y='n_genes', save= '_' + i + '_n_count_by_n_genes.pdf')

	# Filter
	adata = adata[adata.obs.n_genes > 2500, :]
	adata = adata[adata.obs.percent_mito < 0.25, :]
	print(adata.shape)
	adata.write(results_file)

	BT145_adatas[i] = adata


	# Load meta data for BT145_C
	meta_C = pd.read_csv('BT145_C/BT145_C_meta.data.csv', header=0, index_col=0)
	meta_C.index = [i+'-1' for i in meta_C.index]

	adata = BT145_adatas['C']
	adata.obs['ccAF'] = meta_C.loc[adata.obs.index,'clusts_WT']

	# Normalize
	sc.pp.normalize_total(adata, target_sum=1e6)

	# Log normalize
	sc.pp.log1p(adata)

	# Save raw data
	adata.raw = adata

	# Identify highly variable genes
	sc.pp.highly_variable_genes(adata, n_top_genes=4000)
	sc.pl.highly_variable_genes(adata, save='_C_hvg.pdf')
	adata = adata[:, adata.var.highly_variable]

	# Regress out unwated noise and scale data
	sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
	sc.pp.scale(adata, max_value=10)

	# PCA
	sc.tl.pca(adata, svd_solver='arpack')
	#sc.pl.pca(adata, color='CST3')
	#sc.pl.pca_variance_ratio(adata, log=True)
	adata.write(results_file)
	adata

	# Compute the neighborhood graph
	sc.pp.neighbors(adata) #, n_neighbors=10, n_pcs=40)

	# UMAP
	sc.tl.umap(adata)
	#sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution' + str(res1) + '.pdf')

	# Clustering and finding marker genes
	resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]
	adjRandScores = []
	amiScore = []
	nmiScore = []
	ecSim = []
	for res1 in resolutions:
	sc.tl.leiden(adata, resolution = res1)
	# Writing confusion matrix to CSV
	adjRandScores.append(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF']))
	amiScore.append(adjusted_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
	nmiScore.append(normalized_mutual_info_score(adata.obs['leiden'],adata.obs['ccAF']))
	ecSim.append(sim.element_sim(Clustering().from_membership_list(list(adata.obs['leiden'])),Clustering().from_membership_list(list(adata.obs['ccAF']))))
	# print('res = '+str(res1)+'; Adj Rand Score = '+str(adjusted_rand_score(adata.obs['leiden'],adata.obs['ccAF'])))
	with open('markergenes/BT145_C/BT145_C_confusion_matrix_'+str(res1)+'.csv', 'w') as outFile:
	pd.crosstab(adata.obs['leiden'],adata.obs['ccAF']).to_csv(outFile)
	# Plot UMAP and cluster
	sc.pl.umap(adata, color=['leiden'], save='_BT145_C_clusters_resolution_'+str(res1)+'.pdf')
	# Identifying Marker Genes
	sc.tl.dendrogram(adata, 'leiden')
	sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon', corr_method='benjamini-hochberg', n_genes=10000)
	# sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_BT145_C_res_'+str(res1)+'.pdf')
	sc.tl.filter_rank_genes_groups(adata, min_fold_change=2)
	# visualize results
	sc.pl.rank_genes_groups(adata, key='rank_genes_groups_filtered')
	# visualize results using dotplot and heatmap
	sc.tl.dendrogram(adata, 'leiden')
	sc.pl.rank_genes_groups_dotplot(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
	sc.pl.rank_genes_groups_heatmap(adata, key='rank_genes_groups_filtered', save='_BT145_C_res_'+str(res1)+'.pdf')
	# Writing Marker Genes to CSV
	with open('markergenes/BT145_C/BT145_C_markergenes_dataframe_'+str(res1)+'.csv', 'w') as outFile:
	for x in range(max([int(i) for i in list(adata.obs['leiden'])])+1):
	df = sc.get.rank_genes_groups_df(adata, str(x), pval_cutoff=0.05, log2fc_min=1).sort_values(by='logfoldchanges', ascending=False)
	df['cluster'] = [x]*df.shape[0]
	if x == 0:
	df.to_csv(outFile, header = True)
	else:
	df.to_csv(outFile, header = False)


	# Creating the ccAF dictionary, keeping an equal number of genes for each cluster
	nfull = df0
	nfiltered1 = nfull.avg_logFC >= 0
	nfiltered2 = nfull.p_val_adj <= 0.05
	nfiltered = nfiltered1 #& nfiltered2
	df1 = nfull[nfiltered]
	df1 = df1.sort_values("p_val_adj", axis=0, ascending=True)
	ccAFbuilder = pd.DataFrame()
	for c in ['Neural G0', 'Late G1', 'S', 'S/G2', 'G2/M', 'M/Early G1']:
	df1subset = df1[df1.cluster == c]
	df1subset = df1subset.head(n=28)
	ccAFbuilder = ccAFbuilder.append(df1subset)


	ccAFdict = dict(zip(ccAFbuilder.gene, ccAFbuilder.cluster))


	resolutions = [0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]

	# Finding intersection of cell cycle and clusters
	totclust = []
	sigclust = []
	for res0 in resolutions:
	df2 = pd.read_csv("markergenes/BT145_C/BT145_C_markergenes_dataframe_"+str(res0)+".csv",header=0)
	# Mapping marker genes to the cell cycle genes
	df2['cell cycle'] = df2['names'].map(ccAFdict)
	df2.to_csv('markergenes/BT145_C/BT145_C_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = False)
	# Calculating the % each leiden cluster is mapped to each cell cycle cluster
	df4 = pd.crosstab(df2['cluster'],df2['cell cycle'])
	df4 = df4.drop(columns=['Low RNA'], errors='ignore')
	coltotal = pd.DataFrame(data=None)
	leideni = max(df2['cluster'])
	#print(leideni)
	for i in range(leideni+1):
	coltotal[i] = df2[df2.cluster == i].count()
	coltotal = coltotal.drop(['Unnamed: 0','scores','logfoldchanges','pvals','pvals_adj','cluster','cell cycle']).T
	df5 = df4.div(coltotal['names'],axis='rows').mul(100).fillna(0)
	df4 = df5.div(100).mul(coltotal['names'],axis='rows').fillna(0)
	# Calculating p-value for each leiden cluster and cell cycle cluster combination
	pvals = []
	for j in df4.columns:
	n = ccAFcounts[j]
	for i in range(leideni+1):
	if i not in df2.groupby('cluster').count()['names']:
	N = 0
	else:
	N = df2.groupby('cluster').count()['names'][i]
	x = df4.at[i,j]
	prb = hypergeom.cdf(x, M, n, N)
	hyperpval = 1-prb
	#print(hyperpval)
	pvals.append(hyperpval)
	df5.to_csv('markergenes/BT145_C/BT145_C_percent_markergenes_with_cellcycles_res_'+str(res0)+'.csv', index = True)
	#print(df5)
	pvals_bh_corrected = statsmodels.stats.multitest.multipletests(pvals, alpha=0.05, method='fdr_bh')[1]
	np.savetxt("markergenes/BT145_C/BT145_C_corrected_pvalues_res_"+str(res0)+".csv", pvals_bh_corrected, delimiter=",",fmt='%s')
	# Adding p-values to the cluster overlap percentage data
	i=0
	new_list=[]
	while i<len(pvals_bh_corrected):
	new_list.append(pvals_bh_corrected[i:(i+leideni+1)])
	i+=(leideni+1)
	df_pvals = pd.DataFrame(data=new_list, index=[df4.columns]).T
	for j in df4.columns:
	for i in range(leideni+1):
	z = df4.at[i,j]
	if z == 0:
	df_pvals.at[i,j] = np.nan
	df_pvals.columns = ['p_' + str(col) for col in df4.columns]
	df_pvals.to_csv('markergenes/BT145_C/BT145_C_pvalues_res_'+str(res0)+'.csv', index = True)
	#print(df_pvals)
	df_end = df5.join(df_pvals)
	df_end.to_csv('markergenes/BT145_C/BT145_C_a_pvalues_percentoverlap_res_'+str(res0)+'.csv', index = True)
	#print(df_end)
	# Finding how many clusters map to ccAF clusters
	totclust = np.append(totclust, len(df_pvals.index))
	totclust = totclust[totclust is not None]
	tmp = []
	tmp = df_pvals[df_pvals <= 0.05].count(axis=1)
	sigclust = np.append(sigclust, tmp[tmp >= 1].count())
	sigclust = sigclust[sigclust is not None]


	# Computing the ratio of significant clusters to total clusters at each resolution
	ratioclust = []
	ratioclust = np.append(ratioclust, np.divide(sigclust, totclust))
	ratioclust = ratioclust[ratioclust is not None]

	# Graphing the element centric and cluster mapping graph
	fig = plt.figure()
	plt.figure(figsize=(8,6))
	ax = plt.axes()
	#ax.plot(resolutions, adjRandScores, label='ARS')
	#ax.plot(resolutions, nmiScore, color='red', label='AMI')
	#ax.plot(resolutions, amiScore, color='green', label='NMI')
	plt.subplot(211)
	plt.plot(resolutions, ecSim, color='orange', label='Element-centric')
	plt.xlabel('Leiden resolution')
	plt.ylabel('Score')
	legend = plt.legend(loc='upper right')
	plt.subplot(212) # plot of clusters
	plt.plot(resolutions, sigclust.T, color='red', label='Significant Clusters')
	plt.plot(resolutions, totclust.T, color='green', label='Total Clusters')
	plt.xlabel('Leiden resolution')
	plt.ylabel('Number of Clusters')
	plt.grid(b=None)
	legend = plt.legend(loc='upper right', bbox_to_anchor=(0.5, 1.05))
	plt2 = plt.twinx()
	plt2.plot(resolutions, ratioclust.T, marker='.', label='Ratio Significant:Total')
	plt2.set_ylabel('Ratio')
	plt2.grid(b=None)
	legend = plt.legend(loc='upper left', bbox_to_anchor=(0.5, 1.05))
	plt.savefig('figures/BT145_C/BT145_C_adjRandScores.pdf')

	# UMAP of identified genes
	resf = 0.45
	sc.tl.leiden(adata, resolution = resf)

	from matplotlib.colors import LinearSegmentedColormap
	sc.pl.umap(adata, color=['TIMP3', 'GFAP', 'FAM198B', 'PLAT', 'LONRF2', 'PTPRE', 'CRYAB', 'TIMP4'], color_map= LinearSegmentedColormap.from_list('abc',['gainsboro','mistyrose','firebrick'],N=100), save='_C_GFAP_res_'+str(resf)+'.pdf') #, vmin=2.5,vmax=6