Created January 29, 2022 20:48
PheKnowLator Preeclampsia Evaluation
# import needed libraries
import matplotlib.pyplot as plt
import matplotlib'ggplot')
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import pickle
import random
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
def similarity_search(matrix, index_pat, top_n):
    """Rank every other row of ``matrix`` by cosine similarity to one index row.

    The cosine similarity between row ``index_pat`` and all rows of ``matrix`` is computed, the rows are
    ordered from most to least similar, the index row itself is removed, and the first ``top_n`` rows of
    that ranking are returned.

    The original documentation describes the rows as patients in a TF-IDF weighted patient-by-concept
    matrix; within this file the function is called with a matrix of knowledge graph embeddings.

    :param matrix: a 2D array-like or sparse matrix supporting row slicing, one row per entity
    :param index_pat: an integer row position in ``matrix`` identifying the index entity
    :param top_n: an integer representing the number of similar rows to return
    :return:
        similar_patients: a list of ``(row_index, cosine_similarity)`` tuples ordered from most to least
        similar; the index row is never included
    """

    # calculate similarity (slicing keeps the index row 2D, as cosine_similarity requires)
    cosine_similarities = cosine_similarity(matrix[index_pat:index_pat + 1], matrix).flatten()

    # order rows from most to least similar and drop the index row itself
    rel_pat_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index_pat]

    # only build result tuples for the rows that are actually returned
    similar_patients = [(patient, cosine_similarities[patient]) for patient in rel_pat_indices[0:top_n]]

    return similar_patients
# the knowledge graph domains that annotations are grouped and reported by
_DOMAINS = ['Drugs', 'Diseases', 'Genes', 'Pathways', 'Phenotypes', 'Gene Ontology']


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""

    # NOTE: unpickling can execute arbitrary code -- only use this on files produced by this project
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _group_hits(matches, gene_label, embedding_idx, embedding_id_grp, kg_node_labels):
    """Organize similarity search results for one gene by the domain of each matched node.

    :param matches: a list of ``(row_index, score)`` tuples as returned by ``similarity_search``
    :param gene_label: the label of the gene the matches were retrieved for
    :param embedding_idx: a dict mapping matrix row index to node identifier
    :param embedding_id_grp: a dict mapping matrix row index to node group
    :param kg_node_labels: a dict mapping node identifier to node label
    :return:
        hit_dict: a dict keyed by node group where each value is a list of ``[match_id, match_label, score]``
        lists (score rounded to 3 decimals), plus a ``'gene_label'`` key holding ``gene_label``
    """

    hit_dict = {}
    for row, score in matches:
        node_grp = embedding_id_grp[row]
        if node_grp == 'Edge':
            continue  # edges (relations) are not reported as annotations
        match_id = embedding_idx[row]
        hit_dict.setdefault(node_grp, []).append([match_id, kg_node_labels[match_id], round(score, 3)])
    hit_dict['gene_label'] = gene_label

    return hit_dict


def _load_or_compute_tsne(embeddings, cache_path):
    """Return 2D t-SNE coordinates for ``embeddings``, using ``cache_path`` as an on-disk cache.

    The source computed the projection and then unconditionally loaded a cached ``.npy`` file. Here the cache
    is used when it exists and has one row per embedding; otherwise the projection is computed (PCA to 25
    components followed by t-SNE) and written to ``cache_path``.

    :param embeddings: a list of equal-length numeric lists, one per point
    :param cache_path: path of the ``.npy`` file used to cache the projection
    :return: an array with one ``(x, y)`` row per embedding
    """

    try:
        cached = np.load(cache_path)
    except FileNotFoundError:
        cached = None
    if cached is not None and len(cached) == len(embeddings):
        return cached

    X_reduced = PCA(n_components=25, random_state=1).fit_transform(embeddings)
    # X_reduced = TruncatedSVD(n_components=50, random_state=1, algorithm='arpack').fit_transform(matrix)
    X_embedded = TSNE(n_components=2, random_state=1, verbose=True, perplexity=50.0).fit_transform(X_reduced)
    np.save(cache_path, X_embedded)

    return X_embedded


def _plot_tsne(X_embedded, point_groups, out_path):
    """Draw the t-SNE scatter plot colored by node group and save it to ``out_path``.

    :param X_embedded: an array with one ``(x, y)`` row per point
    :param point_groups: a list of group names, one per row of ``X_embedded``
    :param out_path: path of the image file to write
    """

    # set up colors and legend labels
    names = {'Edge': 'Edge', 'Gene Ontology': 'Gene Ontology', 'Drugs': 'Drugs', 'Diseases': 'Diseases',
             'Pathways': 'Pathways', 'Genes': 'Genes', 'Phenotypes': 'Phenotypes', 'PE': 'PE Ignorome'}
    colors = {'Diseases': 'paleturquoise', 'Drugs': 'lavenderblush', 'Gene Ontology': 'palegreen',
              'Genes': 'lightgrey', 'Pathways': 'mistyrose', 'Phenotypes': 'lavender', 'PE Ignorome': 'cornsilk'}
    # marker edge color per group; groups missing from this table (e.g. 'Edge') are not drawn
    edge_colors = {'Gene Ontology': 'forestgreen', 'Genes': 'dimgray', 'Drugs': 'hotpink', 'Phenotypes': 'purple',
                   'Pathways': 'crimson', 'Diseases': 'deepskyblue'}
    legend_handles = [
        mpatches.Patch(color='deepskyblue', label='Disease'), mpatches.Patch(color='pink', label='Drug'),
        mpatches.Patch(color='lightgreen', label='GO'), mpatches.Patch(color='dimgray', label='Gene'),
        mpatches.Patch(color='crimson', label='Pathway'), mpatches.Patch(color='purple', label='Phenotype'),
        mpatches.Patch(color='goldenrod', label='Ignorome')]

    # create data frame holding the 2D coordinates and the group of every point
    df = pd.DataFrame(dict(x=X_embedded[:, 0], y=X_embedded[:, 1], group=list(point_groups)))
    fig, ax = plt.subplots(figsize=(13, 10))
    for name, grp in df.groupby('group'):
        if name == 'PE':
            # ignorome genes are drawn as large stars so they stand out
            ax.plot(grp.x, grp.y, marker='*', linestyle='', ms=12, label=names['PE'],
                    color=colors['PE Ignorome'], mec='darkgoldenrod', alpha=0.6)
        elif name in edge_colors:
            ax.plot(grp.x, grp.y, marker='o', linestyle='', ms=4, label=names[name], color=colors[name],
                    mec=edge_colors[name], alpha=0.6)
    plt.legend(handles=legend_handles, fontsize=12, frameon=False, loc="lower center", ncol=7)
    m0 = -55; m1 = 55
    plt.ylim(m0, m1); plt.xlim(m0, m1)
    plt.savefig(out_path, bbox_inches='tight'); plt.close()


def _task1_ignorome_similarity():
    """TASK 1 - Show that you can better understand ignorome genes using the pkt embeddings.

    Reads the knowledge graph embeddings and the preeclampsia gene lists, plots a t-SNE projection, retrieves
    the most similar knowledge graph nodes for every ignorome gene and writes one "most similar" and one
    "most frequent" report per domain under ``pkt_validation/``.
    """

    # read in kg embeddings
    kg_file_dir = 'resources/releases/v1.0.0/Dissertation/PheKnowLator Evaluation/knowledge/'
    kg = pd.read_pickle(kg_file_dir + 'ALL_KG_res_df')
    kg_labels = pd.read_csv(kg_file_dir + 'KG_Node_Metadata.csv', header=0)
    kg = kg.merge(kg_labels[['id', 'label']], on='id', how='left').reset_index(drop=True)
    # from rdflib.namespace import OWL, RDF, RDFS # type: ignore
    # from rdflib import BNode, Graph, Literal, Namespace, URIRef # type: ignore
    # graph = Graph().parse(kg_file_dir + 'label_files/go_with_imports.owl')
    # go_typing = {
    # x: str(list(graph.objects(x, URIRef('')))[0])
    # for x in set(x for x in graph.subjects(RDF.type, OWL.Class)) if str(x).split('/')[-1].startswith('GO')}

    # create subsets
    kg_no_genes = kg[kg['grp'] != 'Genes'].reset_index(drop=True)
    kg_no_genes['grp2'] = 'Entity'
    kg_genes = kg[kg['grp'] == 'Genes'].reset_index(drop=True)
    kg_genes['grp2'] = 'Genes'

    # ignorome genes
    file_dir = 'resources/releases/v1.0.0/Dissertation/PheKnowLator Evaluation/processed_results/PE_GeneExpression/'
    ignorome_genes = pd.read_csv(file_dir + 'pe_ignorome.csv', sep=',', header=0)
    ig_genes = ignorome_genes[ignorome_genes['IGNOROME'].notna()]['IGNOROME'].astype(int).astype(str).to_frame()
    kw_genes = ignorome_genes[ignorome_genes['KNOW'].notna()]['KNOW'].astype(int).astype(str).to_frame()
    # the source also read the 'BOTH' column into kw_genes, which replaced the 'KNOW' genes and broke the
    # merge on 'KNOW' below; that column is only needed by the sampling set-up noted in task 2
    ignorome_ids = set(ig_genes['IGNOROME'])  # built once; membership is tested for every KG row

    # map embeddings to kg identifiers
    ig_gene_embeddings = ig_genes.merge(kg, left_on='IGNOROME', right_on='id', how='left').reset_index(drop=True)
    ig_gene_embeddings['grp2'] = 'Ignorome'
    kw_gene_embeddings = kw_genes.merge(kg, left_on='KNOW', right_on='id', how='left').reset_index(drop=True)
    kw_gene_embeddings['grp2'] = 'Knowome'

    # candidate t-SNE inputs: each data frame is paired with the file its projection is cached in
    # NOTE(review): pairing the full KG with 'pca_pkt_embeddings_2d' is an assumption based on the source's
    # garbled save call -- confirm the file name and directory
    tsne_inputs = {
        'kg_no_genes': (pd.concat([kg_no_genes, ig_gene_embeddings, kw_gene_embeddings]), 'ignorome_all_tsne.npy'),
        'kg_all_genes': (pd.concat([kg_genes, ig_gene_embeddings, kw_gene_embeddings]),
                         'ignorome_pe_all_genes_tsne.npy'),
        'pe_genes': (pd.concat([ig_gene_embeddings, kw_gene_embeddings]), 'ignorome_pe_tsne.npy'),
        'kg': (kg, 'pca_pkt_embeddings_2d.npy')}
    # 'pe_genes' is the input the source left active (its matrix was assigned last and its cache was loaded)
    tsne_frame, tsne_cache = tsne_inputs['pe_genes']

    # prep data for t-SNE; group labels come from the same frame so they always line up with the points
    matrix = [list(x) for x in tsne_frame['embeds']]
    point_groups = ['PE' if node_id in ignorome_ids else grp
                    for node_id, grp in zip(tsne_frame['id'], tsne_frame['grp'])]

    # create t-sne and plot it
    X_embedded = _load_or_compute_tsne(matrix, file_dir + tsne_cache)
    _plot_tsne(X_embedded, point_groups, file_dir + 'PKT_PEIgnorome_tSNE.png')

    # create dicts to help with using matrix indices
    embedding_idx = kg['id'].to_dict(); embedding_id_grp = kg['grp'].to_dict()
    pe_gene_idx = {row: node_id for row, node_id in embedding_idx.items() if node_id in ignorome_ids}
    non_edges = kg[kg['grp'] != 'Edge']
    kg_node_labels = dict(zip(non_edges['id'], non_edges['label']))

    # convert embeddings to compressed sparse matrix
    kg_matrix = csr_matrix([list(x) for x in kg['embeds']])

    # find similar entities: the 100 best matches among the top 200 that are not ignorome genes themselves
    similar_entities = {}
    for gene_idx, gene_id in tqdm(pe_gene_idx.items()):
        gene_label = kg_node_labels[gene_id]
        matches = [x for x in similarity_search(kg_matrix, gene_idx, 200) if x[0] not in pe_gene_idx][0:100]
        # NOTE(review): kept from the source -- this stops processing ALL remaining genes as soon as one gene
        # has fewer than 100 usable matches; confirm that 'continue' was not intended
        if len(matches) < 100:
            break
        similar_entities[gene_id] = _group_hits(matches, gene_label, embedding_idx, embedding_id_grp,
                                                kg_node_labels)

    # tally, per domain, how often each entity was retrieved and for which genes
    annotation_counts = {domain: {} for domain in _DOMAINS}
    for gene_id, gene_hits in tqdm(similar_entities.items()):
        gene_tag = '{} ({})'.format(gene_id, gene_hits['gene_label'])
        for domain, entries in gene_hits.items():
            if domain == 'gene_label':
                continue
            for entry in entries:
                match_id, score = entry[0], entry[-1]
                record = annotation_counts[domain].get(match_id)
                if record is None:
                    annotation_counts[domain][match_id] = {'scores': [score], 'counts': 1, 'genes': [gene_tag]}
                else:
                    # the source lost its 'else' here and reset every record to a count of 1
                    record['counts'] += 1
                    record['scores'].append(score)
                    record['genes'].append(gene_tag)

    # write the per-domain reports ('*' separated); every retrieved entity is listed in retrieval order
    # NOTE(review): the source's score and count filters compared each value against the full list of
    # values, so they never removed anything; they are omitted here and the output is unchanged
    for domain in _DOMAINS:
        entries = annotation_counts[domain]
        with open(file_dir + 'pkt_validation/pe_pkt_most_similar_{}.txt'.format(domain), 'w') as f:
            for ent_id, record in entries.items():
                # the reported score is the first one recorded for the entity
                f.write(str(ent_id) + '*' + str(kg_node_labels[ent_id]) + '*' + str(record['scores'][0]) + '*'
                        + ', '.join(record['genes']) + '\n')
        # printing most frequent by count
        with open(file_dir + 'pkt_validation/pe_pkt_most_frequent_{}.txt'.format(domain), 'w') as f:
            for ent_id, record in entries.items():
                f.write(str(ent_id) + '*' + str(kg_node_labels[ent_id]) + '*' + str(record['counts']) + '\n')


def _task2_enrichment_testing():
    """TASK 2 - Performing Modified Enrichment Analysis and Statistical Testing.

    Retrieves similar knowledge graph nodes for randomly sampled (non-PE) genes, reads the pickled sample
    runs, keeps the best similarity score per entity and domain, and reports how many expert reviewed
    annotations appear among the top ranked entities of each domain.
    """

    write_loc = 'resources/releases/v1.0.0/Dissertation/PheKnowLator Evaluation/processed_results/pe_gene_testing/'
    # NOTE(review): the 'Drugs' entry was truncated in the source (its closing line is missing), so one or
    # more identifiers may be absent from this set -- confirm against the expert reviewed list
    gold_standard_annotations = {
        'Diseases': {'DOID_9870', 'DOID_1700', 'DOID_681', 'DOID_332', 'DOID_1094', 'DOID_5158', 'DOID_1926',
                     'DOID_9452'},
        'Drugs': {'D005486', 'C445526', 'C010634', 'D004962', 'C059714', 'D006003', 'C098288', 'D018927',
                  'D008277'},
        'Genes': {'5352', '27094', '10576', '4709', '64983', '3047', '5931', '148867', '204962', '58480'},
        'Gene Ontology': {'GO_0070125', 'GO_0006882', 'GO_0005747', 'GO_0031462', 'GO_0005833', 'GO_0005516',
                          'GO_0007062', 'GO_0042393', 'GO_0000398', 'GO_0051015'},
        'Pathways': {'R-HSA-6799198', 'R-HSA-5419276', 'R-HSA-611105', 'R-HSA-1566948', 'R-HSA-3906995',
                     'R-HSA-194840', 'R-HSA-391251', 'R-HSA-2500257', 'R-HSA-913531', 'R-HSA-212165'},
        'Phenotypes': {'HP_0008316', 'HP_0002725', 'HP_0008344', 'HP_0011904', 'HP_0001935'}}
    with open(write_loc + 'Expert_Reviewed_Annotations', 'wb') as handle:
        pickle.dump(gold_standard_annotations, handle)

    # The pickled inputs loaded below were created once with the following steps (kept for provenance):
    # read kg files
    # kg_file_dir = 'resources/releases/v1.0.0/Dissertation/PheKnowLator Evaluation/knowledge/'
    # kg = pd.read_pickle(kg_file_dir + 'ALL_KG_res_df')
    # kg_labels = pd.read_csv(kg_file_dir + 'KG_Node_Metadata.csv', header=0)
    # kg = kg.merge(kg_labels[['id', 'label']], left_on='id', right_on='id', how='left').reset_index(drop=True)
    # set-up gene lists
    # ig_genes = ignorome_genes[ignorome_genes['IGNOROME'].notna()]['IGNOROME'].astype(int).astype(str).to_frame()
    # kw_genes = ignorome_genes[ignorome_genes['KNOW'].notna()]['KNOW'].astype(int).astype(str).to_frame()
    # both_genes = ignorome_genes[ignorome_genes['BOTH'].notna()]['BOTH'].astype(int).astype(str).to_frame()
    # pe_gene_idx = set(ig_genes['IGNOROME']) | set(kw_genes['KNOW']) | set(both_genes['BOTH'])
    # create dicts to help with using matrix indices
    # embedding_idx = kg['id'].to_dict(); embedding_id_grp = kg['grp'].to_dict()
    # rev_embedding_idx = {v: k for k, v in embedding_idx.items()}
    # kg_node_labels = {row['id']: row['label'] for idx, row in kg.iterrows() if row['grp'] != 'Edge'}
    # lab_dicts = {'embedding_idx': embedding_idx, 'rev_embedding_idx': rev_embedding_idx,
    #              'kg_node_labels' : kg_node_labels, 'embedding_id_grp': embedding_id_grp}
    # pickle.dump(lab_dicts, open(write_loc + 'idx_dicts', 'wb'))
    lab_dict = _load_pickle(write_loc + 'idx_dicts')
    embedding_idx = lab_dict['embedding_idx']; rev_embedding_idx = lab_dict['rev_embedding_idx']
    kg_node_labels = lab_dict['kg_node_labels']; embedding_id_grp = lab_dict['embedding_id_grp']

    # convert embeddings to compressed sparse matrix
    # kg_matrix = csr_matrix([list(x) for x in kg['embeds']])
    # pickle.dump(kg_matrix, open(write_loc + 'pkt_kg_matrix', 'wb'))
    kg_matrix = _load_pickle(write_loc + 'pkt_kg_matrix')

    # pull 1,000 samples (with replacement) from all genes other than PE
    # sample_set = set(kg[kg['grp'] == 'Genes']['id']) - pe_gene_idx # 22385
    # gene_samples = [random.choices(list(sample_set), k=445) for _ in range(0, 1000)]
    # gene_samp = set([i for u in gene_samples for i in u])
    # updated = set([x for x in gene_samp if x not in gene_samp])
    # pickle.dump(gene_samples, open(write_loc + 'sample_list', 'wb'))
    gene_samples = _load_pickle(write_loc + 'sample_list')
    sample_start = 0; sample_end = 101; genes = gene_samples[sample_start:sample_end]
    # sample_start = 100; sample_end = 201; genes = gene_samples[sample_start:sample_end]

    # find similar entities for every gene of every sample in the current batch
    # NOTE(review): the source never writes these results to disk, and the analysis below only reads the
    # pickled runs under 'sample_sets/'; the step that saves a finished batch there still needs to be restored
    batch_results = {}
    for samp, sample_genes in tqdm(zip(range(sample_start, sample_end), genes), total=len(genes)):
        # initialized once per sample; the source re-created it for every gene and kept only the last one
        batch_results[samp] = {}
        for gene_id in tqdm(sample_genes):
            gene_idx = rev_embedding_idx[gene_id]
            matches = similarity_search(kg_matrix, gene_idx, 100)
            batch_results[samp][gene_id] = _group_hits(matches, kg_node_labels[gene_id], embedding_idx,
                                                       embedding_id_grp, kg_node_labels)

    # reads in the annotation runs and organizes them by domain --> hits --> entity-scores
    from glob import glob
    # glob returns files in arbitrary order, so sort to make the dropped (last) file deterministic
    # NOTE(review): confirm which file the trailing [:-1] is meant to exclude
    run_files = sorted(glob(write_loc + 'sample_sets/*sample*'))[:-1]
    sampled_results = {}
    for run_file in tqdm(run_files):
        sampled_results.update(_load_pickle(run_file))

    # for any repeating annotations, condense them by the max similarity score within each domain
    best_scores = {domain: {} for domain in _DOMAINS}
    for s in tqdm(list(sampled_results.keys())[0:1000]):
        for gene_hits in sampled_results[s].values():
            for domain, entries in gene_hits.items():
                if domain == 'gene_label':
                    continue
                domain_best = best_scores[domain]
                for entry in entries:
                    term, score = str(entry[0]), float(entry[2])
                    if term not in domain_best or score > domain_best[term]:
                        domain_best[term] = score

    # gets final hits and scores tests
    cutoffs = {'Drugs': 118, 'Diseases': 15, 'Genes': 76, 'Pathways': 44, 'Phenotypes': 11, 'Gene Ontology': 62}
    hits = {domain: [] for domain in _DOMAINS}
    for domain, domain_best in tqdm(best_scores.items()):
        # rank numerically; the source compared the scores as strings, which orders them alphabetically
        hits_to_search = sorted(domain_best, key=domain_best.get, reverse=True)
        hit = gold_standard_annotations[domain].intersection(hits_to_search[0:cutoffs[domain]])
        hits[domain] += ['{}: {}'.format(x, kg_node_labels[x].lower()) for x in sorted(hit)]

    for k, v in hits.items():
        print('{}: {}, p-value={}'.format(k.upper(), len(v), len(v)/float(1000)))


def main():
    """Run the PheKnowLator preeclampsia evaluation.

    Task 1 explores the preeclampsia ignorome genes through the knowledge graph embeddings (t-SNE plot and
    most similar entity reports); task 2 performs the modified enrichment analysis against the expert reviewed
    annotations. All inputs and outputs are files under ``resources/releases/v1.0.0/Dissertation/``.
    """

    _task1_ignorome_similarity()
    _task2_enrichment_testing()
callahantiff commented Jan 29, 2022

TASK1 - Input and Output Data Sources

TASK2 - Input and Output Data Sources

TASK1 - Input and Output Data Sources

TASK2 - Input and Output Data Sources

