Created
January 25, 2024 18:11
-
-
Save seandavi/0e83580f3b98deafa625e26a3afca640 to your computer and use it in GitHub Desktop.
A quick demonstration of using sentence embeddings for semantic similarity search of metadata terms against "ontology" terms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "d0745ab6-e198-4639-a296-97d95626694d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%capture\n", | |
"%pip install sentence_transformers polars" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "39755693-3e86-407d-a7a5-29d396a0007c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[[-0.5002635 -0.505004 -0.47107625 ... 0.12245571 0.05090936\n", | |
" 0.47183704]\n", | |
" [-0.43048653 -0.16166009 -0.46010813 ... 0.10756788 0.16894172\n", | |
" 0.57526815]]\n" | |
] | |
} | |
], | |
"source": [ | |
"from sentence_transformers import SentenceTransformer\n", | |
"sentences = [\"This is an example sentence\", \"Each sentence is converted\"]\n", | |
"\n", | |
"model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')\n", | |
"embeddings = model.encode(sentences)\n", | |
"print(embeddings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "728b9711-986e-4fe0-960f-e065b570fe5f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import polars as pl" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "f7580ae2-5762-4d55-aa68-190bc52dc6b7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div><style>\n", | |
".dataframe > thead > tr > th,\n", | |
".dataframe > tbody > tr > td {\n", | |
" text-align: right;\n", | |
"}\n", | |
"</style>\n", | |
"<small>shape: (29_227, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>curation_id</th><th>original_bodysite</th><th>curated_bodysite</th><th>curated_bodysite_ontology_term_id</th><th>curated_bodysite_source</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"acyc_fmi_2014:…</td><td>"SALIVARY GLAND…</td><td>"Salivary Gland…</td><td>"NCIT:C12426"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"BUCCAL MUCOSA"</td><td>"Buccal Mucosa"</td><td>"NCIT:C12505"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"TUMOR EXTENTIO…</td><td>"Bone"</td><td>"NCIT:C12366"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"SALIVARY GLAND…</td><td>"Salivary Gland…</td><td>"NCIT:C12426"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"SALIVARY GLAND…</td><td>"Salivary Gland…</td><td>"NCIT:C12426"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"SALIVARY GLAND…</td><td>"Salivary Gland…</td><td>"NCIT:C12426"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"ORBITAL CAVITY…</td><td>"Orbit"</td><td>"NCIT:C12347"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"HARD PALATE"</td><td>"Hard Palate"</td><td>"NCIT:C12230"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"acyc_fmi_2014:…</td><td>"LARYNX"</td><td>"Larynx"</td><td>"NCIT:C12420"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr><tr><td>"luad_tcga_pan_…</td><td>"LUNG"</td><td>"Lung"</td><td>"NCIT:C12468"</td><td>"TUMOR_TISSUE_S…</td></tr></tbody></table></div>" | |
], | |
"text/plain": [ | |
"shape: (29_227, 5)\n", | |
"┌───────────────────┬───────────────────┬──────────────────┬───────────────────┬───────────────────┐\n", | |
"│ curation_id ┆ original_bodysite ┆ curated_bodysite ┆ curated_bodysite_ ┆ curated_bodysite_ │\n", | |
"│ --- ┆ --- ┆ --- ┆ ontology_term_i… ┆ source │\n", | |
"│ str ┆ str ┆ str ┆ --- ┆ --- │\n", | |
"│ ┆ ┆ ┆ str ┆ str │\n", | |
"╞═══════════════════╪═══════════════════╪══════════════════╪═══════════════════╪═══════════════════╡\n", | |
"│ acyc_fmi_2014:ACY ┆ SALIVARY GLAND ┆ Salivary Gland ┆ NCIT:C12426 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ C-FMI-01:ACYC-F… ┆ ┆ ┆ ┆ │\n", | |
"│ acyc_fmi_2014:ACY ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ C-FMI-02:ACYC-F… ┆ ┆ ┆ ┆ │\n", | |
"│ acyc_fmi_2014:ACY ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ C-FMI-03:ACYC-F… ┆ ┆ ┆ ┆ │\n", | |
"│ acyc_fmi_2014:ACY ┆ BUCCAL MUCOSA ┆ Buccal Mucosa ┆ NCIT:C12505 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ C-FMI-04:ACYC-F… ┆ ┆ ┆ ┆ │\n", | |
"│ … ┆ … ┆ … ┆ … ┆ … │\n", | |
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n", | |
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n", | |
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n", | |
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n", | |
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n", | |
"└───────────────────┴───────────────────┴──────────────────┴───────────────────┴───────────────────┘" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# You may need to adjust the URL to be the correct \"raw\" URL since this\n", | |
"# uses a token. Your token may be different. \n", | |
"df = pl.read_csv('https://raw.githubusercontent.com/waldronlab/OmicsMLRepoData/master/cBioPortalData/data/curated_bodysite.csv?token=GHSAT0AAAAAACASHGM5ZU34WM77EKOIM4O4ZNSWB6A')\n", | |
"df.filter(pl.col('curated_bodysite') != \"NA\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "72e661fb-d3ce-478b-9d46-d0d970df84a0", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"filt_df = df.filter(df['curated_bodysite']!=\"NA\").select(['original_bodysite', 'curated_bodysite']).unique()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "880b1679-912e-4858-bc3b-0b2cf4d3ddce", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"first 'uncurated' results: ['PERITONEUM OVARY', 'ABDOMEN', 'RETROPERITONEUM/UPPER ABDOMINAL - GASTRIC', 'SKIN NODULE', 'REGIONAL NODES', 'EXTREMITIESREGIONAL CUTANEOUS OR SUBCUTANEOUS TISSUE', 'SMALL BOWEL RESECTION', 'ADRENAL GLAND', 'FRONTAL LOBE', 'SUPERFICIAL TRUNK - FLANK']\n", | |
"first 'curated' results: ['Aorta', 'Left', 'Maxilla', 'Mandible', 'Lobe of the Left Lung', 'Tectum Mesencephali', 'Formalin-Fixed Paraffin-Embedded', 'Pleural Fluid', 'Colon', 'Visual Pathway']\n" | |
] | |
} | |
], | |
"source": [ | |
"orig = []\n", | |
"for x in filt_df['original_bodysite'].to_list():\n", | |
" orig.extend(x.split('<;>'))\n", | |
"orig = list(set(orig))\n", | |
"cura = []\n", | |
"for x in filt_df['curated_bodysite'].to_list():\n", | |
" cura.extend(x.split('<;>'))\n", | |
"cura = list(set(cura))\n", | |
"print(\"first 'uncurated' results: \", orig[:10])\n", | |
"print(\"first 'curated' results: \", cura[:10])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "0e1a83db-3cf6-49b8-9371-bd6184d068d7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# embed the curated results (which would, more generally, be the set of ontology terms of interest)\n", | |
"cura_embed = model.encode(cura)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "f3da1f91-6ff2-4655-a96e-003848cd3b8f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: PERITONEUM OVARY\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Peritoneum (Score: 0.9691)\n", | |
" Pelvic Peritoneum (Score: 0.9425)\n", | |
" Ovary (Score: 0.9386)\n", | |
" Peritoneal (Score: 0.9248)\n", | |
" Abdomen (Score: 0.9004)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: ABDOMEN\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Abdomen (Score: 1.0000)\n", | |
" Abdominal (Score: 0.9567)\n", | |
" Abdominal Wall (Score: 0.9372)\n", | |
" Right Upper Quadrant of Abdomen (Score: 0.9251)\n", | |
" Intra-abdominal (Score: 0.9249)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: RETROPERITONEUM/UPPER ABDOMINAL - GASTRIC\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Retroperitoneum (Score: 0.9472)\n", | |
" Retroperitoneal (Score: 0.9248)\n", | |
" Right Upper Quadrant of Abdomen (Score: 0.9235)\n", | |
" Intra-abdominal (Score: 0.9216)\n", | |
" Left Upper Quadrant of Abdomen (Score: 0.9157)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: SKIN NODULE\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Skin Nodule (Score: 1.0000)\n", | |
" Skin (Score: 0.9037)\n", | |
" Skin/Subcutaneous Tissue (Score: 0.8828)\n", | |
" Skin of the Trunk (Score: 0.8802)\n", | |
" Lung (Score: 0.8732)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: REGIONAL NODES\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Regional Lymph Node (Score: 0.9729)\n", | |
" Regional (Score: 0.9399)\n", | |
" Lymph Node (Score: 0.9105)\n", | |
" Pancreatic Lymph Node (Score: 0.9079)\n", | |
" Retroperitoneal Lymph Node (Score: 0.9029)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: EXTREMITIESREGIONAL CUTANEOUS OR SUBCUTANEOUS TISSUE\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Skin/Subcutaneous Tissue (Score: 0.9500)\n", | |
" Skin (Score: 0.8985)\n", | |
" Tissue (Score: 0.8963)\n", | |
" Skin of the Trunk (Score: 0.8927)\n", | |
" Scalp (Score: 0.8836)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: SMALL BOWEL RESECTION\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Small Intestine Resection (Score: 0.9866)\n", | |
" Small Intestine (Score: 0.9352)\n", | |
" Intestine (Score: 0.8970)\n", | |
" Rectum (Score: 0.8837)\n", | |
" Right Colon (Score: 0.8812)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: ADRENAL GLAND\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Adrenal Gland (Score: 1.0000)\n", | |
" Extra-Adrenal (Score: 0.9275)\n", | |
" Parotid Gland (Score: 0.9274)\n", | |
" Prostate Gland (Score: 0.9156)\n", | |
" Salivary Gland (Score: 0.9123)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: FRONTAL LOBE\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Frontal Lobe (Score: 1.0000)\n", | |
" Frontal (Score: 0.9587)\n", | |
" Temporal Lobe (Score: 0.9126)\n", | |
" Occipital Lobe (Score: 0.9017)\n", | |
" Lobe of the Left Lung (Score: 0.9005)\n", | |
"\n", | |
"======================\n", | |
"\n", | |
"Query: SUPERFICIAL TRUNK - FLANK\n", | |
"Top 5 most similar sentences in corpus:\n", | |
" Skin of the Trunk (Score: 0.9170)\n", | |
" Trunk (Score: 0.9103)\n", | |
" Flank (Score: 0.9026)\n", | |
" Skin/Subcutaneous Tissue (Score: 0.8796)\n", | |
" Thoracic Spine (Score: 0.8695)\n" | |
] | |
} | |
], | |
"source": [ | |
"# For the first 10 \"uncurated\" or \"original\" terms\n", | |
"# 1. Embed term\n", | |
"# 2. Find top 5 most similar vectors from curated term embeddings\n", | |
"# 3. Report back the curated terms and scores\n", | |
"\n", | |
"from sentence_transformers import util\n", | |
"import torch\n", | |
"\n", | |
"top_k = 5\n", | |
"\n", | |
"queries = orig[0:10]\n", | |
"corpus = cura\n", | |
"corpus_embeddings = cura_embed\n", | |
"\n", | |
"for query in queries:\n", | |
" # embed each uncurated term\n", | |
" query_embedding = model.encode(query, convert_to_tensor=True)\n", | |
"\n", | |
" # We use cosine-similarity and torch.topk to find the highest 5 scores\n", | |
" # Seach in the corpus embeddings (the ontology terms that we embedded above)\n", | |
" cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]\n", | |
" top_results = torch.topk(cos_scores, k=top_k)\n", | |
"\n", | |
" print(\"\\n======================\\n\")\n", | |
" print(\"Query:\", query)\n", | |
" print(\"Top 5 most similar sentences in corpus:\")\n", | |
"\n", | |
" for score, idx in zip(top_results[0], top_results[1]):\n", | |
" print(\" \", corpus[idx], \"(Score: {:.4f})\".format(score))\n", | |
"\n", | |
" \"\"\"\n", | |
" # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk\n", | |
" hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)\n", | |
" hits = hits[0] #Get the hits for the first query\n", | |
" for hit in hits:\n", | |
" print(corpus[hit['corpus_id']], \"(Score: {:.4f})\".format(hit['score']))\n", | |
" \"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cab0db63-0596-44d8-a60e-10436824cb92", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment