Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save seandavi/0e83580f3b98deafa625e26a3afca640 to your computer and use it in GitHub Desktop.
Save seandavi/0e83580f3b98deafa625e26a3afca640 to your computer and use it in GitHub Desktop.
A quick demonstration of using sentence embeddings for semantic similarity search of metadata terms against "ontology" terms
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d0745ab6-e198-4639-a296-97d95626694d",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"%pip install sentence_transformers polars"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "39755693-3e86-407d-a7a5-29d396a0007c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[-0.5002635 -0.505004 -0.47107625 ... 0.12245571 0.05090936\n",
" 0.47183704]\n",
" [-0.43048653 -0.16166009 -0.46010813 ... 0.10756788 0.16894172\n",
" 0.57526815]]\n"
]
}
],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"\n",
"# Smoke test: load the sentence-embedding model and encode two toy sentences.\n",
"# `model` is reused by later cells to embed the body-site terms.\n",
"model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')\n",
"\n",
"sentences = [\n",
"    \"This is an example sentence\",\n",
"    \"Each sentence is converted\",\n",
"]\n",
"embeddings = model.encode(sentences)  # one embedding row per input sentence\n",
"print(embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "728b9711-986e-4fe0-960f-e065b570fe5f",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f7580ae2-5762-4d55-aa68-190bc52dc6b7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr > th,\n",
".dataframe > tbody > tr > td {\n",
" text-align: right;\n",
"}\n",
"</style>\n",
"<small>shape: (29_227, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>curation_id</th><th>original_bodysite</th><th>curated_bodysite</th><th>curated_bodysite_ontology_term_id</th><th>curated_bodysite_source</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;SALIVARY GLAND…</td><td>&quot;Salivary Gland…</td><td>&quot;NCIT:C12426&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;BUCCAL MUCOSA&quot;</td><td>&quot;Buccal Mucosa&quot;</td><td>&quot;NCIT:C12505&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;TUMOR EXTENTIO…</td><td>&quot;Bone&quot;</td><td>&quot;NCIT:C12366&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;SALIVARY GLAND…</td><td>&quot;Salivary Gland…</td><td>&quot;NCIT:C12426&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;SALIVARY GLAND…</td><td>&quot;Salivary Gland…</td><td>&quot;NCIT:C12426&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;SALIVARY GLAND…</td><td>&quot;Salivary Gland…</td><td>&quot;NCIT:C12426&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;ORBITAL CAVITY…</td><td>&quot;Orbit&quot;</td><td>&quot;NCIT:C12347&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;HARD PALATE&quot;</td><td>&quot;Hard 
Palate&quot;</td><td>&quot;NCIT:C12230&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;acyc_fmi_2014:…</td><td>&quot;LARYNX&quot;</td><td>&quot;Larynx&quot;</td><td>&quot;NCIT:C12420&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><td>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr><tr><t
d>&quot;luad_tcga_pan_…</td><td>&quot;LUNG&quot;</td><td>&quot;Lung&quot;</td><td>&quot;NCIT:C12468&quot;</td><td>&quot;TUMOR_TISSUE_S…</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (29_227, 5)\n",
"┌───────────────────┬───────────────────┬──────────────────┬───────────────────┬───────────────────┐\n",
"│ curation_id ┆ original_bodysite ┆ curated_bodysite ┆ curated_bodysite_ ┆ curated_bodysite_ │\n",
"│ --- ┆ --- ┆ --- ┆ ontology_term_i… ┆ source │\n",
"│ str ┆ str ┆ str ┆ --- ┆ --- │\n",
"│ ┆ ┆ ┆ str ┆ str │\n",
"╞═══════════════════╪═══════════════════╪══════════════════╪═══════════════════╪═══════════════════╡\n",
"│ acyc_fmi_2014:ACY ┆ SALIVARY GLAND ┆ Salivary Gland ┆ NCIT:C12426 ┆ TUMOR_TISSUE_SITE │\n",
"│ C-FMI-01:ACYC-F… ┆ ┆ ┆ ┆ │\n",
"│ acyc_fmi_2014:ACY ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ C-FMI-02:ACYC-F… ┆ ┆ ┆ ┆ │\n",
"│ acyc_fmi_2014:ACY ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ C-FMI-03:ACYC-F… ┆ ┆ ┆ ┆ │\n",
"│ acyc_fmi_2014:ACY ┆ BUCCAL MUCOSA ┆ Buccal Mucosa ┆ NCIT:C12505 ┆ TUMOR_TISSUE_SITE │\n",
"│ C-FMI-04:ACYC-F… ┆ ┆ ┆ ┆ │\n",
"│ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n",
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n",
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n",
"│ luad_tcga_pan_can ┆ LUNG ┆ Lung ┆ NCIT:C12468 ┆ TUMOR_TISSUE_SITE │\n",
"│ _atlas_2018:TCG… ┆ ┆ ┆ ┆ │\n",
"└───────────────────┴───────────────────┴──────────────────┴───────────────────┴───────────────────┘"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The original \"raw\" link carried a short-lived GitHub access token in its query string.\n",
"# Credentials must not be committed to a notebook, so the token is read from the\n",
"# GITHUB_RAW_TOKEN environment variable when one is needed (presumably only while\n",
"# the repository is private -- confirm); otherwise the bare URL is used.\n",
"import os\n",
"\n",
"BODYSITE_CSV_URL = 'https://raw.githubusercontent.com/waldronlab/OmicsMLRepoData/master/cBioPortalData/data/curated_bodysite.csv'\n",
"github_raw_token = os.environ.get('GITHUB_RAW_TOKEN')\n",
"csv_url = f'{BODYSITE_CSV_URL}?token={github_raw_token}' if github_raw_token else BODYSITE_CSV_URL\n",
"\n",
"df = pl.read_csv(csv_url)\n",
"# Preview only: rows whose curated body site is the literal string \"NA\" are hidden here;\n",
"# `df` itself is left unfiltered.\n",
"df.filter(pl.col('curated_bodysite') != \"NA\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "72e661fb-d3ce-478b-9d46-d0d970df84a0",
"metadata": {},
"outputs": [],
"source": [
"# Distinct (original, curated) body-site pairs, skipping rows curated as the string \"NA\".\n",
"has_curated_term = pl.col('curated_bodysite') != \"NA\"\n",
"filt_df = (\n",
"    df.filter(has_curated_term)\n",
"    .select(['original_bodysite', 'curated_bodysite'])\n",
"    .unique()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "880b1679-912e-4858-bc3b-0b2cf4d3ddce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"first 'uncurated' results: ['PERITONEUM OVARY', 'ABDOMEN', 'RETROPERITONEUM/UPPER ABDOMINAL - GASTRIC', 'SKIN NODULE', 'REGIONAL NODES', 'EXTREMITIESREGIONAL CUTANEOUS OR SUBCUTANEOUS TISSUE', 'SMALL BOWEL RESECTION', 'ADRENAL GLAND', 'FRONTAL LOBE', 'SUPERFICIAL TRUNK - FLANK']\n",
"first 'curated' results: ['Aorta', 'Left', 'Maxilla', 'Mandible', 'Lobe of the Left Lung', 'Tectum Mesencephali', 'Formalin-Fixed Paraffin-Embedded', 'Pleural Fluid', 'Colon', 'Visual Pathway']\n"
]
}
],
"source": [
"def split_unique_terms(values, sep='<;>'):\n",
"    \"\"\"Split each delimited string in `values` on `sep` and return the distinct terms, sorted.\n",
"\n",
"    Sorting matters: the order of `list(set(...))` for strings changes between kernel\n",
"    sessions (string hashing is randomized), which made the \"first 10\" terms used as\n",
"    queries further down differ from run to run. Missing values (None) are skipped\n",
"    instead of raising AttributeError on `.split`.\n",
"    \"\"\"\n",
"    terms = set()\n",
"    for value in values:\n",
"        if value is not None:\n",
"            terms.update(value.split(sep))\n",
"    return sorted(terms)\n",
"\n",
"\n",
"# A single cell of the table may hold several terms joined by '<;>'.\n",
"orig = split_unique_terms(filt_df['original_bodysite'].to_list())\n",
"cura = split_unique_terms(filt_df['curated_bodysite'].to_list())\n",
"print(\"first 'uncurated' results: \", orig[:10])\n",
"print(\"first 'curated' results: \", cura[:10])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "0e1a83db-3cf6-49b8-9371-bd6184d068d7",
"metadata": {},
"outputs": [],
"source": [
"# Embed the curated results (which would, more generally, be the set of ontology terms of interest).\n",
"# Request a torch tensor so the corpus embeddings sit on the same device as the query\n",
"# embeddings, which the search cell also encodes with convert_to_tensor=True. A NumPy\n",
"# result is converted to a CPU tensor by util.cos_sim and can fail against a CUDA query tensor.\n",
"cura_embed = model.encode(cura, convert_to_tensor=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f3da1f91-6ff2-4655-a96e-003848cd3b8f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"======================\n",
"\n",
"Query: PERITONEUM OVARY\n",
"Top 5 most similar sentences in corpus:\n",
" Peritoneum (Score: 0.9691)\n",
" Pelvic Peritoneum (Score: 0.9425)\n",
" Ovary (Score: 0.9386)\n",
" Peritoneal (Score: 0.9248)\n",
" Abdomen (Score: 0.9004)\n",
"\n",
"======================\n",
"\n",
"Query: ABDOMEN\n",
"Top 5 most similar sentences in corpus:\n",
" Abdomen (Score: 1.0000)\n",
" Abdominal (Score: 0.9567)\n",
" Abdominal Wall (Score: 0.9372)\n",
" Right Upper Quadrant of Abdomen (Score: 0.9251)\n",
" Intra-abdominal (Score: 0.9249)\n",
"\n",
"======================\n",
"\n",
"Query: RETROPERITONEUM/UPPER ABDOMINAL - GASTRIC\n",
"Top 5 most similar sentences in corpus:\n",
" Retroperitoneum (Score: 0.9472)\n",
" Retroperitoneal (Score: 0.9248)\n",
" Right Upper Quadrant of Abdomen (Score: 0.9235)\n",
" Intra-abdominal (Score: 0.9216)\n",
" Left Upper Quadrant of Abdomen (Score: 0.9157)\n",
"\n",
"======================\n",
"\n",
"Query: SKIN NODULE\n",
"Top 5 most similar sentences in corpus:\n",
" Skin Nodule (Score: 1.0000)\n",
" Skin (Score: 0.9037)\n",
" Skin/Subcutaneous Tissue (Score: 0.8828)\n",
" Skin of the Trunk (Score: 0.8802)\n",
" Lung (Score: 0.8732)\n",
"\n",
"======================\n",
"\n",
"Query: REGIONAL NODES\n",
"Top 5 most similar sentences in corpus:\n",
" Regional Lymph Node (Score: 0.9729)\n",
" Regional (Score: 0.9399)\n",
" Lymph Node (Score: 0.9105)\n",
" Pancreatic Lymph Node (Score: 0.9079)\n",
" Retroperitoneal Lymph Node (Score: 0.9029)\n",
"\n",
"======================\n",
"\n",
"Query: EXTREMITIESREGIONAL CUTANEOUS OR SUBCUTANEOUS TISSUE\n",
"Top 5 most similar sentences in corpus:\n",
" Skin/Subcutaneous Tissue (Score: 0.9500)\n",
" Skin (Score: 0.8985)\n",
" Tissue (Score: 0.8963)\n",
" Skin of the Trunk (Score: 0.8927)\n",
" Scalp (Score: 0.8836)\n",
"\n",
"======================\n",
"\n",
"Query: SMALL BOWEL RESECTION\n",
"Top 5 most similar sentences in corpus:\n",
" Small Intestine Resection (Score: 0.9866)\n",
" Small Intestine (Score: 0.9352)\n",
" Intestine (Score: 0.8970)\n",
" Rectum (Score: 0.8837)\n",
" Right Colon (Score: 0.8812)\n",
"\n",
"======================\n",
"\n",
"Query: ADRENAL GLAND\n",
"Top 5 most similar sentences in corpus:\n",
" Adrenal Gland (Score: 1.0000)\n",
" Extra-Adrenal (Score: 0.9275)\n",
" Parotid Gland (Score: 0.9274)\n",
" Prostate Gland (Score: 0.9156)\n",
" Salivary Gland (Score: 0.9123)\n",
"\n",
"======================\n",
"\n",
"Query: FRONTAL LOBE\n",
"Top 5 most similar sentences in corpus:\n",
" Frontal Lobe (Score: 1.0000)\n",
" Frontal (Score: 0.9587)\n",
" Temporal Lobe (Score: 0.9126)\n",
" Occipital Lobe (Score: 0.9017)\n",
" Lobe of the Left Lung (Score: 0.9005)\n",
"\n",
"======================\n",
"\n",
"Query: SUPERFICIAL TRUNK - FLANK\n",
"Top 5 most similar sentences in corpus:\n",
" Skin of the Trunk (Score: 0.9170)\n",
" Trunk (Score: 0.9103)\n",
" Flank (Score: 0.9026)\n",
" Skin/Subcutaneous Tissue (Score: 0.8796)\n",
" Thoracic Spine (Score: 0.8695)\n"
]
}
],
"source": [
"# For the first 10 \"uncurated\" or \"original\" terms\n",
"# 1. Embed term\n",
"# 2. Find the top_k most similar vectors from curated term embeddings\n",
"# 3. Report back the curated terms and scores\n",
"\n",
"from sentence_transformers import util\n",
"import torch\n",
"\n",
"queries = orig[0:10]\n",
"corpus = cura\n",
"corpus_embeddings = cura_embed\n",
"\n",
"# torch.topk raises if k exceeds the number of candidates, so never ask for more\n",
"# hits than there are corpus terms.\n",
"top_k = min(5, len(corpus))\n",
"\n",
"for query in queries:\n",
"    # embed each uncurated term\n",
"    query_embedding = model.encode(query, convert_to_tensor=True)\n",
"\n",
"    # We use cosine-similarity and torch.topk to find the highest top_k scores.\n",
"    # Search in the corpus embeddings (the ontology terms that we embedded above).\n",
"    # Alternatively, util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)\n",
"    # performs cosine similarity + top-k in a single call.\n",
"    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]\n",
"    top_results = torch.topk(cos_scores, k=top_k)\n",
"\n",
"    print(\"\\n======================\\n\")\n",
"    print(\"Query:\", query)\n",
"    print(f\"Top {top_k} most similar sentences in corpus:\")\n",
"\n",
"    for score, idx in zip(top_results[0], top_results[1]):\n",
"        print(\" \", corpus[idx], \"(Score: {:.4f})\".format(score))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cab0db63-0596-44d8-a60e-10436824cb92",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment