Semantic Search Swiftly
@rayvoelker · Created March 26, 2024 17:41
{
"cells": [
{
"cell_type": "markdown",
"id": "c165a3d5-117f-4723-a32a-af5464c13ee2",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"# Extract, Transform, Load (ETL) \n",
"\n",
"Our goals (cont.):\n",
"\n",
"1. ~**Extract relevant** bibliographic **MARC record data** from the Sierra REST API using **`sierra-ils-utils`**~\n",
"2. **Transform the relevant record data** using Hugging Face Transformers library and a pre-trained model\n",
"3. **Load embeddings into vector database** -- qdrant seems to be a popular choice, and has some nice features https://qdrant.tech/"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5924cc20-ff3d-4788-8863-c4d27a62dac2",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"import sqlite3\n",
"\n",
"local_database_filename = './semantic-search.db' # database file\n",
"\n",
"# modify our table if need be to store the embeddings\n",
"sql = \"\"\"\\\n",
"PRAGMA journal_mode=WAL;\n",
"CREATE TABLE IF NOT EXISTS embeddings (\n",
" id INTEGER PRIMARY KEY,\n",
" bib_id INTEGER UNIQUE,\n",
" embedding BLOB NOT NULL,\n",
" FOREIGN KEY (bib_id) REFERENCES bib_data(bib_id)\n",
");\"\"\"\n",
"\n",
"with sqlite3.connect(local_database_filename) as con:\n",
" con.executescript(sql)\n",
" con.commit()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7a60dae6-14c9-4884-84a7-f1399dec0413",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"def upsert_embedding(con, bib_id, embedding):\n",
" \"\"\"\n",
" Function to insert or update embeddings\n",
" \"\"\"\n",
" sql = \"\"\"\\\n",
" INSERT INTO embeddings(bib_id, embedding) VALUES(?, ?)\n",
" ON CONFLICT(bib_id) DO UPDATE SET embedding=excluded.embedding;\n",
" \"\"\"\n",
" con.execute(sql, (bib_id, embedding))\n",
" con.commit()"
]
},
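{
"cell_type": "code",
"execution_count": null,
"id": "b0e1f2a3-1111-4111-8111-0123456789ab",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"# quick sanity check of `upsert_embedding` -- a minimal sketch, with bib_id 0\n",
"# and a zero vector as made-up placeholder values (all-MiniLM-L6-v2, used\n",
"# below, produces 384-dimensional vectors)\n",
"import pickle\n",
"import numpy as np\n",
"\n",
"dummy_embedding = np.zeros(384, dtype=np.float32)\n",
"\n",
"with sqlite3.connect(local_database_filename) as con:\n",
"    upsert_embedding(con, 0, pickle.dumps(dummy_embedding))\n",
"    row = con.execute('SELECT embedding FROM embeddings WHERE bib_id = 0').fetchone()\n",
"    assert np.array_equal(pickle.loads(row[0]), dummy_embedding)\n",
"    con.execute('DELETE FROM embeddings WHERE bib_id = 0') # clean up the test row"
]
},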
{
"cell_type": "code",
"execution_count": 9,
"id": "7bc20b5f-8754-4448-be36-01ac8d3b8f9b",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"from typing import Iterator\n",
"\n",
"def yield_search_data() -> Iterator[dict]:\n",
" \"\"\"\n",
" Function that returns a generator of dict -- the \"payload\" data for embeddings.\n",
" SQL below ensures that only the bib record with the highest `id` value for each \n",
" `bib_id` is returned -- deduplicating `bib_id` based on the `id` ordering\n",
" \"\"\" \n",
" sql = \"\"\"\\\n",
" SELECT\n",
" bd.id,\n",
" bd.bib_id,\n",
" bd.extracted_content\n",
" FROM\n",
" bib_data bd\n",
" JOIN (\n",
" SELECT\n",
" bib_id,\n",
" MAX(id) as max_id\n",
" FROM\n",
" bib_data\n",
" GROUP BY\n",
" bib_id\n",
" ) AS grouped_bd ON (\n",
" grouped_bd.bib_id = bd.bib_id \n",
" AND grouped_bd.max_id = bd.id\n",
" )\n",
" ORDER BY\n",
" bd.bib_id;\n",
" \"\"\"\n",
"\n",
" with sqlite3.connect(database_uri, uri=True) as con:\n",
" cursor = con.cursor()\n",
" cursor.execute(sql)\n",
"\n",
" columns = [description[0] for description in cursor.description]\n",
" while row := cursor.fetchone():\n",
" yield dict(zip(columns, row))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "479a7d2b-142e-49e2-8b5f-8187b788d08d",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 {'id': 1, 'bib_id': 2500001, 'extracted_content': \"Title: 'ÆonFlux' | Subjects: Assassins Drama., Utopian socialism Drama., Disappeared persons Drama. | Genres: Fantasy films. lcgft, Action and adventure films. lcgft, Feature films. lcgft, Video recordings for the hearing impaired. lcgft, Science fiction films. lcgft | Publication Info: Place of publication: Hollywood, Calif. :, Publisher: Paramount Pictures,, Date of publication: c2006. | Physical Description: 1 videodisc (92 min.) : | Notes: Based on the characters created by Peter Chung. Originally released as a motion picture in 2005. 400 years in the future, after a virus kills off 99% of the world population, only one city on Earth remains. Ruled by the Goodchild dynasty, it is a perfect society of peace and prosperity--except that its citizens keep mysteriously disappearing. A secret agent/assassin/warrior has been given the mission to bring down the regime. But as she goes deeper into her mission, Aeon uncovers some shocking secrets that put the mission, not to mention her life, in danger. Special features: four eye-popping featurettes; writer and producer commentaries. | ISBN: 1415720584 | Classification Numbers: Local dewey decimal: DVD SciFi A251 2006 | Additional Details: Fixed length data elements: 060124p20062005cau092 g vleng dcgmIa , Physical description fixed field: vd cvaizq\"}\n"
]
}
],
"source": [
"# generate a single record for testing\n",
"for i, data in enumerate(yield_search_data()):\n",
" print(i, data)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "45450c1b-efbd-477d-9fb6-637cbd1d963c",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"%%capture\n",
"# NOTE: we _could_ use the `HuggingFace Transformers` library, \n",
"# but it's easier to use the `Sentence-Transformers` Library\n",
"# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2\n",
"# from transformers import AutoTokenizer, AutoModel\n",
"!pip install -U sentence-transformers\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"# import pickle\n",
"import time\n",
"\n",
"batch = [] # batch to transform into embeddings\n",
"batch_size = 10_000 # adjust if need be for GPU/CPU memory\n",
"file_counter = 0 # counter for the embeddings files\n",
"\n",
"# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2#all-minilm-l6-v2\n",
"model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') "
]
},
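{
"cell_type": "code",
"execution_count": null,
"id": "c1d2e3f4-2222-4222-8222-0123456789ab",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"# sanity check the model before running the full dataset -- a minimal sketch\n",
"# using a throwaway sentence; all-MiniLM-L6-v2 maps text to a 384-dimensional\n",
"# dense vector, so expect shape (384,)\n",
"test_embedding = model.encode('a test sentence about library catalogs')\n",
"print(type(test_embedding), test_embedding.shape)"
]
},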
{
"cell_type": "code",
"execution_count": 10,
"id": "daa7e62c-e37c-45ed-b043-12d8d562f17b",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6a01a99e57d442bba0edaefc53831a5b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/313 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Processed 10000 items, Time per batch: 3.8352 seconds\n",
"Total processing time: 0.0661 minutes\n"
]
}
],
"source": [
"import pickle # for creating a binary object\n",
"start_time = time.time() # start a timer\n",
"\n",
"for i, data in enumerate(yield_search_data()):\n",
" batch.append((data['bib_id'], data.get('extracted_content'))) # Store bib_id with content\n",
"\n",
" if len(batch) == batch_size:\n",
" # Process the batch for embeddings\n",
" embedding_start = time.time()\n",
" contents = [content for _, content in batch] # Extract just the content for embedding\n",
" embeddings = model.encode(contents, show_progress_bar=True)\n",
" embedding_end = time.time()\n",
"\n",
" # Connect to the database for writing\n",
" with sqlite3.connect(local_database_filename) as con:\n",
" for j, (bib_id, _) in enumerate(batch):\n",
" embedding_blob = pickle.dumps(embeddings[j])\n",
" upsert_embedding(con, bib_id, embedding_blob)\n",
"\n",
" # Reset the batch after processing it for storage\n",
" batch = []\n",
"\n",
" print(f\"Processed {i+1} items, Time per batch: {embedding_end - embedding_start:.4f} seconds\")\n",
" \n",
"end_time = time.time()\n",
"print(f\"Total processing time: {(end_time - start_time)/60.0:.4f} minutes\")"
]
},
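{
"cell_type": "code",
"execution_count": null,
"id": "d2e3f4a5-3333-4333-8333-0123456789ab",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"# spot-check what was just stored -- a minimal sketch: count the rows in\n",
"# `embeddings` and unpickle one blob to confirm the shape and dtype survived\n",
"with sqlite3.connect(local_database_filename) as con:\n",
"    (count,) = con.execute('SELECT COUNT(*) FROM embeddings').fetchone()\n",
"    bib_id, blob = con.execute('SELECT bib_id, embedding FROM embeddings LIMIT 1').fetchone()\n",
"\n",
"embedding = pickle.loads(blob)\n",
"print(f'{count} embeddings stored; bib_id {bib_id} -> {embedding.shape} {embedding.dtype}')"
]
},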
{
"cell_type": "code",
"execution_count": null,
"id": "93ae0af7-2081-4894-b1d1-d9de5202483a",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"# by the way ... we need to run this one more time on completion of our loop \n",
"# ... don't forget!\n",
"\n",
"if batch and len(batch) > 0:\n",
" # Process the batch for embeddings\n",
" embedding_start = time.time()\n",
" contents = [content for _, content in batch] # Extract just the content for embedding\n",
" embeddings = model.encode(contents, show_progress_bar=True)\n",
" embedding_end = time.time()\n",
"\n",
" # Connect to the database for writing\n",
" with sqlite3.connect(local_database_filename) as con:\n",
" for j, (bib_id, _) in enumerate(batch):\n",
" embedding_blob = pickle.dumps(embeddings[j])\n",
" upsert_embedding(con, bib_id, embedding_blob)"
]
}
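,
{
"cell_type": "code",
"execution_count": null,
"id": "e3f4a5b6-4444-4444-8444-0123456789ab",
"metadata": {
"slideshow": {
"slide_type": "skip"
},
"tags": []
},
"outputs": [],
"source": [
"# goal 3: load the embeddings into a vector database. a hedged sketch of what\n",
"# that might look like with the `qdrant-client` library (https://qdrant.tech/);\n",
"# the collection name is arbitrary, and ':memory:' runs an in-process instance\n",
"# for experimentation rather than connecting to a real Qdrant server\n",
"# !pip install -U qdrant-client\n",
"from qdrant_client import QdrantClient\n",
"from qdrant_client.models import Distance, VectorParams, PointStruct\n",
"\n",
"client = QdrantClient(':memory:')\n",
"client.create_collection(\n",
"    collection_name='bib_embeddings',\n",
"    vectors_config=VectorParams(size=384, distance=Distance.COSINE),\n",
")\n",
"\n",
"# read the pickled embeddings back out of SQLite and upsert them as points\n",
"# (for a large collection, upsert in batches rather than all at once)\n",
"with sqlite3.connect(local_database_filename) as con:\n",
"    rows = con.execute('SELECT bib_id, embedding FROM embeddings').fetchall()\n",
"\n",
"points = [\n",
"    PointStruct(id=bib_id, vector=pickle.loads(blob).tolist())\n",
"    for bib_id, blob in rows\n",
"]\n",
"client.upsert(collection_name='bib_embeddings', points=points)"
]
}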
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}