Last active
October 14, 2022 07:11
-
-
Save jamescalam/48a41b966c064daded1f68cea91d3f86 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "7cd0413e-ee5a-41b8-a5e1-1298cce62d48", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"100%|██████████| 210/210 [00:56<00:00, 3.72it/s]\n" | |
] | |
} | |
], | |
"source": [ | |
"# we encode and insert in batches of 64\n", | |
"batch_size = 64\n", | |
"\n", | |
"# loop through in batches of 64\n", | |
"for i in tqdm(range(0, len(new_data), batch_size)):\n", | |
" # find end position of batch (for when we hit end of data)\n", | |
" i_end = min(len(new_data)-1, i+batch_size)\n", | |
" # extract the metadata like text, start/end positions, etc\n", | |
" batch_meta = [{\n", | |
" \"text\": new_data[x][\"text\"],\n", | |
" \"start\": new_data[x][\"start\"],\n", | |
" \"end\": new_data[x][\"end\"],\n", | |
" \"url\": new_data[x][\"url\"],\n", | |
" \"title\": new_data[x][\"title\"]\n", | |
" } for x in range(i, i_end)]\n", | |
" # extract only text to be encoded by embedding model\n", | |
" batch_text = [\n", | |
" row['text'] for row in new_data[i:i_end]\n", | |
" ]\n", | |
" # create the embedding vectors\n", | |
" batch_embeds = model.encode(batch_text).tolist()\n", | |
" # extract IDs to be attached to each embedding and metadata\n", | |
" batch_ids = [\n", | |
" row['id'] for row in new_data[i:i_end]\n", | |
" ]\n", | |
" # 'upsert' (insert) IDs, embeddings, and metadata to index\n", | |
" to_upsert = list(zip(\n", | |
" batch_ids, batch_embeds, batch_meta\n", | |
" ))\n", | |
" index.upsert(to_upsert)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "a32f2d85-a310-4b97-bcf2-5993efecb784", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'dimension': 768,\n", | |
" 'index_fullness': 0.0,\n", | |
" 'namespaces': {'': {'vector_count': 13392}},\n", | |
" 'total_vector_count': 13392}" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# check everything has been added\n", | |
"index.describe_index_stats()" | |
] | |
} | |
], | |
"metadata": { | |
"environment": { | |
"kernel": "python3", | |
"name": "common-cu110.m95", | |
"type": "gcloud", | |
"uri": "gcr.io/deeplearning-platform-release/base-cu110:m95" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3.9.12 ('ml')", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.12" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment