Skip to content

Instantly share code, notes, and snippets.

@jamescalam
Last active October 14, 2022 07:11
Show Gist options
  • Save jamescalam/48a41b966c064daded1f68cea91d3f86 to your computer and use it in GitHub Desktop.
Save jamescalam/48a41b966c064daded1f68cea91d3f86 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"id": "7cd0413e-ee5a-41b8-a5e1-1298cce62d48",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 210/210 [00:56<00:00, 3.72it/s]\n"
]
}
],
"source": [
"# Encode the records in `new_data` with the embedding model and upsert\n",
"# them into the vector index, `batch_size` records at a time.\n",
"batch_size = 64\n",
"\n",
"# loop through in batches of `batch_size`\n",
"for i in tqdm(range(0, len(new_data), batch_size)):\n",
"    # find end position of batch (for when we hit end of data); the bound\n",
"    # is exclusive, so clamp to len(new_data) -- clamping to len - 1 would\n",
"    # silently drop the final record\n",
"    i_end = min(len(new_data), i + batch_size)\n",
"    # slice once so metadata, text and IDs all come from the same rows\n",
"    batch = new_data[i:i_end]\n",
"    # extract the metadata like text, start/end positions, etc\n",
"    batch_meta = [{\n",
"        \"text\": row[\"text\"],\n",
"        \"start\": row[\"start\"],\n",
"        \"end\": row[\"end\"],\n",
"        \"url\": row[\"url\"],\n",
"        \"title\": row[\"title\"]\n",
"    } for row in batch]\n",
"    # extract only text to be encoded by embedding model\n",
"    batch_text = [row[\"text\"] for row in batch]\n",
"    # create the embedding vectors\n",
"    batch_embeds = model.encode(batch_text).tolist()\n",
"    # extract IDs to be attached to each embedding and metadata\n",
"    batch_ids = [row[\"id\"] for row in batch]\n",
"    # 'upsert' (insert) IDs, embeddings, and metadata to index\n",
"    to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))\n",
"    index.upsert(to_upsert)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "a32f2d85-a310-4b97-bcf2-5993efecb784",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'dimension': 768,\n",
" 'index_fullness': 0.0,\n",
" 'namespaces': {'': {'vector_count': 13392}},\n",
" 'total_vector_count': 13392}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check everything has been added\n",
"index.describe_index_stats()"
]
}
],
"metadata": {
"environment": {
"kernel": "python3",
"name": "common-cu110.m95",
"type": "gcloud",
"uri": "gcr.io/deeplearning-platform-release/base-cu110:m95"
},
"kernelspec": {
"display_name": "Python 3.9.12 ('ml')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"vscode": {
"interpreter": {
"hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment