jamescalam/whisper-yt-search-index-vecs.ipynb Secret

## whisper-yt-search-index-vecs.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7cd0413e-ee5a-41b8-a5e1-1298cce62d48",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 210/210 [00:56<00:00,  3.72it/s]\n"
     ]
    }
   ],
   "source": [
    "# we encode and insert in batches of 64\n",
    "batch_size = 64\n",
    "\n",
    "# loop through in batches of 64\n",
    "for i in tqdm(range(0, len(new_data), batch_size)):\n",
    "    # find end position of batch (for when we hit end of data)\n",
    "    i_end = min(len(new_data)-1, i+batch_size)\n",
    "    # extract the metadata like text, start/end positions, etc\n",
    "    batch_meta = [{\n",
    "        \"text\": new_data[x][\"text\"],\n",
    "        \"start\": new_data[x][\"start\"],\n",
    "        \"end\": new_data[x][\"end\"],\n",
    "        \"url\": new_data[x][\"url\"],\n",
    "        \"title\": new_data[x][\"title\"]\n",
    "    } for x in range(i, i_end)]\n",
    "    # extract only text to be encoded by embedding model\n",
    "    batch_text = [\n",
    "        row['text'] for row in new_data[i:i_end]\n",
    "    ]\n",
    "    # create the embedding vectors\n",
    "    batch_embeds = model.encode(batch_text).tolist()\n",
    "    # extract IDs to be attached to each embedding and metadata\n",
    "    batch_ids = [\n",
    "        row['id'] for row in new_data[i:i_end]\n",
    "    ]\n",
    "    # 'upsert' (insert) IDs, embeddings, and metadata to index\n",
    "    to_upsert = list(zip(\n",
    "        batch_ids, batch_embeds, batch_meta\n",
    "    ))\n",
    "    index.upsert(to_upsert)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a32f2d85-a310-4b97-bcf2-5993efecb784",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'dimension': 768,\n",
       " 'index_fullness': 0.0,\n",
       " 'namespaces': {'': {'vector_count': 13392}},\n",
       " 'total_vector_count': 13392}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check everything has been added\n",
    "index.describe_index_stats()"
   ]
  }
 ],
 "metadata": {
  "environment": {
   "kernel": "python3",
   "name": "common-cu110.m95",
   "type": "gcloud",
   "uri": "gcr.io/deeplearning-platform-release/base-cu110:m95"
  },
  "kernelspec": {
   "display_name": "Python 3.9.12 ('ml')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 12,
	"id": "7cd0413e-ee5a-41b8-a5e1-1298cce62d48",
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"100%\|██████████\| 210/210 [00:56<00:00, 3.72it/s]\n"
	]
	}
	],
	"source": [
	"# we encode and insert in batches of 64\n",
	"batch_size = 64\n",
	"\n",
	"# loop through in batches of 64\n",
	"for i in tqdm(range(0, len(new_data), batch_size)):\n",
	" # find end position of batch (for when we hit end of data)\n",
	" i_end = min(len(new_data)-1, i+batch_size)\n",
	" # extract the metadata like text, start/end positions, etc\n",
	" batch_meta = [{\n",
	" \"text\": new_data[x][\"text\"],\n",
	" \"start\": new_data[x][\"start\"],\n",
	" \"end\": new_data[x][\"end\"],\n",
	" \"url\": new_data[x][\"url\"],\n",
	" \"title\": new_data[x][\"title\"]\n",
	" } for x in range(i, i_end)]\n",
	" # extract only text to be encoded by embedding model\n",
	" batch_text = [\n",
	" row['text'] for row in new_data[i:i_end]\n",
	" ]\n",
	" # create the embedding vectors\n",
	" batch_embeds = model.encode(batch_text).tolist()\n",
	" # extract IDs to be attached to each embedding and metadata\n",
	" batch_ids = [\n",
	" row['id'] for row in new_data[i:i_end]\n",
	" ]\n",
	" # 'upsert' (insert) IDs, embeddings, and metadata to index\n",
	" to_upsert = list(zip(\n",
	" batch_ids, batch_embeds, batch_meta\n",
	" ))\n",
	" index.upsert(to_upsert)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"id": "a32f2d85-a310-4b97-bcf2-5993efecb784",
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'dimension': 768,\n",
	" 'index_fullness': 0.0,\n",
	" 'namespaces': {'': {'vector_count': 13392}},\n",
	" 'total_vector_count': 13392}"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# check everything has been added\n",
	"index.describe_index_stats()"
	]
	}
	],
	"metadata": {
	"environment": {
	"kernel": "python3",
	"name": "common-cu110.m95",
	"type": "gcloud",
	"uri": "gcr.io/deeplearning-platform-release/base-cu110:m95"
	},
	"kernelspec": {
	"display_name": "Python 3.9.12 ('ml')",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.9.12"
	},
	"vscode": {
	"interpreter": {
	"hash": "b8e7999f96e1b425e2d542f21b571f5a4be3e97158b0b46ea1b2500df63956ce"
	}
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}