Skip to content

Instantly share code, notes, and snippets.

@mdouze
Created August 16, 2021 08:23
Show Gist options
  • Save mdouze/7d5271e49a3d4b8c9c8d1eac8f4b9748 to your computer and use it in GitHub Desktop.
Save mdouze/7d5271e49a3d4b8c9c8d1eac8f4b9748 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bc894e95",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import faiss\n",
"\n",
"from faiss.contrib import datasets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "daaeedd7",
"metadata": {},
"outputs": [],
"source": [
"# make a 1000-vector dataset in 32D\n",
"ds = datasets.SyntheticDataset(32, 0, 1000, 0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9989cc3f",
"metadata": {},
"outputs": [],
"source": [
"index = faiss.index_factory(ds.d, \"HNSW32\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3e46f777",
"metadata": {},
"outputs": [],
"source": [
"index.add(ds.get_database())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0534df40",
"metadata": {},
"outputs": [],
"source": [
"hnsw = index.hnsw"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "61c64793",
"metadata": {},
"outputs": [],
"source": [
"# get nb levels for each vector, and select one \n",
"levels = faiss.vector_to_array(hnsw.levels)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "42f3c1b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"levels.max()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9164df13",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([592]),)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.where(levels == 3)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "4d69d75e",
"metadata": {},
"outputs": [],
"source": [
"def vector_to_array(v):\n",
"    \"\"\"Make a SWIG-wrapped vector visible as a numpy array (without copying data).\n",
"\n",
"    The array is built from v.data() and v.size(); since nothing is copied,\n",
"    it aliases the vector's storage, so keep the owning object alive while\n",
"    the array is in use.\n",
"    \"\"\"\n",
"    return faiss.rev_swig_ptr(v.data(), v.size())\n",
"\n",
"\n",
"def get_hnsw_links(hnsw, vno):\n",
"    \"\"\"Get the link structure for vertex vno.\n",
"\n",
"    Returns a list with one array of neighbor ids per level of the vertex\n",
"    (index 0 is the base level). The arrays are slices of hnsw.neighbors,\n",
"    not copies; slots that are not used by a link hold -1.\n",
"\n",
"    Raises IndexError if vno is not a valid vertex number.\n",
"    \"\"\"\n",
"    # make arrays visible from Python\n",
"    levels = vector_to_array(hnsw.levels)\n",
"    cum_nneighbor_per_level = vector_to_array(hnsw.cum_nneighbor_per_level)\n",
"    offsets = vector_to_array(hnsw.offsets)\n",
"    neighbors = vector_to_array(hnsw.neighbors)\n",
"\n",
"    # a negative vno would otherwise wrap around in the lookups below and\n",
"    # silently return empty link arrays instead of failing\n",
"    if not 0 <= vno < len(levels):\n",
"        raise IndexError(f\"vertex number {vno} out of range [0, {len(levels)})\")\n",
"\n",
"    # all neighbors of vno, for all of its levels\n",
"    neigh_vno = neighbors[offsets[vno] : offsets[vno + 1]]\n",
"\n",
"    # break down per level: level l occupies the slots between two\n",
"    # consecutive entries of cum_nneighbor_per_level\n",
"    nlevel = levels[vno]\n",
"    return [\n",
"        neigh_vno[cum_nneighbor_per_level[l] : cum_nneighbor_per_level[l + 1]]\n",
"        for l in range(nlevel)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "e8384c49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[array([534, 100, 344, 536, 186, 32, 940, 28, 914, 469, 379, 248, 33,\n",
" 787, 952, 667, 924, 730, 547, 537, 338, 55, 105, 899, 146, 751,\n",
" 189, 512, 236, 506, 57, 858, 578, 199, 279, 649, 294, 347, 407,\n",
" 471, 80, 814, 101, 568, 771, 41, 712, 349, 242, 79, 118, 12,\n",
" 985, 890, 722, 510, 835, 129, -1, -1, -1, -1, -1, -1],\n",
" dtype=int32),\n",
" array([473, 763, 344, 511, 52, 569, 877, 994, 998, 935, 133, 982, 702,\n",
" 632, 73, 136, 239, 847, 364, 770, 737, 385, 331, 944, 765, -1,\n",
" -1, -1, -1, -1, -1, -1], dtype=int32),\n",
" array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
" -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" dtype=int32)]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get links for that vector (the only one with 3 levels, found above)\n",
"get_hnsw_links(hnsw, 592)"
]
},
{
"cell_type": "markdown",
"id": "e6ef5fad",
"metadata": {},
"source": [
"Vertex 592 has three levels. The first (base level) has 64 entries, twice the `M = 32` requested with `HNSW32`; the levels above have 32 entries each. The link structure contains neighbor ids, and unused slots are set to -1 when there are not enough links to fill the fixed-size array."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b529c4b8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bc894e95",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import faiss\n",
"\n",
"from faiss.contrib import datasets"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "daaeedd7",
"metadata": {},
"outputs": [],
"source": [
"# make a 1000-vector dataset in 32D\n",
"ds = datasets.SyntheticDataset(32, 0, 1000, 0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "9989cc3f",
"metadata": {},
"outputs": [],
"source": [
"index = faiss.index_factory(ds.d, \"HNSW32\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3e46f777",
"metadata": {},
"outputs": [],
"source": [
"index.add(ds.get_database())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0534df40",
"metadata": {},
"outputs": [],
"source": [
"hnsw = index.hnsw"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "61c64793",
"metadata": {},
"outputs": [],
"source": [
"# get nb levels for each vector, and select one \n",
"levels = faiss.vector_to_array(hnsw.levels)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "42f3c1b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"levels.max()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9164df13",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([592]),)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.where(levels == 3)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "4d69d75e",
"metadata": {},
"outputs": [],
"source": [
"def vector_to_array(v):\n",
"    \"\"\"Make a SWIG-wrapped vector visible as a numpy array (without copying data).\n",
"\n",
"    The array is built from v.data() and v.size(); since nothing is copied,\n",
"    it aliases the vector's storage, so keep the owning object alive while\n",
"    the array is in use.\n",
"    \"\"\"\n",
"    return faiss.rev_swig_ptr(v.data(), v.size())\n",
"\n",
"\n",
"def get_hnsw_links(hnsw, vno):\n",
"    \"\"\"Get the link structure for vertex vno.\n",
"\n",
"    Returns a list with one array of neighbor ids per level of the vertex\n",
"    (index 0 is the base level). The arrays are slices of hnsw.neighbors,\n",
"    not copies; slots that are not used by a link hold -1.\n",
"\n",
"    Raises IndexError if vno is not a valid vertex number.\n",
"    \"\"\"\n",
"    # make arrays visible from Python\n",
"    levels = vector_to_array(hnsw.levels)\n",
"    cum_nneighbor_per_level = vector_to_array(hnsw.cum_nneighbor_per_level)\n",
"    offsets = vector_to_array(hnsw.offsets)\n",
"    neighbors = vector_to_array(hnsw.neighbors)\n",
"\n",
"    # a negative vno would otherwise wrap around in the lookups below and\n",
"    # silently return empty link arrays instead of failing\n",
"    if not 0 <= vno < len(levels):\n",
"        raise IndexError(f\"vertex number {vno} out of range [0, {len(levels)})\")\n",
"\n",
"    # all neighbors of vno, for all of its levels\n",
"    neigh_vno = neighbors[offsets[vno] : offsets[vno + 1]]\n",
"\n",
"    # break down per level: level l occupies the slots between two\n",
"    # consecutive entries of cum_nneighbor_per_level\n",
"    nlevel = levels[vno]\n",
"    return [\n",
"        neigh_vno[cum_nneighbor_per_level[l] : cum_nneighbor_per_level[l + 1]]\n",
"        for l in range(nlevel)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "e8384c49",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[array([534, 100, 344, 536, 186, 32, 940, 28, 914, 469, 379, 248, 33,\n",
" 787, 952, 667, 924, 730, 547, 537, 338, 55, 105, 899, 146, 751,\n",
" 189, 512, 236, 506, 57, 858, 578, 199, 279, 649, 294, 347, 407,\n",
" 471, 80, 814, 101, 568, 771, 41, 712, 349, 242, 79, 118, 12,\n",
" 985, 890, 722, 510, 835, 129, -1, -1, -1, -1, -1, -1],\n",
" dtype=int32),\n",
" array([473, 763, 344, 511, 52, 569, 877, 994, 998, 935, 133, 982, 702,\n",
" 632, 73, 136, 239, 847, 364, 770, 737, 385, 331, 944, 765, -1,\n",
" -1, -1, -1, -1, -1, -1], dtype=int32),\n",
" array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
" -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" dtype=int32)]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get links for that vector (the only one with 3 levels, found above)\n",
"get_hnsw_links(hnsw, 592)"
]
},
{
"cell_type": "markdown",
"id": "e6ef5fad",
"metadata": {},
"source": [
"Vertex 592 has three levels. The first (base level) has 64 entries, twice the `M = 32` requested with `HNSW32`; the levels above have 32 entries each. The link structure contains neighbor ids, and unused slots are set to -1 when there are not enough links to fill the fixed-size array."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b529c4b8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@anoubhav
Copy link

It shows an invalid notebook. The notebook does not appear to be valid JSON.

@peiyuanzheng
Copy link

Because the json content is duplicated.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment