Skip to content

Instantly share code, notes, and snippets.

@wanji
Last active July 16, 2016 03:32
Show Gist options
  • Save wanji/c08693f06ef744feef50 to your computer and use it in GitHub Desktop.
Save wanji/c08693f06ef744feef50 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:28c83ae59bc46da0f79efcd9c8f7e58066b9dce4f9e61612bf1e6b8eb84a7efe"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Examples for `HDIdx`"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# import necessary packages\n",
"\n",
"import hdidx\n",
"import numpy as np\n",
"\n",
"# print log messages\n",
"import logging\n",
"reload(logging)\n",
"logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',\n",
" level=logging.INFO)\n",
"\n",
"# generating sample data\n",
"ndim = 256 # dimension of features\n",
"ndb = 10000 # number of dababase items\n",
"nqry = 120 # number of queries\n",
"\n",
"X_db = np.random.random((ndb, ndim)).astype(np.float64)\n",
"X_qry = np.random.random((nqry, ndim)).astype(np.float32)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Usage"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create Product Quantization Indexer\n",
"idx = hdidx.indexer.PQIndexer()\n",
"# set storage for the indexer, the default is store indexex in memory\n",
"idx.set_storage()\n",
"# build indexer\n",
"idx.build({'vals': X_db, 'nsubq': 8})\n",
"# add database items to the indexer\n",
"idx.add(X_db)\n",
"# searching in the database, and return top-100 items for each query\n",
"idx.search(X_qry, 100)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:18,613 - INFO - Building codebooks in subspaces - BEGIN\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:18,614 - INFO - \tsubspace 0/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:18,877 - INFO - \tsubspace 1/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:19,266 - INFO - \tsubspace 2/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:19,527 - INFO - \tsubspace 3/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:19,821 - INFO - \tsubspace 4/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:20,192 - INFO - \tsubspace 5/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:20,654 - INFO - \tsubspace 6/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:21,007 - INFO - \tsubspace 7/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:21,371 - INFO - Building codebooks in subspaces - DONE\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:21,372 - INFO - 0/10000: 10000\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:25,919 - INFO - Start Querying ...\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:25,973 - INFO - \t100/120: 0.0005s per query\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:25,973 - INFO - \t\tknn: 0.0001;\tdistance: 0.0001;\tresult: 0.0000;\tdistab: 0.0003\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:25,983 - INFO - Querying Finished!\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:25,984 - INFO - Average querying time: 0.0005\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"(array([[4875, 1010, 2924, ..., 1778, 9669, 6937],\n",
" [ 935, 7988, 6960, ..., 6155, 2862, 8552],\n",
" [4070, 6554, 2425, ..., 5717, 8434, 7749],\n",
" ..., \n",
" [6695, 6395, 8149, ..., 2395, 3857, 7193],\n",
" [5368, 77, 1238, ..., 1319, 8494, 2513],\n",
" [3920, 9361, 9265, ..., 1398, 2621, 4062]], dtype=int32),\n",
" array([[ 22.39608955, 22.64003181, 23.18744469, ..., 24.55779076,\n",
" 24.56216049, 24.56312752],\n",
" [ 21.87729645, 22.44916153, 22.9182663 , ..., 24.82294655,\n",
" 24.83014107, 24.83216476],\n",
" [ 24.05742455, 24.07415771, 24.18813896, ..., 26.00284767,\n",
" 26.00819397, 26.01751709],\n",
" ..., \n",
" [ 23.31334496, 23.70270348, 24.25839233, ..., 26.0488205 ,\n",
" 26.04973793, 26.05724144],\n",
" [ 24.52768135, 24.54415512, 24.70788956, ..., 26.36787415,\n",
" 26.36966705, 26.37992859],\n",
" [ 22.33849525, 22.79646301, 22.85318565, ..., 24.45709991,\n",
" 24.45932007, 24.46750259]], dtype=float32))"
]
}
],
"prompt_number": 2
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save indexer"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create Product Quantization Indexer\n",
"idx = hdidx.indexer.PQIndexer()\n",
"# set storage for the indexer, this time we store the indexes into LMDB\n",
"idx.set_storage('lmdb', {'path': '/tmp/pq.idx'})\n",
"# build indexer\n",
"idx.build({'vals': X_db, 'nsubq': 8})\n",
"# save the index information for future use\n",
"idx.save('/tmp/pq.info')\n",
"# add database items to the indexer\n",
"idx.add(X_db)\n",
"# searching in the database, and return top-100 items for each query\n",
"idx.search(X_qry, 100)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:26,219 - INFO - Building codebooks in subspaces - BEGIN\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:26,219 - INFO - \tsubspace 0/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:26,635 - INFO - \tsubspace 1/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:27,027 - INFO - \tsubspace 2/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:27,354 - INFO - \tsubspace 3/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:27,619 - INFO - \tsubspace 4/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:27,953 - INFO - \tsubspace 5/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:28,295 - INFO - \tsubspace 6/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:28,721 - INFO - \tsubspace 7/8:\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:29,056 - INFO - Building codebooks in subspaces - DONE\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:29,057 - INFO - 0/10000: 10000\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,449 - INFO - Start Querying ...\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,568 - INFO - \t100/120: 0.0012s per query\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,569 - INFO - \t\tknn: 0.0004;\tdistance: 0.0005;\tresult: 0.0000;\tdistab: 0.0003\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,594 - INFO - Querying Finished!\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,594 - INFO - Average querying time: 0.0012\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 3,
"text": [
"(array([[33870, 6606, 25826, ..., 2502, 12495, 26082],\n",
" [42343, 49349, 13409, ..., 44196, 2002, 18067],\n",
" [47133, 20148, 37617, ..., 8761, 33348, 36543],\n",
" ..., \n",
" [ 2807, 10529, 37991, ..., 8355, 47615, 30899],\n",
" [13999, 36660, 3608, ..., 44946, 15916, 20692],\n",
" [36386, 13856, 46034, ..., 41069, 45679, 4505]], dtype=int32),\n",
" array([[ 22.21693611, 22.44924736, 22.58147049, ..., 23.68716049,\n",
" 23.69098854, 23.70442963],\n",
" [ 22.1412468 , 22.20767593, 22.21963882, ..., 24.13736343,\n",
" 24.13786697, 24.13886642],\n",
" [ 23.51828194, 23.58364487, 23.73697472, ..., 25.25225258,\n",
" 25.25513077, 25.25598907],\n",
" ..., \n",
" [ 23.4031868 , 23.53798866, 23.63532448, ..., 25.11426353,\n",
" 25.13235092, 25.13462448],\n",
" [ 23.5990181 , 23.77312851, 23.82258415, ..., 25.51331329,\n",
" 25.51656532, 25.51693535],\n",
" [ 21.98418999, 22.25404167, 22.51370621, ..., 23.68246841,\n",
" 23.68491936, 23.6861496 ]], dtype=float32))"
]
}
],
"prompt_number": 3
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load existing indexer"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# create Product Quantization Indexer\n",
"idx = hdidx.indexer.PQIndexer()\n",
"# set storage for the indexer, this time we load the indexes from LMDB\n",
"idx.set_storage('lmdb', {'path': '/tmp/pq.idx'})\n",
"# save the index information for future use\n",
"idx.load('/tmp/pq.info')\n",
"# searching in the database, and return top-100 items for each query\n",
"idx.search(X_qry, 100)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:31,882 - INFO - Start Querying ...\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:32,000 - INFO - \t100/120: 0.0012s per query\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:32,001 - INFO - \t\tknn: 0.0004;\tdistance: 0.0005;\tresult: 0.0000;\tdistab: 0.0003\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:32,024 - INFO - Querying Finished!\n"
]
},
{
"output_type": "stream",
"stream": "stderr",
"text": [
"2015-04-08 16:51:32,025 - INFO - Average querying time: 0.0012\n"
]
},
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"(array([[33870, 6606, 25826, ..., 2502, 12495, 26082],\n",
" [42343, 49349, 13409, ..., 44196, 2002, 18067],\n",
" [47133, 20148, 37617, ..., 8761, 33348, 36543],\n",
" ..., \n",
" [ 2807, 10529, 37991, ..., 8355, 47615, 30899],\n",
" [13999, 36660, 3608, ..., 44946, 15916, 20692],\n",
" [36386, 13856, 46034, ..., 41069, 45679, 4505]], dtype=int32),\n",
" array([[ 22.21693611, 22.44924736, 22.58147049, ..., 23.68716049,\n",
" 23.69098854, 23.70442963],\n",
" [ 22.1412468 , 22.20767593, 22.21963882, ..., 24.13736343,\n",
" 24.13786697, 24.13886642],\n",
" [ 23.51828194, 23.58364487, 23.73697472, ..., 25.25225258,\n",
" 25.25513077, 25.25598907],\n",
" ..., \n",
" [ 23.4031868 , 23.53798866, 23.63532448, ..., 25.11426353,\n",
" 25.13235092, 25.13462448],\n",
" [ 23.5990181 , 23.77312851, 23.82258415, ..., 25.51331329,\n",
" 25.51656532, 25.51693535],\n",
" [ 21.98418999, 22.25404167, 22.51370621, ..., 23.68246841,\n",
" 23.68491936, 23.6861496 ]], dtype=float32))"
]
}
],
"prompt_number": 4
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment