-
-
Save jamescalam/a09a16c17b677f2cf9c019114711f3bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import shutil\n", | |
"import urllib.request as request\n", | |
"from contextlib import closing\n", | |
"\n", | |
"SIFT_URL = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'\n", | |
"SIFT_ARCHIVE = 'sift.tar.gz'\n", | |
"\n", | |
"# first we download the Sift1M dataset; the transfer is skipped when the\n", | |
"# archive is already on disk so re-running the notebook does not fetch it again\n", | |
"if not os.path.exists(SIFT_ARCHIVE):\n", | |
"    partial = SIFT_ARCHIVE + '.part'\n", | |
"    with closing(request.urlopen(SIFT_URL)) as r:\n", | |
"        with open(partial, 'wb') as f:\n", | |
"            shutil.copyfileobj(r, f)\n", | |
"    # rename only after the copy finished, so an interrupted download\n", | |
"    # never leaves a truncated sift.tar.gz that the check above would accept\n", | |
"    os.replace(partial, SIFT_ARCHIVE)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import tarfile\n", | |
"\n", | |
"# the download leaves us with a tar.gz file, we unzip it into the current\n", | |
"# directory; the with-block closes the archive handle even if extraction fails\n", | |
"with tarfile.open('sift.tar.gz', \"r:gz\") as tar:\n", | |
"    tar.extractall()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"# now define a function to read the fvecs file format of Sift1M dataset\n", | |
"def read_fvecs(fp):\n", | |
"    \"\"\"Load an fvecs file into a 2-D float32 array, one vector per row.\n", | |
"\n", | |
"    The file is read as a flat int32 buffer. The first value is taken as the\n", | |
"    vector dimension d, the buffer is reshaped into rows of d + 1 values, the\n", | |
"    leading column is dropped, and the remaining bytes are reinterpreted as\n", | |
"    float32.\n", | |
"    \"\"\"\n", | |
"    raw = np.fromfile(fp, dtype='int32')\n", | |
"    dim = raw[0]\n", | |
"    records = raw.reshape(-1, dim + 1)\n", | |
"    # copy so the sliced columns are contiguous before the dtype reinterpretation\n", | |
"    payload = records[:, 1:].copy()\n", | |
"    return payload.view('float32')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# data we will search through; read from the same ./sift/ directory\n", | |
"# as the query file below\n", | |
"xb = read_fvecs('./sift/sift_base.fvecs')  # 1M samples\n", | |
"# also get some query vectors to search with\n", | |
"xq = read_fvecs('./sift/sift_query.fvecs')\n", | |
"# take just one query (there are many in sift_query.fvecs),\n", | |
"# kept 2-D with shape (1, d)\n", | |
"xq = xq[0].reshape(1, xq.shape[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(1, 128)" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"xq.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(1000000, 128)" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"xb.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([[ 1., 3., 11., 110., 62., 22., 4., 0., 43., 21., 22.,\n", | |
" 18., 6., 28., 64., 9., 11., 1., 0., 0., 1., 40.,\n", | |
" 101., 21., 20., 2., 4., 2., 2., 9., 18., 35., 1.,\n", | |
" 1., 7., 25., 108., 116., 63., 2., 0., 0., 11., 74.,\n", | |
" 40., 101., 116., 3., 33., 1., 1., 11., 14., 18., 116.,\n", | |
" 116., 68., 12., 5., 4., 2., 2., 9., 102., 17., 3.,\n", | |
" 10., 18., 8., 15., 67., 63., 15., 0., 14., 116., 80.,\n", | |
" 0., 2., 22., 96., 37., 28., 88., 43., 1., 4., 18.,\n", | |
" 116., 51., 5., 11., 32., 14., 8., 23., 44., 17., 12.,\n", | |
" 9., 0., 0., 19., 37., 85., 18., 16., 104., 22., 6.,\n", | |
" 2., 26., 12., 58., 67., 82., 25., 12., 2., 2., 25.,\n", | |
" 18., 8., 2., 19., 42., 48., 11.]], dtype=float32)" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"xq" | |
] | |
} | |
], | |
"metadata": { | |
"interpreter": { | |
"hash": "b0fa6594d8f4cbf19f97940f81e996739fb7646882a419484c72d19e05852a7e" | |
}, | |
"kernelspec": { | |
"display_name": "Python 3.9.5 64-bit", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
}, | |
"orig_nbformat": 4 | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment