@snakers4, created August 26, 2018
Use faiss to calculate a KNN graph on data
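The snippet below builds an approximate 100-NN graph over vectors stored in a bcolz array: it trains an IVF+PQ index on the GPU, adds all the vectors, then queries the index in batches, streaming the neighbour ids and distances into two on-disk bcolz arrays.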
import gc
import bcolz
import faiss
import numpy as np
from tqdm import tqdm
# open the stored bcolz array
# note that these vectors have to be 280-dimensional to be compatible
# with the faiss GPU PQ indexing used below, see
# https://github.com/facebookresearch/faiss/wiki/Troubleshooting#gpu-precomputed-table-error
bc_path = 'your_vectors.bc'
bc_vectors = bcolz.open(rootdir=bc_path)
# faiss expects a contiguous float32 array
vectors = np.ascontiguousarray(bc_vectors[:, :], dtype=np.float32)
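# PQ56 below splits each vector into 56 sub-vectors,
# so the dimensionality must be divisible by 56 (280 / 56 = 5 dims each)
assert vectors.shape[1] % 56 == 0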
top_k = 100  # number of nearest neighbours to compute per vector
# create on-disk bcolz arrays for the knn graph and for the distances;
# bcolz.carray needs an initial array to infer dtype and shape,
# and faiss returns int64 neighbour ids and float32 distances
knn_bc_path = 'knn.bc'
knn_bc = bcolz.carray(np.empty((0, top_k), dtype=np.int64),
                      rootdir=knn_bc_path, mode='w')
knn_bc.flush()
knn_dist_bc_path = 'distances.bc'
knn_dist_bc = bcolz.carray(np.empty((0, top_k), dtype=np.float32),
                           rootdir=knn_dist_bc_path, mode='w')
knn_dist_bc.flush()
res = faiss.StandardGpuResources()
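# IVF4096: inverted file with 4096 coarse cells;
# PQ56: product quantizer with 56 sub-quantizers of 1 byte each,
# i.e. 56 bytes per encoded vector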
index = faiss.index_factory(vectors.shape[1], "IVF4096,PQ56")
co = faiss.GpuClonerOptions()
# https://github.com/facebookresearch/faiss/tree/master/benchs
# here we are using a 56-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
co.useFloat16 = True
index = faiss.index_cpu_to_gpu(res, 0, index, co)
print("Train the index")
index.train(vectors)
print('Add vectors to the index')
index.add(vectors)
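# sanity check: every vector made it into the index
assert index.ntotal == len(bc_vectors)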
del vectors
gc.collect()
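# nprobe is the number of IVF cells visited per query (out of 4096);
# larger values trade query speed for recall (1 << 8 = 256 here)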
nprobe = 1 << 8
index.setNumProbes(nprobe)
batch_size = 16384 // 2
idx = list(range(len(bc_vectors)))
batches = [idx[i:i + batch_size] for i in range(0, len(bc_vectors), batch_size)]
# check that the batches cover every vector index
assert set(item for sublist in batches for item in sublist) == set(range(len(bc_vectors)))
processed_batches = []
with tqdm(total=len(batches)) as pbar:
    for batch in batches:
        processed_batches.append(batch)
        b_array = np.asarray(batch)
        # bcolz fancy indexing materializes the batch; cast to float32 for faiss
        queries = np.ascontiguousarray(bc_vectors[b_array], dtype=np.float32)
        D, I = index.search(queries, top_k)
        knn_bc.append(I)
        knn_bc.flush()
        knn_dist_bc.append(D)
        knn_dist_bc.flush()
        pbar.update(1)
# check that all vectors were processed
assert set(item for sublist in processed_batches for item in sublist) == set(range(len(bc_vectors)))
assert len(knn_bc) == len(bc_vectors)
assert len(knn_dist_bc) == len(bc_vectors)
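# minimal sketch of reading the graph back from disk: row i holds the
# ids / distances of the top_k nearest neighbours of vector i (the first
# neighbour is usually the vector itself, since the queries were also
# added to the index)
knn = bcolz.open(rootdir=knn_bc_path)
knn_dist = bcolz.open(rootdir=knn_dist_bc_path)
print(knn[0], knn_dist[0])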