Skip to content

Instantly share code, notes, and snippets.

@huyhoang17
Created November 13, 2018 15:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save huyhoang17/6ad54db6d861c10b36a2da047f746d33 to your computer and use it in GitHub Desktop.
Save huyhoang17/6ad54db6d861c10b36a2da047f746d33 to your computer and use it in GitHub Desktop.
PQ code from Product Quantization paper
import numpy as np
from scipy.cluster.vq import vq, kmeans2
from scipy.spatial.distance import cdist
def train(vec, M, Ks=256):
    """Learn one k-means codebook per sub-space.

    The columns of ``vec`` are split into ``M`` consecutive groups of
    ``Ds = vec.shape[1] // M`` columns each, and ``kmeans2`` (with its
    default settings) is run independently on every group.  If the number
    of columns is not a multiple of ``M``, the trailing columns beyond
    ``M * Ds`` are not used.

    Args:
        vec: 2-D array of training vectors, one vector per row.
        M: number of sub-spaces.
        Ks: number of centroids learned per sub-space.

    Returns:
        float32 array of shape ``(M, Ks, Ds)``; entry ``[m]`` holds the
        centroids for the m-th sub-space.
    """
    sub_dim = vec.shape[1] // M
    codebooks = np.empty((M, Ks, sub_dim), dtype=np.float32)
    for sub_idx in range(M):
        start = sub_idx * sub_dim
        stop = start + sub_dim
        # kmeans2 returns (centroids, labels); only the centroids are kept.
        centroids, _labels = kmeans2(vec[:, start:stop], Ks)
        codebooks[sub_idx] = centroids
    return codebooks
def encode(codeword, vec):
    """Quantize vectors into PQ codes using trained codebooks.

    Each row of ``vec`` is split into ``M`` consecutive sub-vectors of
    ``Ds`` columns, and every sub-vector is replaced by the index of its
    nearest centroid in the matching codebook (as found by ``vq``).

    Args:
        codeword: array of shape ``(M, Ks, Ds)`` holding ``Ks`` centroids
            for each of the ``M`` sub-spaces.
        vec: 2-D array of vectors to encode, one vector per row.

    Returns:
        Unsigned-integer array of shape ``(vec.shape[0], M)`` with the
        centroid index chosen in every sub-space.  The dtype is ``uint8``
        when ``Ks <= 256``, ``uint16`` when ``Ks <= 65536`` and ``uint32``
        otherwise.
    """
    M, Ks, Ds = codeword.shape
    # Use the smallest unsigned dtype able to hold indices 0..Ks-1.  A fixed
    # uint8 would silently wrap indices modulo 256 for larger codebooks.
    if Ks <= 2 ** 8:
        code_dtype = np.uint8
    elif Ks <= 2 ** 16:
        code_dtype = np.uint16
    else:
        code_dtype = np.uint32
    pqcode = np.empty((vec.shape[0], M), dtype=code_dtype)
    for m in range(M):
        vec_sub = vec[:, m * Ds: (m + 1) * Ds]
        # vq returns (codes, distances); only the codes are stored.
        pqcode[:, m], _ = vq(vec_sub, codeword[m])
    return pqcode
def search(codeword, pqcode, query):
    """Compute the distance from a query to every PQ-encoded vector.

    For each sub-space a lookup table of squared Euclidean distances
    between the query's sub-vector and all centroids is built.  The
    distance to an encoded vector is then the sum, over sub-spaces, of the
    table entries selected by that vector's codes.

    Args:
        codeword: array of shape ``(M, Ks, Ds)`` with the centroids of each
            sub-space.
        pqcode: integer array of shape ``(N, M)`` with centroid indices.
        query: 1-D query vector; its first ``M * Ds`` entries are used.

    Returns:
        float32 array of shape ``(N,)`` with one summed distance per
        encoded vector.
    """
    M, Ks, Ds = codeword.shape
    lookup = np.empty((M, Ks), dtype=np.float32)
    for sub_idx, centroids in enumerate(codeword):
        lo = sub_idx * Ds
        hi = lo + Ds
        piece = query[lo:hi]
        # cdist needs 2-D inputs, so wrap the single sub-vector in a list
        # and take row 0 of the (1, Ks) result.
        sq_dists = cdist([piece], centroids, 'sqeuclidean')
        lookup[sub_idx, :] = sq_dists[0]
    # np.arange(M) broadcasts against the (N, M) codes, selecting
    # lookup[m, pqcode[n, m]] for every vector n and sub-space m.
    per_subspace = lookup[np.arange(M), pqcode]
    return per_subspace.sum(axis=1)
if __name__ == '__main__':
    # Demo sizes: vectors to index, vectors used for training, dimension.
    n_database, n_train, dim = 10000, 2000, 128
    n_subspaces = 8

    # Random float32 data, drawn in this order: 10,000 128-dim vectors to
    # be indexed, 2,000 128-dim vectors for training, one 128-dim query.
    database = np.random.random((n_database, dim)).astype(np.float32)
    training = np.random.random((n_train, dim)).astype(np.float32)
    query_vec = np.random.random((dim,)).astype(np.float32)

    # Learn the codebooks, encode the database, then score the query.
    codebooks = train(training, n_subspaces)
    codes = encode(codebooks, database)
    distances = search(codebooks, codes, query_vec)
    print(distances)

    # Report the ten entries with the smallest distance to the query.
    for idx in distances.argsort()[:10]:
        print("Id: {} -> Dist: {}".format(idx, distances[idx]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment