Code used to create vector data sets with different magnitude distributions and to benchmark them with Lucene's KnnGraphTester
#!/usr/bin/env python
import os
import subprocess
import benchUtil
import constants
LUCENE_CHECKOUT = 'lucene_candidate'
# test parameters. This script will run KnnGraphTester on every combination of these parameters
VALUES = {
    'ndoc': (100000,),  # 200000),
    'maxConn': (48,),
    'beamWidthIndex': (200,),
    'fanout': (0, 10, 50, 90, 190, 490, 590, 990),  # 250),
    'topK': (10,),
}
def advance(ix, values):
    # Odometer-style iteration over every combination of the parameter values:
    # bump the last index that is not yet at its maximum, resetting the ones
    # after it; return False once all combinations have been visited.
    for i in reversed(range(len(ix))):
        param = list(values.keys())[i]
        if ix[i] == len(values[param]) - 1:
            ix[i] = 0
        else:
            ix[i] += 1
            return True
    return False
def run_knn_benchmark(checkout, values, training_file, testing_file, dims, metric):
    indexes = [0] * len(values.keys())
    indexes[-1] = -1
    args = []
    print(f"\n\n\nNow running {training_file}\n\n\n")
    dim = dims  # 768
    doc_vectors = training_file  # '%s/util/wiki768ja.random.train' % constants.BASE_DIR  # constants.GLOVE_VECTOR_DOCS_FILE
    query_vectors = testing_file  # '%s/util/wiki768ja.test' % constants.BASE_DIR  # '%s/util/tasks/vector-task-100d.vec' % constants.BASE_DIR
    cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout))
    JAVA_EXE = '/Users/benjamintrent/Library/Java/JavaVirtualMachines/jdk-20.0.1.jdk/Contents/Home/bin/java'
    cmd = [JAVA_EXE,
           '-cp', cp,
           '--add-modules', 'jdk.incubator.vector',
           '-Dorg.apache.lucene.store.MMapDirectory.enableMemorySegments=false',
           'KnnGraphTester']
    print("recall\tlatency\tnDoc\tfanout\tmaxConn\tbeamWidth\tvisited\tindex ms")
    while advance(indexes, values):
        pv = {}
        args = []
        for (i, p) in enumerate(list(values.keys())):
            if p in values:
                if values[p]:
                    value = values[p][indexes[i]]
                    pv[p] = value
                else:
                    args += ['-' + p]
        args += [a for (k, v) in pv.items() for a in ('-' + k, str(v)) if a]
        this_cmd = cmd + args + [
            '-dim', str(dim),
            '-docs', doc_vectors,
            # '-stats',
            # '-reindex',
            '-metric', metric,
            '-search', query_vectors,
            # '-forceMerge',
            # '-niter', str(3000),
            '-quiet',
        ]
        subprocess.run(this_cmd)
test_names = ["normal_1_1", "normal_1_2", "pareto", "uniform", "bimodal_5", "bimodal_9", "gamma_1_1", "gamma_2_2"]
tests = []
for name in test_names:
    tests.append((f"{constants.BASE_DIR}/util/{name}.ordered.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.random.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.reversed.train", f"{constants.BASE_DIR}/util/{name}.test", 384, "angular"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.ordered-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.random-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
    tests.append((f"{constants.BASE_DIR}/util/{name}.reversed-transform.train", f"{constants.BASE_DIR}/util/{name}-transform.test", 385, "euclidean"))
for (training_file, testing_file, dims, metric) in tests:
    run_knn_benchmark(LUCENE_CHECKOUT, VALUES, training_file, testing_file, dims, metric)
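If you want the results in a file as well as on the terminal, one option is to capture each run's stdout. This is only a sketch, not part of the original gist: it assumes each KnnGraphTester invocation prints a single tab-separated result row matching the header printed in run_knn_benchmark, and "results.tsv" is an arbitrary name.

import subprocess

# Sketch: collect benchmark output into a TSV file in addition to the terminal.
# Assumes one tab-separated result row per KnnGraphTester run; the output
# file name is arbitrary.
def run_and_log(this_cmd, out_path="results.tsv"):
    completed = subprocess.run(this_cmd, capture_output=True, text=True)
    print(completed.stdout, end="")
    with open(out_path, "a") as out:
        out.write(completed.stdout)
    if completed.returncode != 0:
        print(completed.stderr)

Swapping subprocess.run(this_cmd) for run_and_log(this_cmd) inside the while loop would be enough.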
import numpy as np
# Load the data
table = np.load('data/embeddings.npy')
# check that all vectors are unit vectors
assert np.allclose(np.linalg.norm(table, axis=1), 1)
# scale all vector magnitudes via a uniform distribution
uniform = (np.random.uniform(0, 1, table.shape) * 10) * table
# linearly scale all vector magnitudes via a normal distribution
normal_1 = (np.random.normal(loc=1.0, scale=0.1, size=table.shape) * 10) * table
normal_2 = (np.random.normal(loc=1.0, scale=0.2, size=table.shape) * 10) * table
# scale all vector magnitudes via a gamma distribution
gamma_1 = (np.random.gamma(1, 1, table.shape) * 10) * table
gamma_2 = (np.random.gamma(2, 2, table.shape) * 10) * table
# scale all vector magnitudes via pareto distribution
pareto_5 = (np.random.pareto(5, table.shape) * 10) * table
# scale via a bimodal mixture of two normals (suffix = weight on the first mode: 0.9 vs 0.5)
bimodal_9 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.9) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.1)) * 10) * table
bimodal_5 = (((np.random.normal(loc=1, scale=0.2, size=table.shape) * 0.5) + (np.random.normal(loc=3, scale=0.2, size=table.shape) * 0.5)) * 10) * table
def save_to_file(filename, dataset):
    np.save(filename, dataset.astype(np.float32), allow_pickle=False)
for ds in [(uniform, "uniform"), (normal_1, "normal_1_1"), (normal_2, "normal_1_2"), (gamma_1, "gamma_1_1"), (gamma_2, "gamma_2_2"), (pareto_5, "pareto"), (bimodal_9, "bimodal_9"), (bimodal_5, "bimodal_5")]:
    save_to_file("data/{}.npy".format(ds[1]), ds[0])
import numpy as np
DATA_SETS = [
    {"name": "uniform", "files": ["uniform.npy"]},
    {"name": "normal_1_1", "files": ["normal_1_1.npy"]},
    {"name": "normal_1_2", "files": ["normal_1_2.npy"]},
    {"name": "gamma_1_1", "files": ["gamma_1_1.npy"]},
    {"name": "gamma_2_2", "files": ["gamma_2_2.npy"]},
    {"name": "pareto", "files": ["pareto.npy"]},
    {"name": "bimodal_9", "files": ["bimodal_9.npy"]},
    {"name": "bimodal_5", "files": ["bimodal_5.npy"]},
]
def transform_queries(Q):
    n, _ = Q.shape
    # Append a zero so query dimensionality matches the transformed documents.
    return np.concatenate([Q, np.zeros((n, 1))], axis=-1, dtype=np.float32)

def transform_docs(D, norms):
    n, d = D.shape
    max_norm = norms.max()
    flipped_norms = np.copy(norms).reshape(n, 1)
    transformed_data = np.concatenate([D, np.sqrt(max_norm**2 - flipped_norms**2)], axis=-1, dtype=np.float32)
    return transformed_data
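For context, transform_queries and transform_docs implement the standard reduction from maximum-inner-product search to Euclidean nearest-neighbor search: queries get a 0 appended, documents get sqrt(max_norm^2 - ||d||^2) appended, so the squared Euclidean distance between a transformed query and a transformed document equals ||q||^2 + max_norm^2 - 2<q, d>, which decreases as the inner product grows. A small numerical check of that identity (a sketch on random data, not part of the original gist):

import numpy as np

# Sketch: verify that the appended-dimension transform turns inner-product
# ranking into Euclidean ranking. Random data, not part of the pipeline above.
rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 8)).astype(np.float32)
query = rng.normal(size=(8,)).astype(np.float32)
doc_norms = np.linalg.norm(docs, axis=1)
max_norm = doc_norms.max()
docs_t = np.concatenate([docs, np.sqrt(max_norm**2 - doc_norms**2).reshape(-1, 1)], axis=1)
query_t = np.concatenate([query, [0.0]])
# Squared Euclidean distance on the transformed vectors ...
dist_sq = ((docs_t - query_t) ** 2).sum(axis=1)
# ... equals ||q||^2 + max_norm^2 - 2 * <q, d>, a constant minus twice the inner product.
expected = query @ query + max_norm**2 - 2 * docs @ query
assert np.allclose(dist_sq, expected, atol=1e-3)
# So ascending Euclidean order matches descending inner-product order.
assert (np.argsort(dist_sq) == np.argsort(-(docs @ query))).all()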
def validate_array_match_upto_dim(arr1, arr2, dim_eq_upto):
    assert np.allclose(arr1[:dim_eq_upto], arr2[:dim_eq_upto]), "data sets are different"

def validate_dataset_match_upto_dim(arr1, arr2, dim_eq_upto):
    n1, d1 = arr1.shape
    n2, d2 = arr2.shape
    assert n1 == n2, f"Shapes do not match [{arr1.shape}] vs [{arr2.shape}]"
    for i in range(n1):
        validate_array_match_upto_dim(arr1[i], arr2[i], dim_eq_upto)
for data_set in DATA_SETS:
    name = data_set["name"]
    np_total = np.load(data_set["files"][0])
    assert np_total.shape == (522931, 384)
    assert np_total.dtype == np.float32
    assert np.isnan(np_total).sum() == 0
    # Have to convert to a list here to get
    # the numpy ndarray's shape correct later.
    # There's probably a better way...
    flat_ds = list()
    for vec in np_total:
        flat_ds.append(vec)
    np_flat_ds = np.array(flat_ds)
    assert np_flat_ds.shape == (522931, 384)
    assert np_flat_ds.dtype == np.float32
    assert np.isnan(np_flat_ds).sum() == 0
    row_count = np_flat_ds.shape[0]
    query_count = 10_000
    training_rows = row_count - query_count
    print(f"{name} num rows: {training_rows}")
    # The last query_count rows are held out as the query (test) set.
    transformed_queries = transform_queries(np_flat_ds[training_rows:])
    validate_dataset_match_upto_dim(transformed_queries, np_flat_ds[training_rows:], 384)
    with open(f"{name}-transform.test", "wb") as out_f:
        transformed_queries.tofile(out_f)
    with open(f"{name}.test", "wb") as out_f:
        np_flat_ds[training_rows:].tofile(out_f)
    magnitudes = np.linalg.norm(np_flat_ds[0:training_rows], axis=1)
    print("mean median var max min")
    print(f"{np.mean(magnitudes)} {np.median(magnitudes)} {np.var(magnitudes)} {np.max(magnitudes)} {np.min(magnitudes)}")
    # Sort the training vectors by magnitude to produce the ordered/reversed variants.
    indices = np.argsort(magnitudes)
    transformed_np_flat_ds = transform_docs(np_flat_ds[0:training_rows], magnitudes)
    validate_dataset_match_upto_dim(transformed_np_flat_ds, np_flat_ds[0:training_rows], 384)
    transformed_np_flat_ds_sorted = transformed_np_flat_ds[indices]
    np_flat_ds_sorted = np_flat_ds[indices]
    with open(f"{name}.random-transform.train", "wb") as out_f:
        transformed_np_flat_ds.tofile(out_f)
    with open(f"{name}.ordered-transform.train", "wb") as out_f:
        transformed_np_flat_ds_sorted.tofile(out_f)
    with open(f"{name}.reversed-transform.train", "wb") as out_f:
        np.flip(transformed_np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.random.train", "wb") as out_f:
        np_flat_ds[0:training_rows].tofile(out_f)
    with open(f"{name}.reversed.train", "wb") as out_f:
        np.flip(np_flat_ds_sorted, axis=0).tofile(out_f)
    with open(f"{name}.ordered.train", "wb") as out_f:
        np_flat_ds_sorted.tofile(out_f)
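The .train and .test files written above are raw float32 values with no header or shape information (that is what ndarray.tofile produces), so reading one back requires supplying the dimension: 384 for the plain files, 385 for the -transform variants. A small sketch for loading them back, not part of the original gist:

import numpy as np

# Sketch: read one of the generated raw-float32 files back into an (n, dim) array.
def load_vectors(path, dim):
    flat = np.fromfile(path, dtype=np.float32)
    assert flat.size % dim == 0, f"{path} does not contain a whole number of {dim}-dim vectors"
    return flat.reshape(-1, dim)

train = load_vectors("uniform.ordered.train", 384)      # written by the loop above
queries = load_vectors("uniform-transform.test", 385)   # transformed queries have one extra dimension
print(train.shape, queries.shape)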