#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>
#include <random>
#include <vector>

#include <cuda_runtime.h>  // cudaMalloc / cudaMemcpy / cudaDeviceSynchronize

#include <raft/core/device_resources.hpp>
#include <raft/distance/distance_types.hpp>
#include <raft/neighbors/ivf_pq.cuh>
#include <raft/neighbors/specializations.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
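
// Minimal end-to-end IVF-PQ example with RAFT: generate random float vectors
// on the host, copy them to the device, build an IVF-PQ index over the
// training set, search it with a small query batch, and print the top-k
// neighbors of the first and last queries.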
int main(int argc, char** argv) {
  const size_t nColumns = 256;
  const size_t nTrainRows = 65536;
  const size_t nQueryRows = 16;
  const size_t nLists = 256;
  const size_t nSubq = 8;
  const size_t nProbe = 16;
  const bool isTrainShared = true;
  const size_t topk = 32;
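  // Parameter notes: nLists is the number of IVF clusters (coarse centroids);
  // nSubq is the number of PQ sub-quantizers, so each 256-dim vector is
  // encoded as 8 sub-vectors of 32 dims; nProbe is how many clusters are
  // scanned per query at search time; topk is the number of neighbors
  // returned per query.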
  std::default_random_engine rng(123);
  std::uniform_real_distribution<float> u(0, 1);
  std::vector<float> hDataTrain(nColumns * nTrainRows, 0);
  for (size_t i = 0; i < hDataTrain.size(); i++) {
    hDataTrain[i] = u(rng);
  }
  std::vector<float> hDataQuery(nColumns * nQueryRows, 0);
  for (size_t i = 0; i < hDataQuery.size(); i++) {
    hDataQuery[i] = u(rng);
  }
  // Leaked deliberately: this example skips cudaFree and error checking.
  float* dDataTrain = nullptr;
  cudaMalloc((void**)&dDataTrain, sizeof(float) * hDataTrain.size());
  cudaMemcpy(
      dDataTrain,
      hDataTrain.data(),
      sizeof(float) * hDataTrain.size(),
      cudaMemcpyHostToDevice);
  float* dDataQuery = nullptr;
  cudaMalloc((void**)&dDataQuery, sizeof(float) * hDataQuery.size());
  cudaMemcpy(
      dDataQuery,
      hDataQuery.data(),
      sizeof(float) * hDataQuery.size(),
      cudaMemcpyHostToDevice);
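  // Note (not in the original gist): the raw cudaMalloc/cudaMemcpy pairs
  // could be replaced with rmm::device_uvector (one extra include,
  // <rmm/device_uvector.hpp>), which frees its memory automatically.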
  // RAFT types
  using data_t = float;
  using idx_t = int64_t;
  using raft_index_t = raft::neighbors::ivf_pq::index<idx_t>;

  std::vector<idx_t> hIndices(nQueryRows * topk, -1);
  std::vector<data_t> hDistances(nQueryRows * topk, -1);
  // Leaked deliberately, as above.
  idx_t* dIndices = nullptr;
  cudaMalloc((void**)&dIndices, sizeof(idx_t) * hIndices.size());
  data_t* dDistances = nullptr;
  cudaMalloc((void**)&dDistances, sizeof(data_t) * hDistances.size());
  raft::device_resources handle;

  auto data_view = raft::make_device_matrix_view<const data_t, idx_t>(
      dDataTrain, nTrainRows, nColumns);

  // Keep the remaining parameters at their defaults.
  raft::neighbors::ivf_pq::index_params index_params;
  index_params.n_lists = nLists;
  index_params.pq_dim = nSubq;
  index_params.add_data_on_build = true;
  // PER_SUBSPACE trains one codebook per PQ subspace and is faster to build;
  // PER_CLUSTER trains one codebook per IVF cluster and is typically more
  // accurate.
  index_params.codebook_kind = (isTrainShared)
      ? raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE
      : raft::neighbors::ivf_pq::codebook_gen::PER_CLUSTER;
  // Build the index (k-means clustering + PQ codebook training).
  auto raft_index = raft::neighbors::ivf_pq::build<data_t, idx_t>(
      handle, index_params, data_view);
  // Wait for the build to finish.
  handle.sync_stream();
  // Unneeded, but just to be extra safe.
  cudaDeviceSynchronize();
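  // Because add_data_on_build is true, the training vectors were also added
  // to the index during build; otherwise they (or further batches) could be
  // appended afterwards with raft::neighbors::ivf_pq::extend.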
  // Search: device views over the queries and the output buffers.
  auto query_view = raft::make_device_matrix_view<const data_t, idx_t>(
      dDataQuery, nQueryRows, nColumns);
  auto inds_view =
      raft::make_device_matrix_view<idx_t, idx_t>(dIndices, nQueryRows, topk);
  auto dists_view = raft::make_device_matrix_view<data_t, idx_t>(
      dDistances, nQueryRows, topk);

  raft::neighbors::ivf_pq::search_params search_params;
  search_params.n_probes = nProbe;
  // Search the index.
  raft::neighbors::ivf_pq::search<data_t, idx_t>(
      handle, search_params, raft_index, query_view, inds_view, dists_view);
  handle.sync_stream();
  // Unneeded, but just to be extra safe.
  cudaDeviceSynchronize();
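  // Results are row-major [nQueryRows x topk], each row sorted by increasing
  // distance. With the default metric (L2Expanded), distances are squared
  // Euclidean.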
  // Copy the results back to the host.
  cudaMemcpy(
      hIndices.data(),
      dIndices,
      sizeof(idx_t) * hIndices.size(),
      cudaMemcpyDeviceToHost);
  cudaMemcpy(
      hDistances.data(),
      dDistances,
      sizeof(data_t) * hDistances.size(),
      cudaMemcpyDeviceToHost);
  // Print the results.
  std::cout << "topk for the first query" << std::endl;
  for (size_t i = 0; i < topk; i++) {
    std::cout << hIndices[i] << "\t" << hDistances[i] << std::endl;
  }
  std::cout << std::endl;
  std::cout << "topk for the last query" << std::endl;
  for (size_t i = 0; i < topk; i++) {
    std::cout << hIndices[i + topk * (nQueryRows - 1)] << "\t"
              << hDistances[i + topk * (nQueryRows - 1)] << std::endl;
  }
  std::cout << std::endl;

  return 0;
}