Skip to content

Instantly share code, notes, and snippets.

@AlbertoEAF
Last active January 16, 2021 23:19
Show Gist options
  • Save AlbertoEAF/5972db15a27c294bab65b97e1bc4c315 to your computer and use it in GitHub Desktop.
LightGBM single row predict benchmark script
/**
*
* Quick & dirty Single Row Predict benchmark.
*
*
* Add this .cpp to a new "profiling/" folder and the following line to the end of CMakeLists.txt:
*
* OPTION(BUILD_PROFILING_TESTS "Set to ON to compile profiling executables for development and benchmarks." OFF)
* if(BUILD_PROFILING_TESTS)
* # For profiling builds with valgrind/callgrind use -DUSE_DEBUG=ON
* add_executable(lightgbm_profile_single_row_predict profiling/profile_single_row_predict.cpp ${SOURCES})
* endif(BUILD_PROFILING_TESTS)
*
*
* Requirements:
*
* - Add a "LightGBM_model.txt" file at the repo root.
* - Adapt ``values`` below to your model to have at least 2 different input rows.
*
* Compilation:
*
* cmake .. -DBUILD_PROFILING_TESTS=ON && make -j4
*
* Usage:
*
* time ./lightgbm_profile_single_row_predict <# threads> <# points> [f] # f uses the Fast single row prediction
*
*
* Alberto Ferreira, 2021
*/
#include <math.h>
#include <stdio.h>

#include <chrono>
#include <cstring>
#include <ctime>
#include <iostream>
#include <stdexcept>
#include <thread>
#include <vector>

#include "LightGBM/c_api.h"
using namespace std;
#define FMT_HEADER_ONLY
#include "LightGBM/../../external_libs/fmt/include/fmt/format.h"
/**
 * Runs one single-row prediction through the standard (non-fast) C API.
 *
 * @param handle         Booster to predict with.
 * @param data           Pointer to one row of `ncol` doubles (row-major).
 * @param ncol           Number of features in the row.
 * @param num_iterations Number of boosting iterations to use.
 * @param out_len        [out] Number of scores written (1 for normal predict).
 * @param out_result     [out] Destination for the predicted score.
 * @throws std::runtime_error with LightGBM's last error message on failure.
 */
inline void predict(BoosterHandle handle,
                    const void* data,
                    int32_t ncol,
                    int num_iterations,
                    int64_t* out_len,
                    double* out_result) {
  const int rc = LGBM_BoosterPredictForMatSingleRow(
      handle,
      data,
      C_API_DTYPE_FLOAT64,
      ncol,
      1,  // is_row_major
      C_API_PREDICT_NORMAL,
      0,  // start_iteration
      num_iterations,
      "",  // no extra prediction parameters
      out_len,
      out_result);
  if (rc != 0) {
    // Surface LightGBM's own error text instead of an opaque std::exception.
    throw std::runtime_error(LGBM_GetLastError());
  }
}
/**
 * Worker routine: runs predictions for indices [start, end), cycling through
 * the `nrows` available input rows so consecutive calls score different rows.
 *
 * Each prediction `i` writes its score to out_scores[i]; out_len is shared
 * scratch (every call overwrites it with the same value, 1).
 */
void predict_n(
    BoosterHandle boosterHandle,
    double *data,
    const size_t nrows,
    int ncol,
    int num_iterations,
    int64_t *out_len,
    double* out_scores,
    const size_t start,
    const size_t end) {
  for (size_t idx = start; idx != end; ++idx) {
    // Round-robin over the input rows.
    const size_t row_offset = (idx % nrows) * ncol;
    predict(boosterHandle, &data[row_offset], ncol, num_iterations,
            out_len, &out_scores[idx]);
  }
}
/**
 * Runs one single-row prediction through the Fast C API (pre-configured
 * via LGBM_BoosterPredictForMatSingleRowFastInit).
 *
 * @param handle     Pre-initialized fast-predict configuration.
 * @param data       Pointer to one row of feature values.
 * @param out_len    [out] Number of scores written.
 * @param out_result [out] Destination for the predicted score.
 * @throws std::runtime_error with LightGBM's last error message on failure.
 */
inline void predict_fast(FastConfigHandle handle,
                         const void* data,
                         int64_t* out_len,
                         double* out_result) {
  if (0 != LGBM_BoosterPredictForMatSingleRowFast(handle, data, out_len, out_result)) {
    // Surface LightGBM's own error text instead of an opaque std::exception.
    throw std::runtime_error(LGBM_GetLastError());
  }
}
/**
 * Worker routine (Fast variant): runs fast-path predictions for indices
 * [start, end), cycling through the `nrows` available input rows.
 *
 * Each prediction `i` writes its score to out_scores[i]; out_len is shared
 * scratch (every call overwrites it with the same value, 1).
 */
void predict_fast_n(
    FastConfigHandle handle,
    double *data,
    const size_t nrows,
    const size_t ncol,
    int64_t *out_len,
    double* out_scores,
    const size_t start,
    const size_t end) {
  for (size_t idx = start; idx != end; ++idx) {
    // Round-robin over the input rows.
    const size_t row_offset = (idx % nrows) * ncol;
    predict_fast(handle, &data[row_offset], out_len, &out_scores[idx]);
  }
}
int main(int argc, char **argv) {
// Input parsing & experiment setup:
if (argc < 2) {
// argv[1] = #threads
// argv[2] == "f" ? => Use Fast variant.
cout << "Please pass #threads!\n";
exit(1);
}
const int nthreads = std::atoi(argv[1]);
const size_t N_PREDICTIONS = size_t(std::atol(argv[2]));
bool fast_mode = strcmp(argv[3], "f") == 0;
cout << "fast_mode=" << fast_mode << "\n";
cout << "start\n";
BoosterHandle boosterHandle;
int num_iterations;
LGBM_BoosterCreateFromModelfile("./LightGBM_model.txt", &num_iterations, &boosterHandle);
cout << "Model iterations " << num_iterations<< "\n";
/*
Dataset:
feature_names=amount num1_float num2_double num3_int
fraud := 400<amount<700 & cat1_string="C"~=2 & num1_float < 70
Use input "rows" that provide different output scores to ensure thread-safety:
*/
double values[] = {
0.25, 1.4, 0.12, -0.5,
500, 2, 9999, 200,
};
const size_t NROWS=2;
const int NUM_FEATURES = 4;
double ref_scores[NUM_FEATURES * NROWS];
int64_t dummy_out_len;
std::vector<double> scores(N_PREDICTIONS);
FastConfigHandle fastConfigHandle;
LGBM_BoosterPredictForMatSingleRowFastInit(boosterHandle, C_API_PREDICT_NORMAL, 0, num_iterations, C_API_DTYPE_FLOAT64, NUM_FEATURES, "", &fastConfigHandle);
// Generate 2 distinct reference scores - 1 per input row:
predict(boosterHandle, values, NUM_FEATURES, num_iterations, &dummy_out_len, &ref_scores[0]);
predict(boosterHandle, values+NUM_FEATURES, NUM_FEATURES, num_iterations, &dummy_out_len, &ref_scores[1]);
fmt::print("Ref scores: {:.6g}, {:.6g}\n", ref_scores[0], ref_scores[1]);
// Schedule work ////////////////////////////////////////////////////////////////////////////////////////////
const size_t full_span = scores.size();
const size_t base_thread_span = full_span / nthreads;
fmt::print("Work span={}, {} threads, items/thread ~= {}\n", full_span, nthreads, base_thread_span);
auto t0 = std::clock();
std::vector<std::thread> threads;
for (int nthread = 0; nthread < nthreads; ++nthread) {
const size_t start = nthread * base_thread_span;
const size_t end = nthread < nthreads-1 ? start + base_thread_span : full_span;
fmt::print("Thread {} [{}:{}] ({} items)\n", nthread, start, end, end-start);
if (fast_mode) {
threads.push_back(std::thread(&predict_fast_n, fastConfigHandle, values, NROWS, NUM_FEATURES, &dummy_out_len, scores.data(), start, end));
} else {
threads.push_back(std::thread(&predict_n, boosterHandle, values, NROWS, NUM_FEATURES, num_iterations, &dummy_out_len, scores.data(), start, end));
}
}
for (auto &th: threads)
th.join();
// Check output scores against reference scores /////////////////////////////////////////////////////////////
for (size_t i = 0; i < N_PREDICTIONS; ++i) {
const size_t row = i%2;
const double error = scores[i]-ref_scores[row];
if (abs(error) > 1e-30) {
fmt::print("{} Score {} ref_score {}\n", i, scores[i], ref_scores[row]);
fmt::print("{} Score error: {}\n", i, error);
}
}
cout << "len=" << dummy_out_len << endl;
cout << "end\n";
auto t_exec = double(clock() - t0) / CLOCKS_PER_SEC;
cout << "Executed in " << t_exec << "s\n";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment