@cbecker
Last active February 7, 2024 03:04
lightGBM C++ example
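Generates a two-class synthetic dataset from two Gaussians, trains a multiclass GBDT through LightGBM's C++ interface, and prints the training error.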
#include <LightGBM/config.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/utils/common.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
int main(int argc, char **argv)
{
    /* create example dataset */
    std::random_device rd;
    std::mt19937 gen(rd());

    // one random generator for every class
    std::vector<std::normal_distribution<>> dists = {
        std::normal_distribution<>(0, 1),
        std::normal_distribution<>(10, 1)};

    /* create raw data */
    const int numSamples = 5000;
    const int numFeats = 2;
    const int numClasses = static_cast<int>(dists.size());

    std::cout << "Num classes: " << numClasses << std::endl;

    // labels
    std::vector<float> labels(numSamples);
    for (int i = 0; i < numSamples; i++)
        labels[i] = i % numClasses;

    std::vector<std::vector<double>> features(numSamples);
    for (int i = 0; i < numSamples; i++)
    {
        features[i].resize(numFeats);
        for (int j = 0; j < numFeats; j++)
        {
            const auto lbl = static_cast<int>(labels[i]);
            features[i][j] = dists[lbl](gen);
        }
    }

    // prepare sample data
    std::vector<std::vector<double>> sampleData(numFeats);
    for (int i = 0; i < numSamples; i++)
    {
        for (int j = 0; j < numFeats; j++)
            sampleData[j].push_back(features[i][j]);
    }

    /** Load dataset **/
    LightGBM::IOConfig io_config;
    io_config.num_class = numClasses;
    io_config.max_bin = 255;
    io_config.verbosity = 10;

    std::unique_ptr<LightGBM::Dataset> dset;
    LightGBM::DatasetLoader loader(io_config, nullptr, numClasses, nullptr);
    dset.reset(loader.CostructFromSampleData(sampleData, numSamples, numSamples));

    for (int i = 0; i < numSamples; ++i)
    {
        const int thread_id = 0;
        dset->PushOneRow(thread_id, i, features[i]);
    }
    dset->FinishLoad();
    // check bins
    for (int j = 0; j < numFeats; j++)
    {
        const auto nbins = dset->FeatureAt(j)->bin_mapper()->num_bin();
        std::cout << "Feat " << j << std::endl;
        std::cout << " " << dset->FeatureAt(j)->bin_mapper()->BinToValue(0) << " ";
        std::cout << " " << dset->FeatureAt(j)->bin_mapper()->BinToValue(nbins - 2) << " ";
        std::cout << std::endl;
    }
    if (!dset->SetFloatField("label", labels.data(), numSamples))
    {
        std::cout << "Error setting label" << std::endl;
        return -1;
    }
    /** Prepare boosting **/
    LightGBM::BoostingConfig boostConfig;
    boostConfig.num_iterations = 100;
    boostConfig.bagging_freq = 1;
    boostConfig.bagging_fraction = 0.5;
    boostConfig.num_class = numClasses;

    // tree params
    boostConfig.tree_config.min_data_in_leaf = 10;
    boostConfig.tree_config.num_leaves = 16;
    //boostConfig.tree_config.min_sum_hessian_in_leaf = 0;

    LightGBM::ObjectiveConfig objConfig;
    objConfig.num_class = numClasses;
    // objConfig.label_gain.clear();
    // objConfig.label_gain.resize(numClasses, 1.0);

    auto *objFunc = LightGBM::ObjectiveFunction::CreateObjectiveFunction("multiclass", objConfig);
    objFunc->Init(dset->metadata(), dset->num_data());

    LightGBM::MetricConfig metricConfig;
    metricConfig.num_class = numClasses;

    std::vector<std::unique_ptr<LightGBM::Metric>> trainMetrics;
    auto metric = std::unique_ptr<LightGBM::Metric>(
        LightGBM::Metric::CreateMetric("multi_logloss", metricConfig));
    metric->Init(dset->metadata(), dset->num_data());
    trainMetrics.push_back(std::move(metric));

    auto *booster = LightGBM::Boosting::CreateBoosting(LightGBM::BoostingType::kGBDT, nullptr);

    booster->Init(&boostConfig, nullptr, objFunc,
                  LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics));
    booster->ResetTrainingData(&boostConfig, dset.get(), objFunc,
                               LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics));
    // booster->AddValidDataset(dset.get(), LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics));

    for (int i = 0; i < boostConfig.num_iterations; i++)
    {
        std::cout << "Iteration " << (i + 1) << std::endl;

        auto scores = booster->GetEvalAt(0);
        for (auto &v : scores)
            std::cout << "Score: " << v << std::endl;

        if (booster->TrainOneIter(nullptr, nullptr, false))
        {
            std::cout << "Breaking.." << std::endl;
            break;
        }
    }
    booster->SetNumIterationForPred(0); // predict with all trees

    /** Predict training data **/
    std::vector<int> predictedClass(numSamples);
    for (int i = 0; i < numSamples; i++)
    {
        auto predVec = booster->PredictRaw(features[i].data());
        const auto predMax = std::max_element(predVec.begin(), predVec.end());
        predictedClass[i] = static_cast<int>(std::distance(predVec.begin(), predMax));
    }

    // compute error
    double err = 0;
    for (int i = 0; i < numSamples; i++)
    {
        if (predictedClass[i] != labels[i])
            err++;
    }
    err /= labels.size();
    std::cout << "Training error: " << err << std::endl;

    return EXIT_SUCCESS;
}
@einvince
The signature of CostructFromSampleData has changed in newer versions of LightGBM:

CostructFromSampleData(double** sample_values,
                       int** sample_indices, int num_col, const int* num_per_col,
                       size_t total_sample_size, data_size_t num_data)
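
For reference, here is a rough, untested sketch of how the construction step in the gist might be adapted to that newer signature. It assumes every sampled value is kept, so each column's index array is simply 0..numSamples-1; the names sample_values, colIndices, sample_indices, and num_per_col are introduced here purely for illustration.

// Hedged sketch (untested): adapting the CostructFromSampleData call
// to the newer signature quoted above.
std::vector<double*> sample_values(numFeats);       // per-column value pointers
std::vector<std::vector<int>> colIndices(numFeats, std::vector<int>(numSamples));
std::vector<int*> sample_indices(numFeats);         // per-column row-index pointers
std::vector<int> num_per_col(numFeats, numSamples); // every row sampled in each column
for (int j = 0; j < numFeats; j++)
{
    sample_values[j] = sampleData[j].data();
    for (int i = 0; i < numSamples; i++)
        colIndices[j][i] = i;                       // dense sampling: row i is kept
    sample_indices[j] = colIndices[j].data();
}
dset.reset(loader.CostructFromSampleData(sample_values.data(), sample_indices.data(),
                                         numFeats, num_per_col.data(),
                                         numSamples, numSamples));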
