Last active
February 7, 2024 03:04
-
-
Save cbecker/fb628bec3c179fc49617ba369cbb1aab to your computer and use it in GitHub Desktop.
LightGBM C++ example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <LightGBM/boosting.h>
#include <LightGBM/config.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/metric.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/utils/common.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
int main(int argc, char **argv) | |
{ | |
/* create example dataset */ | |
std::random_device rd; | |
std::mt19937 gen(rd()); | |
// one random generator for every class | |
std::vector<std::normal_distribution<>> dists = { | |
std::normal_distribution<>(0, 1), | |
std::normal_distribution<>(10, 1)}; | |
/* create raw data */ | |
const int numSamples = 5000; | |
const int numFeats = 2; | |
const int numClasses = static_cast<int>(dists.size()); | |
std::cout << "Num classes: " << numClasses << std::endl; | |
// labels | |
std::vector<float> labels(numSamples); | |
for (int i=0; i < numSamples; i++) | |
labels[i] = i % numClasses; | |
std::vector< std::vector<double> > features(numSamples); | |
for (int i=0; i < numSamples; i++) | |
{ | |
features[i].resize(numFeats); | |
for (int j=0; j < numFeats; j++) | |
{ | |
const auto lbl = static_cast<int>(labels[i]); | |
features[i][j] = dists[lbl](gen); | |
} | |
} | |
// prepare sample data | |
std::vector< std::vector<double> > sampleData(numFeats); | |
for (int i=0; i < numSamples; i++) | |
{ | |
for (int j=0; j < numFeats; j++) | |
sampleData[j].push_back(features[i][j]); | |
} | |
/** Load dataset **/ | |
LightGBM::IOConfig io_config; | |
io_config.num_class = numClasses; | |
io_config.max_bin = 255; | |
io_config.verbosity = 10; | |
std::unique_ptr<LightGBM::Dataset> dset; | |
LightGBM::DatasetLoader loader(io_config, nullptr, numClasses, nullptr); | |
dset.reset( loader.CostructFromSampleData(sampleData, numSamples, numSamples) ); | |
for (int i = 0; i < numSamples; ++i) | |
{ | |
const int thread_id = 0; | |
dset->PushOneRow(thread_id, i, features[i]); | |
} | |
dset->FinishLoad(); | |
// check bins | |
for(int j=0; j < numFeats; j++) | |
{ | |
const auto nbins = dset->FeatureAt(j)->bin_mapper()->num_bin(); | |
std::cout << "Feat " << numFeats << std::endl; | |
std::cout << " " << dset->FeatureAt(j)->bin_mapper()->BinToValue(0) << " "; | |
std::cout << " " << dset->FeatureAt(j)->bin_mapper()->BinToValue(nbins-2) << " "; | |
std::cout << std::endl; | |
} | |
if (!dset->SetFloatField("label", labels.data(), numSamples)) { | |
std::cout << "Error setting label" << std::endl; | |
return -1; | |
} | |
/** Prepare boosting **/ | |
LightGBM::BoostingConfig boostConfig; | |
boostConfig.num_iterations = 100; | |
boostConfig.bagging_freq = 1; | |
boostConfig.bagging_fraction = 0.5; | |
boostConfig.num_class = numClasses; | |
// tree params | |
boostConfig.tree_config.min_data_in_leaf = 10; | |
boostConfig.tree_config.num_leaves = 16; | |
//boostConfig.tree_config.min_sum_hessian_in_leaf = 0; | |
LightGBM::ObjectiveConfig objConfig; | |
objConfig.num_class = numClasses; | |
// objConfig.label_gain.clear(); | |
// objConfig.label_gain.resize(numClasses, 1.0); | |
auto *objFunc = LightGBM::ObjectiveFunction::CreateObjectiveFunction("multiclass", objConfig); | |
objFunc->Init(dset->metadata(), dset->num_data()); | |
LightGBM::MetricConfig metricConfig; | |
metricConfig.num_class = numClasses; | |
std::vector< std::unique_ptr<LightGBM::Metric> > trainMetrics; | |
auto metric = std::unique_ptr<LightGBM::Metric>( | |
LightGBM::Metric::CreateMetric("multi_logloss", metricConfig)); | |
metric->Init(dset->metadata(), dset->num_data()); | |
trainMetrics.push_back(std::move(metric)); | |
auto *booster = LightGBM::Boosting::CreateBoosting(LightGBM::BoostingType::kGBDT, nullptr); | |
booster->Init(&boostConfig, nullptr, objFunc, | |
LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics)); | |
booster->ResetTrainingData(&boostConfig, dset.get(), objFunc, | |
LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics)); | |
// booster->AddValidDataset(dset.get(), LightGBM::Common::ConstPtrInVectorWrapper<LightGBM::Metric>(trainMetrics)); | |
for (int i=0; i < boostConfig.num_iterations; i++) | |
{ | |
std::cout << "Iteration " << (i+1) << std::endl; | |
auto scores = booster->GetEvalAt(0); | |
for(auto &v: scores) | |
std::cout << "Score: " << v << std::endl; | |
if (booster->TrainOneIter(nullptr, nullptr, false)) | |
{ | |
std::cout << "Breaking.." << std::endl; | |
break; | |
} | |
} | |
booster->SetNumIterationForPred(0); // predict with all trees | |
/** Predict training data **/ | |
std::vector<int> predictedClass(numSamples); | |
for (int i=0; i < numSamples; i++) | |
{ | |
auto predVec = booster->PredictRaw(features[i].data()); | |
const auto predMax = std::max_element(predVec.begin(), predVec.end()); | |
predictedClass[i] = std::distance(predVec.begin(), predMax); | |
} | |
// compute error | |
double err = 0; | |
for (int i=0; i < numSamples; i++) | |
{ | |
if (predictedClass[i] != labels[i]) | |
{ | |
err++; | |
} | |
} | |
err /= labels.size(); | |
std::cout << "Training error: " << err << std::endl; | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: the signature of `CostructFromSampleData` has since been changed to:
CostructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col,
size_t total_sample_size, data_size_t num_data)