Instantly share code, notes, and snippets.

Embed
What would you like to do?
Benchmark newly added policies of Normalization, NeighborSearch and Interpolation in mlpack's CF module

This gist benchmarks newly added functionalities in mlpack's CF module:

  1. Normalization: NoNormalization, OverallMeanNormalization, UserMeanNormalization, ItemMeanNormalization, CombinedNormalization.
  2. NeighborSearch: EuclideanSearch, CosineSearch, PearsonSearch.
  3. Interpolation: AverageInterpolation, SimilarityInterpolation, RegressionInterpolation.

We use GroupLens-100k as the benchmark dataset. The GroupLens-100k dataset is divided into trainSet and testSet at a ratio of 80%/20%. The model is trained on trainSet, and we compute the RMSE (Root Mean Square Error) on testSet.

Results are presented in three .csv files:

  1. normalization.csv compares the resulting RMSE across different DecompositionPolicies and different NormalizationPolicies. EuclideanSearch is used as the neighbor search policy and AverageInterpolation is used as the interpolation policy.
  2. neighborsearch.csv compares the resulting RMSE across different DecompositionPolicies and different NeighborSearchPolicies. CombinedMeanNormalization (defined below) is used as the data normalization method and AverageInterpolation is used as the interpolation policy.
  3. interpolation.csv compares the resulting RMSE across different DecompositionPolicies and different InterpolationPolicies. CombinedMeanNormalization (defined below) is used as the data normalization method and EuclideanSearch is used as the neighbor search policy.

Hyperparameters are set as follows:

const size_t numUsersForSimilarity = 5;
const size_t rank = 5;
const size_t maxIterations = 200;
const double minResidue = 1e-5;
const bool mit = true;

The CombinedNormalization in the results refers to the following specialization of template CombinedNormalization<...>:

using CombinedMeanNormalization =
    CombinedNormalization<
        OverallMeanNormalization,
        UserMeanNormalization,
        ItemMeanNormalization>;

benchmark.cpp is the program used to generate the results in the form of .csv files.

#include <fstream>
#include <iostream>
#include <string>

#include <armadillo>
#include <mlpack/methods/cf/cf.hpp>
#include <mlpack/methods/cf/decomposition_policies/batch_svd_method.hpp>
#include <mlpack/methods/cf/decomposition_policies/randomized_svd_method.hpp>
#include <mlpack/methods/cf/decomposition_policies/regularized_svd_method.hpp>
#include <mlpack/methods/cf/decomposition_policies/svd_complete_method.hpp>
#include <mlpack/methods/cf/decomposition_policies/svd_incomplete_method.hpp>
#include <mlpack/methods/cf/normalization/no_normalization.hpp>
#include <mlpack/methods/cf/normalization/overall_mean_normalization.hpp>
#include <mlpack/methods/cf/normalization/user_mean_normalization.hpp>
#include <mlpack/methods/cf/normalization/item_mean_normalization.hpp>
#include <mlpack/methods/cf/normalization/z_score_normalization.hpp>
#include <mlpack/methods/cf/normalization/combined_normalization.hpp>
#include <mlpack/methods/cf/neighbor_search_policies/lmetric_search.hpp>
#include <mlpack/methods/cf/neighbor_search_policies/cosine_search.hpp>
#include <mlpack/methods/cf/neighbor_search_policies/pearson_search.hpp>
#include <mlpack/methods/cf/interpolation_policies/average_interpolation.hpp>
#include <mlpack/methods/cf/interpolation_policies/similarity_interpolation.hpp>
#include <mlpack/methods/cf/interpolation_policies/regression_interpolation.hpp>
using namespace mlpack;
using namespace mlpack::cf;

// Hyperparameters shared by every benchmarked configuration. Test() passes
// them, in this order, to the CFType constructor.
const size_t numUsersForSimilarity = 5;
const size_t rank = 5;
const size_t maxIterations = 200;
const double minResidue = 1e-5;
// NOTE(review): presumably mlpack's "terminate only on max iterations" flag;
// confirm against the CFType constructor documentation.
const bool mit = true;

// Benchmark data, filled in by main() before any configuration is run.
arma::mat trainSet;
arma::Mat<size_t> testSet;

// Comma-separated column names used as the tail of every CSV header row. The
// order must match the order in which RunAllDecompositionPolicies() emits its
// results.
std::string decompositionPolicies(
    "NMF,SVDComplete,SVDIncomplete,BatchSVD,RegSVD,RandSVD");

// The normalization reported as "Combined" in the results.
using CombinedMeanNormalization = CombinedNormalization<
    OverallMeanNormalization,
    UserMeanNormalization,
    ItemMeanNormalization>;
template<typename DecompositionPolicy = NMFPolicy,
typename NormalizationPolicy = NoNormalization,
typename NeighborSearchPolicy = EuclideanSearch,
typename InterpolationPolicy = AverageInterpolation>
double Test()
{
DecompositionPolicy decomposition;
CFType<NormalizationPolicy> model(trainSet, decomposition,
numUsersForSimilarity, rank, maxIterations, minResidue, mit);
arma::vec predictions;
const arma::Mat<size_t> combinations =
testSet.submat(0, 0, 1, testSet.n_cols - 1);
// Make predictions.
model.Predict(combinations, predictions);
// Compute RMSE.
double rmse = std::sqrt(arma::accu(arma::pow(
predictions - testSet.row(2).t(), 2)) / testSet.n_cols);
return rmse;
}
// Benchmark one (normalization, neighbor search, interpolation) configuration
// against every decomposition policy and return the six RMSE values as a
// single comma-separated CSV fragment (no label, no trailing newline).
// The values appear in the same order as the column names in the global
// decompositionPolicies header string.
template<typename NormalizationPolicy = NoNormalization,
         typename NeighborSearchPolicy = EuclideanSearch,
         typename InterpolationPolicy = AverageInterpolation>
std::string RunAllDecompositionPolicies()
{
  // Elements of a braced initializer list are evaluated left to right, so the
  // benchmarks still run in column order.
  const double rmsePerPolicy[] = {
      Test<NMFPolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>(),
      Test<SVDCompletePolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>(),
      Test<SVDIncompletePolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>(),
      Test<BatchSVDPolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>(),
      Test<RegSVDPolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>(),
      Test<RandomizedSVDPolicy, NormalizationPolicy, NeighborSearchPolicy,
           InterpolationPolicy>()
  };

  // Join the scores with commas; the separator is empty before the first one.
  std::string row;
  const char* separator = "";
  for (const double rmse : rmsePerPolicy)
  {
    row += separator;
    row += std::to_string(rmse);
    separator = ",";
  }
  return row;
}
// Build the CSV text comparing normalization policies: a header row followed
// by one row per normalization, each holding the RMSE of every decomposition
// policy. Neighbor search and interpolation are left at the defaults of
// RunAllDecompositionPolicies().
std::string CompareAcrossNormalization()
{
  std::string table = "Normalization," + decompositionPolicies + "\n";

  // Append one "<label>,<rmse columns>\n" line to the table.
  const auto addRow = [&table](const std::string& label,
                               const std::string& rmseColumns)
  {
    table += label;
    table += ",";
    table += rmseColumns;
    table += "\n";
  };

  addRow("NoNormalization", RunAllDecompositionPolicies<NoNormalization>());
  addRow("OverallMean",
      RunAllDecompositionPolicies<OverallMeanNormalization>());
  addRow("UserMean", RunAllDecompositionPolicies<UserMeanNormalization>());
  addRow("ItemMean", RunAllDecompositionPolicies<ItemMeanNormalization>());
  addRow("Combined",
      RunAllDecompositionPolicies<CombinedMeanNormalization>());

  return table;
}
// Build the CSV text comparing neighbor search policies: a header row
// followed by one row per search policy. Every row uses
// CombinedMeanNormalization and the default interpolation policy of
// RunAllDecompositionPolicies().
std::string CompareAcrossNeighborSearchPolicies()
{
  std::string table = "NeighborSearch," + decompositionPolicies + "\n";

  // Append one "<label>,<rmse columns>\n" line to the table.
  const auto addRow = [&table](const std::string& label,
                               const std::string& rmseColumns)
  {
    table += label;
    table += ",";
    table += rmseColumns;
    table += "\n";
  };

  addRow("Euclidean",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  EuclideanSearch>());
  addRow("Cosine",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  CosineSearch>());
  addRow("Pearson",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  PearsonSearch>());

  return table;
}
// Build the CSV text comparing interpolation policies: a header row followed
// by one row per interpolation policy. Every row uses
// CombinedMeanNormalization and EuclideanSearch.
std::string CompareAcrossInterpolationPolicies()
{
  std::string table = "Interpolation," + decompositionPolicies + "\n";

  // Append one "<label>,<rmse columns>\n" line to the table.
  const auto addRow = [&table](const std::string& label,
                               const std::string& rmseColumns)
  {
    table += label;
    table += ",";
    table += rmseColumns;
    table += "\n";
  };

  addRow("Average",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  EuclideanSearch,
                                  AverageInterpolation>());
  addRow("SimilarityBased",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  EuclideanSearch,
                                  SimilarityInterpolation>());
  addRow("RegressionBased",
      RunAllDecompositionPolicies<CombinedMeanNormalization,
                                  EuclideanSearch,
                                  RegressionInterpolation>());

  return table;
}
// Write `content` to the file `filename`, replacing any existing file.
//
// Failures are reported on std::cerr instead of being silently dropped, so a
// long benchmark run cannot lose its results unnoticed. The function does not
// throw or abort on failure; the caller simply continues.
void WriteFile(const std::string& filename, const std::string& content)
{
  std::ofstream ofile(filename);
  if (!ofile.is_open())
  {
    std::cerr << "WriteFile: cannot open '" << filename << "' for writing."
        << '\n';
    return;
  }

  ofile << content;
  // Close explicitly so that flush errors show up in the stream state below.
  ofile.close();
  if (!ofile)
  {
    std::cerr << "WriteFile: failed while writing '" << filename << "'."
        << '\n';
  }
}
// Load the train/test split, run the three comparisons and write one CSV file
// per comparison into the current directory.
//
// Returns 0 on success and 1 when the input data cannot be loaded or the test
// set does not have the shape Test() needs.
int main()
{
  // Load data. data::Load() reports failure through its return value; running
  // the benchmark on empty matrices would only produce meaningless output.
  if (!data::Load("ml-100k/train.csv", trainSet))
  {
    std::cerr << "Failed to load ml-100k/train.csv" << '\n';
    return 1;
  }
  if (!data::Load("ml-100k/test.csv", testSet))
  {
    std::cerr << "Failed to load ml-100k/test.csv" << '\n';
    return 1;
  }

  // Test() reads rows 0-2 of testSet and divides by its number of columns.
  if (testSet.n_rows < 3 || testSet.n_cols == 0)
  {
    std::cerr << "ml-100k/test.csv must contain at least 3 rows and 1 column"
        << " after loading" << '\n';
    return 1;
  }

  std::string result;
  result = CompareAcrossNormalization();
  WriteFile("normalization.csv", result);
  result = CompareAcrossNeighborSearchPolicies();
  WriteFile("neighborsearch.csv", result);
  result = CompareAcrossInterpolationPolicies();
  WriteFile("interpolation.csv", result);
  return 0;
}
Interpolation NMF SVDComplete SVDIncomplete BatchSVD RegSVD RandSVD
Average 0.953057 1.635753 1.604061 0.945257 0.948002 0.940722
SimilarityBased 0.950559 1.661185 1.569685 0.949938 0.950395 0.940722
RegressionBased 0.955269 1.626876 1.576476 0.952231 0.942847 0.940721
NeighborSearch NMF SVDComplete SVDIncomplete BatchSVD RegSVD RandSVD
Euclidean 0.951145 1.659544 1.582659 0.948048 0.946519 0.940722
Cosine 0.951144 1.650480 1.600040 0.954414 0.949094 0.940722
Pearson 0.950491 1.657441 1.565452 0.950297 0.946168 0.940722
Normalization NMF SVDComplete SVDIncomplete BatchSVD RegSVD RandSVD
NoNormalization 2.687243 2.594964 2.421016 0.940314 0.951421 3.168779
OverallMean 1.077630 1.767463 1.678172 0.939575 0.932781 1.068154
UserMean 1.006931 1.669093 1.624108 0.942312 0.939175 1.003825
ItemMean 1.011046 1.693937 1.633665 0.965028 0.967486 0.991557
Combined 0.951061 1.635910 1.590888 0.956371 0.952686 0.940721
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment