Skip to content

Instantly share code, notes, and snippets.

@tklein23
Created June 14, 2014 14:07
Show Gist options
  • Save tklein23/9b79eb44e298e6719fd4 to your computer and use it in GitHub Desktop.
Save tklein23/9b79eb44e298e6719fd4 to your computer and use it in GitHub Desktop.
benchmark of feature-hashing implementations
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2014 Thoralf Klein
*/
#include <shogun/base/init.h>
#include <shogun/lib/common.h>
#include <shogun/lib/Hash.h>
#include <shogun/lib/SGSparseVector.h>
#include <shogun/mathematics/Random.h>
#include <shogun/lib/Time.h>
#include <string.h>
#include <cassert>
using namespace shogun;
SGSparseVector<float64_t>
hash_sparse_vector_abinash(SGSparseVector<float64_t> vec,
int32_t dim,
uint32_t seed)
{
SGVector<float64_t> h_vec(dim);
h_vec.zero();
for (index_t i = 0; i < vec.num_feat_entries; i++)
{
uint32_t hash = CHash::MurmurHash3((uint8_t *) &vec.features[i].feat_index, sizeof(index_t),
seed);
h_vec[hash % dim] += vec.features[i].entry;
}
int32_t num_nnz_features = 0;
for (index_t i = 0; i < dim; i++)
{
if (h_vec[i] != 0)
{
num_nnz_features++;
}
}
SGSparseVector<float64_t> sv(num_nnz_features);
int32_t sparse_index = 0;
for (index_t i = 0; i < dim; i++)
{
if (h_vec[i] != 0)
{
sv.features[sparse_index].entry = h_vec[i];
sv.features[sparse_index++].feat_index = i;
}
}
sv.sort_features(true);
return sv;
}
SGSparseVector<float64_t>
hash_sparse_vector_tklein23(SGSparseVector <float64_t> &src,
uint32_t seed,
uint32_t hash_size)
{
// SGSparseVector<float64_t> dst = SGSparseVector<float64_t>(src.num_feat_entries);
SGSparseVector<float64_t> dst(src.num_feat_entries);
for (int32_t j = 0; j < src.num_feat_entries; j++)
{
uint32_t mm3 = CHash::MurmurHash3((uint8_t *) & src.features[j].feat_index, 4, seed);
dst.features[j].feat_index = (mm3 >> 1) % hash_size;
dst.features[j].entry = (mm3 % 2 == 1 ? -1.0 : 1.0) * src.features[j].entry;
}
dst.sort_features(true);
return dst;
}
int main(int argc, char * argv[])
{
init_shogun_with_defaults();
int32_t rand_init = 12345;
int32_t num_runs = 1 << 8; // 256
int32_t num_features = 1 << 18; // 256*1024
int32_t dim_features = 1 << 28; // 256*1024*1024
uint32_t hash_seed = 23;
uint32_t hash_dim = 1 << 12;
// create random sparse vector
CTime * prepare_time = new CTime();
prepare_time->start();
CRandom * prng = new CRandom(rand_init);
SGSparseVector<float64_t> f = SGSparseVector<float64_t>(num_features);
for (int32_t i = 0; i < num_features; i++)
{
f.features[i].feat_index = prng->random(0, dim_features);
f.features[i].entry = 1.0;
}
f.sort_features(true);
assert(f.num_feat_entries > 0);
prepare_time->stop();
SG_UNREF(prng);
printf("[%.1f sec] created random sparse vector: %d features, %d dimensions, [min;max;len]=[%d;%d;%d]\n",
prepare_time->cur_time_diff(),
num_features,
dim_features,
f.features[0].feat_index,
f.features[f.num_feat_entries - 1].feat_index,
f.num_feat_entries
);
SG_UNREF(prepare_time);
// benchmark tklein23 hashing function
{
CTime * timer = new CTime();
timer->start();
for (int32_t i = 0; i < num_runs; i++)
{
SGSparseVector<float64_t> hf = hash_sparse_vector_tklein23(f, hash_seed, hash_dim);
}
timer->stop();
float64_t iter_time = timer->cur_time_diff();
printf("[%.1f sec] tklein23 implementation -- %.1f\n", iter_time, num_runs / iter_time);
SG_UNREF(timer);
}
// benchmark abinash hashing function
{
CTime * timer = new CTime();
timer->start();
for (int32_t i = 0; i < num_runs; i++)
{
SGSparseVector<float64_t> hf = hash_sparse_vector_abinash(f, hash_dim, hash_seed);
}
timer->stop();
float64_t iter_time = timer->cur_time_diff();
printf("[%.1f sec] abinash implementation -- %.1f\n", iter_time, num_runs / iter_time);
SG_UNREF(timer);
}
exit_shogun();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment