Created
June 14, 2014 14:07
-
-
Save tklein23/9b79eb44e298e6719fd4 to your computer and use it in GitHub Desktop.
benchmark of feature-hashing implementations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This program is free software; you can redistribute it and/or modify | |
* it under the terms of the GNU General Public License as published by | |
* the Free Software Foundation; either version 3 of the License, or | |
* (at your option) any later version. | |
* | |
* Written (W) 2014 Thoralf Klein | |
*/ | |
#include <shogun/base/init.h> | |
#include <shogun/lib/common.h> | |
#include <shogun/lib/Hash.h> | |
#include <shogun/lib/SGSparseVector.h> | |
#include <shogun/mathematics/Random.h> | |
#include <shogun/lib/Time.h> | |
#include <string.h> | |
#include <cassert> | |
using namespace shogun; | |
SGSparseVector<float64_t> | |
hash_sparse_vector_abinash(SGSparseVector<float64_t> vec, | |
int32_t dim, | |
uint32_t seed) | |
{ | |
SGVector<float64_t> h_vec(dim); | |
h_vec.zero(); | |
for (index_t i = 0; i < vec.num_feat_entries; i++) | |
{ | |
uint32_t hash = CHash::MurmurHash3((uint8_t *) &vec.features[i].feat_index, sizeof(index_t), | |
seed); | |
h_vec[hash % dim] += vec.features[i].entry; | |
} | |
int32_t num_nnz_features = 0; | |
for (index_t i = 0; i < dim; i++) | |
{ | |
if (h_vec[i] != 0) | |
{ | |
num_nnz_features++; | |
} | |
} | |
SGSparseVector<float64_t> sv(num_nnz_features); | |
int32_t sparse_index = 0; | |
for (index_t i = 0; i < dim; i++) | |
{ | |
if (h_vec[i] != 0) | |
{ | |
sv.features[sparse_index].entry = h_vec[i]; | |
sv.features[sparse_index++].feat_index = i; | |
} | |
} | |
sv.sort_features(true); | |
return sv; | |
} | |
SGSparseVector<float64_t> | |
hash_sparse_vector_tklein23(SGSparseVector <float64_t> &src, | |
uint32_t seed, | |
uint32_t hash_size) | |
{ | |
// SGSparseVector<float64_t> dst = SGSparseVector<float64_t>(src.num_feat_entries); | |
SGSparseVector<float64_t> dst(src.num_feat_entries); | |
for (int32_t j = 0; j < src.num_feat_entries; j++) | |
{ | |
uint32_t mm3 = CHash::MurmurHash3((uint8_t *) & src.features[j].feat_index, 4, seed); | |
dst.features[j].feat_index = (mm3 >> 1) % hash_size; | |
dst.features[j].entry = (mm3 % 2 == 1 ? -1.0 : 1.0) * src.features[j].entry; | |
} | |
dst.sort_features(true); | |
return dst; | |
} | |
int main(int argc, char * argv[]) | |
{ | |
init_shogun_with_defaults(); | |
int32_t rand_init = 12345; | |
int32_t num_runs = 1 << 8; // 256 | |
int32_t num_features = 1 << 18; // 256*1024 | |
int32_t dim_features = 1 << 28; // 256*1024*1024 | |
uint32_t hash_seed = 23; | |
uint32_t hash_dim = 1 << 12; | |
// create random sparse vector | |
CTime * prepare_time = new CTime(); | |
prepare_time->start(); | |
CRandom * prng = new CRandom(rand_init); | |
SGSparseVector<float64_t> f = SGSparseVector<float64_t>(num_features); | |
for (int32_t i = 0; i < num_features; i++) | |
{ | |
f.features[i].feat_index = prng->random(0, dim_features); | |
f.features[i].entry = 1.0; | |
} | |
f.sort_features(true); | |
assert(f.num_feat_entries > 0); | |
prepare_time->stop(); | |
SG_UNREF(prng); | |
printf("[%.1f sec] created random sparse vector: %d features, %d dimensions, [min;max;len]=[%d;%d;%d]\n", | |
prepare_time->cur_time_diff(), | |
num_features, | |
dim_features, | |
f.features[0].feat_index, | |
f.features[f.num_feat_entries - 1].feat_index, | |
f.num_feat_entries | |
); | |
SG_UNREF(prepare_time); | |
// benchmark tklein23 hashing function | |
{ | |
CTime * timer = new CTime(); | |
timer->start(); | |
for (int32_t i = 0; i < num_runs; i++) | |
{ | |
SGSparseVector<float64_t> hf = hash_sparse_vector_tklein23(f, hash_seed, hash_dim); | |
} | |
timer->stop(); | |
float64_t iter_time = timer->cur_time_diff(); | |
printf("[%.1f sec] tklein23 implementation -- %.1f\n", iter_time, num_runs / iter_time); | |
SG_UNREF(timer); | |
} | |
// benchmark abinash hashing function | |
{ | |
CTime * timer = new CTime(); | |
timer->start(); | |
for (int32_t i = 0; i < num_runs; i++) | |
{ | |
SGSparseVector<float64_t> hf = hash_sparse_vector_abinash(f, hash_dim, hash_seed); | |
} | |
timer->stop(); | |
float64_t iter_time = timer->cur_time_diff(); | |
printf("[%.1f sec] abinash implementation -- %.1f\n", iter_time, num_runs / iter_time); | |
SG_UNREF(timer); | |
} | |
exit_shogun(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment