Skip to content

Instantly share code, notes, and snippets.

@tklein23
Last active August 29, 2015 14:02
Show Gist options
  • Save tklein23/7d84358f63babc4a5fbf to your computer and use it in GitHub Desktop.
Save tklein23/7d84358f63babc4a5fbf to your computer and use it in GitHub Desktop.
Loading SVMlight file using streaming API
/*
* Written (W) 2013 Thoralf Klein <thoralf.klein@zib.de>
* Copyright (C) 2013 Zuse-Institute-Berlin (ZIB)
* Copyright (C) 2013 Thoralf Klein
*/
#include <shogun/io/streaming/StreamingAsciiFile.h>
#include <shogun/features/streaming/StreamingSparseFeatures.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/SGSparseVector.h>
#include <shogun/lib/SGSparseMatrix.h>
using namespace shogun;
void
parse_svmlight_file (const char *fname, int32_t & num_examples,
int32_t & num_dimensions, int64_t & num_nzz_features)
{
CStreamingAsciiFile *file = new CStreamingAsciiFile (fname);
SG_REF (file);
num_examples = 0;
num_dimensions = 0;
num_nzz_features = 0;
CStreamingSparseFeatures < float64_t > *stream_features =
new CStreamingSparseFeatures < float64_t > (file, true, 1024);
SG_REF (stream_features);
// printf ("on-the-fly feature processing...\n");
stream_features->start_parser ();
while (stream_features->get_next_example ())
{
num_examples += 1;
num_nzz_features += stream_features->get_nnz_features_for_vector ();
// SGSparseVector<float64_t> vec = stream_features->get_vector();
// SGSparseVector < float64_t > vec = stream_features->get_vector().clone();
// printf("x_%d", num_examples);
// vec.display_vector("");
stream_features->release_example ();
}
num_dimensions = stream_features->get_dim_feature_space ();
printf("finished reading %d examples with %d dimensions, sparseness-factor %f.\n",
num_examples, num_dimensions,
1.0 * num_examples * num_dimensions / num_nzz_features);
stream_features->end_parser ();
SG_UNREF (stream_features);
SG_UNREF (file);
}
/*
 * Stream the file `fname` into the pre-sized sparse matrix `feats`, scaling
 * every example to unit Euclidean norm, and return one label per row.
 *
 * Parameters:
 *   fname         - path handed to CStreamingAsciiFile.
 *   feats         - destination matrix; rows 0..k-1 of feats.sparse_matrix are
 *                   overwritten, where k is the number of examples stored.
 *                   NOTE(review): feats is taken by value, so this relies on
 *                   SGSparseMatrix copies sharing their storage with the
 *                   caller's object - confirm.
 *   max_dimension - if > -1, entries whose feat_index is >= max_dimension are
 *                   cut off before normalisation; pass -1 to keep everything.
 *
 * Returns a vector of feats.num_vectors labels, +1 where the streamed label is
 * > 0 and -1 otherwise. If the file holds fewer examples than
 * feats.num_vectors, the remaining label slots are not written here.
 *
 * At most feats.num_vectors examples are stored. Surplus examples in the file
 * are read and discarded (and reported on stderr) instead of being written
 * past the end of `labs` and `feats.sparse_matrix`.
 */
SGVector < int32_t >
load_svmlight_file (const char *fname, SGSparseMatrix < float64_t > feats, const int32_t max_dimension)
{
  SGVector < int32_t > labs(feats.num_vectors);

  CStreamingAsciiFile *file = new CStreamingAsciiFile (fname);
  SG_REF (file);

  int32_t num_examples = 0;
  int32_t num_skipped = 0;
  // Running estimate of the matrix footprint: one SGSparseVector per row,
  // plus the entries of each stored row (added inside the loop).
  int64_t matrix_size = sizeof (SGSparseVector < float64_t >) * feats.num_vectors;

  CStreamingSparseFeatures < float64_t > *stream_features =
    new CStreamingSparseFeatures < float64_t > (file, true, 1024);
  SG_REF (stream_features);

  printf("building sparse matrix (max dimension: %d)...\n", max_dimension);
  stream_features->start_parser ();

  while (stream_features->get_next_example ())
    {
      // The destination buffers hold exactly feats.num_vectors rows. Keep
      // consuming the stream so the parser reaches the end of the input, but
      // do not store anything beyond the last row.
      if (num_examples >= feats.num_vectors)
        {
          stream_features->release_example ();
          num_skipped += 1;
          continue;
        }

      labs[num_examples] = stream_features->get_label () > 0 ? +1 : -1;

      // Take a private copy of the example before handing it back to the stream.
      stream_features->sort_features ();
      SGSparseVector < float64_t > vec = stream_features->get_vector ().clone ();
      stream_features->release_example ();

      // If max_dimension is given, truncate the sparse vector at the first
      // entry whose index is out of range.
      // NOTE(review): this assumes sort_features() left the entries ordered
      // by feat_index, so everything after that entry is out of range too.
      if (max_dimension > -1)
        {
          for (int32_t i = 0; i < vec.num_feat_entries; i++)
            {
              if (vec.features[i].feat_index >= max_dimension)
                {
                  vec.num_feat_entries = i;
                  break;
                }
            }
        }

      // Scale to unit length; (near-)zero vectors are left untouched to avoid
      // dividing by zero.
      float64_t norm_vec = sqrtl(vec.sparse_dot(vec));
      if (fabsl(norm_vec) >= 0.00000001)
        {
          for (int32_t i = 0; i < vec.num_feat_entries; i++)
            {
              vec.features[i].entry /= norm_vec;
            }
        }

      ASSERT(vec.num_feat_entries <= feats.num_features);
      feats.sparse_matrix[num_examples] = vec;

      num_examples += 1;
      matrix_size += sizeof (SGSparseVectorEntry < float64_t >) * vec.num_feat_entries;
    }

  if (num_skipped > 0)
    {
      fprintf (stderr,
               "warning: matrix has room for %d examples only, ignored %d further examples\n",
               num_examples, num_skipped);
    }

  printf ("finished loading %d examples into sparse matrix (%.1f MiB)\n",
          num_examples, 1.0 * matrix_size / 1024 / 1024);

  stream_features->end_parser ();

  SG_UNREF (stream_features);
  SG_UNREF (file);

  return labs;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment