Last active
August 29, 2015 14:02
-
-
Save tklein23/7d84358f63babc4a5fbf to your computer and use it in GitHub Desktop.
Loading SVMlight file using streaming API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Written (W) 2013 Thoralf Klein <thoralf.klein@zib.de> | |
* Copyright (C) 2013 Zuse-Institute-Berlin (ZIB) | |
* Copyright (C) 2013 Thoralf Klein | |
*/ | |
#include <shogun/io/streaming/StreamingAsciiFile.h> | |
#include <shogun/features/streaming/StreamingSparseFeatures.h> | |
#include <shogun/lib/SGVector.h> | |
#include <shogun/lib/SGSparseVector.h> | |
#include <shogun/lib/SGSparseMatrix.h> | |
using namespace shogun; | |
void | |
parse_svmlight_file (const char *fname, int32_t & num_examples, | |
int32_t & num_dimensions, int64_t & num_nzz_features) | |
{ | |
CStreamingAsciiFile *file = new CStreamingAsciiFile (fname); | |
SG_REF (file); | |
num_examples = 0; | |
num_dimensions = 0; | |
num_nzz_features = 0; | |
CStreamingSparseFeatures < float64_t > *stream_features = | |
new CStreamingSparseFeatures < float64_t > (file, true, 1024); | |
SG_REF (stream_features); | |
// printf ("on-the-fly feature processing...\n"); | |
stream_features->start_parser (); | |
while (stream_features->get_next_example ()) | |
{ | |
num_examples += 1; | |
num_nzz_features += stream_features->get_nnz_features_for_vector (); | |
// SGSparseVector<float64_t> vec = stream_features->get_vector(); | |
// SGSparseVector < float64_t > vec = stream_features->get_vector().clone(); | |
// printf("x_%d", num_examples); | |
// vec.display_vector(""); | |
stream_features->release_example (); | |
} | |
num_dimensions = stream_features->get_dim_feature_space (); | |
printf("finished reading %d examples with %d dimensions, sparseness-factor %f.\n", | |
num_examples, num_dimensions, | |
1.0 * num_examples * num_dimensions / num_nzz_features); | |
stream_features->end_parser (); | |
SG_UNREF (stream_features); | |
SG_UNREF (file); | |
} | |
/**
 * Load an SVMlight file into a pre-sized sparse matrix and return its labels.
 *
 * Each example is read through the streaming parser, its features are
 * sorted, optionally truncated at max_dimension, scaled to unit Euclidean
 * norm (unless the norm is below 1e-8) and stored in
 * feats.sparse_matrix[i]. Progress and the approximate memory footprint are
 * printed to stdout.
 *
 * At most feats.num_vectors examples are stored. Surplus examples in the
 * file are read and discarded, and a warning is written to stderr, instead
 * of writing past the end of the label vector and the matrix.
 *
 * @param fname          path of the file handed to CStreamingAsciiFile
 * @param feats          destination matrix; num_vectors and num_features
 *                       must already be set by the caller
 * @param max_dimension  if greater than -1, entries whose feat_index is
 *                       >= max_dimension are dropped from every vector
 * @return label vector of length feats.num_vectors holding +1 for labels > 0
 *         and -1 otherwise; entries for which no example was read are 0
 */
SGVector < int32_t >
load_svmlight_file (const char *fname, SGSparseMatrix < float64_t > feats, const int32_t max_dimension)
{
  SGVector < int32_t > labs (feats.num_vectors);
  // Entries that never receive an example stay at a defined value.
  labs.zero ();

  CStreamingAsciiFile *file = new CStreamingAsciiFile (fname);
  SG_REF (file);

  int32_t num_examples = 0;
  int32_t num_skipped = 0;
  int64_t matrix_size = sizeof (SGSparseVector < float64_t >) * feats.num_vectors;

  // NOTE(review): the arguments (true, 1024) are presumably "is labelled"
  // and the parser buffer size -- confirm against the Shogun headers.
  CStreamingSparseFeatures < float64_t > *stream_features =
    new CStreamingSparseFeatures < float64_t > (file, true, 1024);
  SG_REF (stream_features);

  printf ("building sparse matrix (max dimension: %d)...\n", max_dimension);
  stream_features->start_parser ();
  while (stream_features->get_next_example ())
    {
      if (num_examples >= feats.num_vectors)
        {
          // The destination is full. Storing this example would overrun
          // labs and feats.sparse_matrix, so only count it.
          // NOTE(review): we keep consuming instead of breaking on the
          // assumption that end_parser() expects the stream to be read to
          // its end -- confirm.
          stream_features->release_example ();
          num_skipped += 1;
          continue;
        }

      labs[num_examples] = stream_features->get_label () > 0 ? +1 : -1;
      stream_features->sort_features ();
      // Take a private copy before the example is handed back to the parser.
      SGSparseVector < float64_t > vec = stream_features->get_vector ().clone ();
      stream_features->release_example ();

      // If max_dimension is given, cut the vector at the first entry whose
      // index is out of range. This relies on the entries having been
      // sorted by sort_features() above.
      if (max_dimension > -1)
        {
          for (int32_t i = 0; i < vec.num_feat_entries; i++)
            {
              if (vec.features[i].feat_index >= max_dimension)
                {
                  vec.num_feat_entries = i;
                  break;
                }
            }
        }

      // Scale to unit length; (near-)zero vectors are left untouched to
      // avoid dividing by zero.
      float64_t norm_vec = sqrtl (vec.sparse_dot (vec));
      if (fabsl (norm_vec) >= 0.00000001)
        {
          for (int32_t i = 0; i < vec.num_feat_entries; i++)
            {
              vec.features[i].entry /= norm_vec;
            }
        }

      ASSERT (vec.num_feat_entries <= feats.num_features);
      feats.sparse_matrix[num_examples] = vec;
      num_examples += 1;
      matrix_size += sizeof (SGSparseVectorEntry < float64_t >) * vec.num_feat_entries;
    }

  if (num_skipped > 0)
    {
      fprintf (stderr,
               "warning: %d examples skipped, sparse matrix only has room for %d vectors\n",
               num_skipped, feats.num_vectors);
    }

  printf ("finished loading %d examples into sparse matrix (%.1f MiB)\n",
          num_examples, 1.0 * matrix_size / 1024 / 1024);

  stream_features->end_parser ();
  SG_UNREF (stream_features);
  SG_UNREF (file);
  return labs;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment