Skip to content

Instantly share code, notes, and snippets.

@keisukefukuda
Created February 17, 2014 06:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save keisukefukuda/9045926 to your computer and use it in GitHub Desktop.
Save keisukefukuda/9045926 to your computer and use it in GitHub Desktop.
Random sampling from a data stream using Reservoir algorithm.
// Copyright(C) 2014 by Keisuke Fukuda
// License: MIT License
/*
// http://en.wikipedia.org/wiki/Reservoir_sampling
Reservoir sampling
array R[k]; // result
integer i, j;
// fill the reservoir array
for each i in 1 to k do
R[i] := S[i]
done;
// replace elements with gradually decreasing probability
for each i in k+1 to length(S) do
j := random(1, i); // important: inclusive range
if j <= k then
R[j] := S[i]
fi
done
*/
#include <cstdlib>
#include <cstdio>
#include <ctime>
#include <vector>
#include <string>
#include <iostream>
#include <unistd.h>
using std::string;
using std::vector;
const size_t BUFSIZE = 10000;
const size_t BUF_MIN = 10;
template<class T>
class Reservoir {
int k_; // size of reservoir
int i_; // index of the current input;
std::vector<T> R_; // the resourvoir
protected:
// Returns a random integer number between a and b (inclusive range)
int randint(int a, int b) {
if (a > b) {int tmp = a; a = b; b = tmp;}
int count = b - a + 1;
return rand() % count + a;
}
public:
Reservoir(int k) : k_(k), i_(0), R_() {}
void append(const T& v) {
i_++;
if (i_ <= k_) {
R_.push_back(v);
} else {
int j = randint(1,i_);
if (j <= k_) {
R_[j-1] = v;
}
}
}
inline const std::vector<T>& get() const {
return R_;
}
};
// read a block of string from stream with at least one newline character.
// In addition, newline is normalized to '\n'.
bool xread(std::string & trg, int block_len, std::istream& ifs) {
int nread = 0;
int c;
while(nread < block_len) {
c = ifs.get();
if (!ifs.good()) return true;
if (ifs.good()) {
if ((c == '\r' && ifs.peek() == '\n') || c == '\r' || c == '\n') {
trg.append("\n");
} else {
trg.append(1, (char) c);
}
nread++;
}
}
return !ifs.good();
}
int main(int argc, char **argv) {
int k = 1000000;
srand(time(NULL));
char c;
while ((c = getopt(argc, argv, "n:")) != -1) {
switch(c) {
case 'n':
k = atoi(optarg);
if (k <= 0) {
fprintf(stderr, "Invalid value for -n: %s\n", optarg);
exit(-1);
}
break;
case '?':
if (optopt == 'n') {
fprintf(stderr, "ERROR: Option -%c takes an argument.\n", c);
exit(-1);
}
default:
fprintf(stderr, "ERROR: Unknown option: %c\n", c);
exit(-1);
}
}
Reservoir<std::string> R(k);
string s;
int eof = 0;
while (1) {
// read more string from stdin, until at
if (s.size() == 0 && eof) {
break;
}
if (s.size() < BUF_MIN && !eof) {
eof = xread(s, 1000, std::cin);
}
size_t pos = s.find("\n");
string line;
if (pos == string::npos) {
line = s;
s = "";
} else {
line.assign(s, 0, pos);
s.erase(0, pos+1); // erase a line including the trailing "\n"
}
R.append(line);
}
for (size_t i = 0; i < R.get().size(); i++) {
std::cout << R.get()[i] << std::endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment