Skip to content

Instantly share code, notes, and snippets.

@nishidy
Last active March 5, 2017 09:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nishidy/498187de1415fea0c6f5b0636e08242f to your computer and use it in GitHub Desktop.
Save nishidy/498187de1415fea0c6f5b0636e08242f to your computer and use it in GitHub Desktop.
Take TOP N list from large amount of keys in C++11
// -std=c++11
#include <iostream>
#include <vector>
#include <map>
// XXX keys in map are sorted and the time to search value by key is O(logn)
#include <unordered_map>
// XXX keys in unordered_map are hashed and the time search value by key is O(1)
#include <array>
#include <algorithm>
#include <random>
#include <chrono>
int SIZE;
int CHARS;
int TOPN;
using namespace std;
typedef unsigned int uint;
typedef pair<string, int> ass_arr;
bool sort_greater(const ass_arr& left, const ass_arr& right){
return left.second > right.second;
}
int sort_greater_int(const void* left, const void* right){
const ass_arr* l = (const ass_arr*)left;
const ass_arr* r = (const ass_arr*)right;
if( l->second > r->second) return 1;
if( l->second < r->second) return -1;
return 0;
}
void conv_map_to_vec(unordered_map<string,uint>* in, vector<pair<string,uint> >* out){
for(auto itr = in->begin(); itr != in->end(); itr++){
out->push_back(pair<string,int>(itr->first,itr->second));
}
}
void conv_vec_pair_to_vec(vector<pair<string,uint> >* in, vector<string>& out){
for(auto itr = in->begin(); itr != in->end(); itr++){
out.push_back(itr->first);
//cout << itr->first << ":" << itr->second << endl;
}
}
void make_input_data(vector<string>& out){
//random_device rnd;
// XXX random_device always get entropy from hardware.
// Thus, it is relatively slow. (Or blocked sometimes)
mt19937 mt(1);
normal_distribution<double> dist(13.0,6.0);
for(auto i=0;i<SIZE;i++){
string s;
int m;
for(auto j=0;j<CHARS;j++){
int m = ((int)dist(mt))%26+65;
if(m<65){
m=65;
}else if(90<m){
m=90;
}
s.push_back((char)m);
}
out.push_back(s);
//cout<< s << endl;
}
//for(auto itr = out->begin(); itr != out->end(); itr++){
// cout<< *itr << endl;
//}
}
void take_top_n_by_sort(vector<string>& indata, vector<string>& refoutdata){
unordered_map<string,uint> outdata;
for(auto itr = indata.begin(); itr != indata.end(); ++itr){
if( outdata.find(*itr) != outdata.end() ){
outdata[*itr] += 1;
}else{
outdata[*itr] = 1;
}
}
vector<pair<string,uint> > maptovec;
conv_map_to_vec(&outdata,&maptovec);
auto chrono_start = chrono::system_clock::now();
sort(maptovec.begin(),maptovec.end(),sort_greater);
//qsort(&maptovec,maptovec.size(),sizeof(vector<pair<string,uint> >),sort_greater_int);
auto chrono_end = chrono::system_clock::now();
/* debug */
int n=TOPN*2;
cout << "Show top " << n << " data in take_top_n_by_sort()." << endl;
for(auto itr = maptovec.begin(); itr !=maptovec.end(); itr++){
if(n--==0) break;
cout << itr->first << ":" << itr->second << endl;
}
cout << endl;
cout << "(Elapsed time for sort() in take_top_n_by_sort() with " << outdata.size() << " keys : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms)" << endl;
conv_vec_pair_to_vec(&maptovec,refoutdata);
}
string take_min_key(unordered_map<string,uint> in){
string k="";
uint v=1<<31;
for(auto itr = in.begin(); itr != in.end(); itr++){
if(itr->second<v){
k=itr->first;
v=itr->second;
}
}
return k;
}
void take_top_n_no_sort(vector<string>& indata, vector<string>& refoutdata, uint maxnum){
unordered_map<string,uint> outdata;
unordered_map<string,uint> savedata;
for(auto itr = indata.begin(); itr != indata.end(); ++itr){
if( outdata.find(*itr) != outdata.end() ){
outdata[*itr] += 1;
}else{
outdata[*itr] = 1;
}
if( savedata.size() < maxnum ){
auto sitr = savedata.find(*itr);
if( sitr != savedata.end() ){
sitr->second = outdata[*itr];
}else{
savedata[*itr] = outdata[*itr];
}
}else{
string minkey = take_min_key(savedata);
if( savedata[minkey] < outdata[*itr] ){
auto sitr = savedata.find(*itr);
if( (sitr = savedata.find(*itr)) != savedata.end() ){
sitr->second = outdata[*itr];
}else{
savedata.erase(minkey);
savedata[*itr] = outdata[*itr];
}
}
}
}
uint i=0;
for(auto itr = savedata.begin(); itr != savedata.end(); itr++){
refoutdata[i++] = itr->first;
}
}
void take_top_n_no_sort_with_flag(vector<string>& indata, vector<string>& refoutdata, uint maxnum){
unordered_map<string,uint> outdata;
unordered_map<string,uint> savedata;
string minkey = "";
bool mincheckflag = true;
for(auto itr = indata.begin(); itr != indata.end(); ++itr){
if( outdata.find(*itr) != outdata.end() ){
outdata[*itr] += 1;
}else{
outdata[*itr] = 1;
}
if( savedata.size() < maxnum ){
auto sitr = savedata.find(*itr);
if( sitr != savedata.end() ){
sitr->second = outdata[*itr];
}else{
savedata[*itr] = outdata[*itr];
}
}else{
if( mincheckflag ) minkey = take_min_key(savedata);
if( savedata[minkey] < outdata[*itr] ){
auto sitr = savedata.find(*itr);
if( (sitr = savedata.find(*itr)) != savedata.end() ){
sitr->second = outdata[*itr];
}else{
savedata.erase(minkey);
savedata[*itr] = outdata[*itr];
}
mincheckflag = true;
}else{
mincheckflag = false;
}
}
}
//cout << "size=" << savedata.size() << endl;
uint i=0;
for(auto itr = savedata.begin(); itr != savedata.end(); itr++){
refoutdata[i++] = itr->first;
}
}
int main(int argc, char* argv[]){
//vector<string>* indata = new vector<string>(1000);
// 1000 elements are already allocated.
// Thus, push_back will add an element after 1000 elements.
if(argc!=4) exit(1);
SIZE = atoi(argv[1]);
CHARS = atoi(argv[2]);
TOPN = atoi(argv[3]);
vector<string> indata;
make_input_data(indata);
vector<string> outdata;
cout << "Parameters:" << endl
<< " dataset size = " << SIZE << endl
<< " character length = " << CHARS << endl
<< " # of top data to show = " << TOPN << endl << endl;
//////////////////////////
// take_top_n_by_sort() //
//////////////////////////
auto chrono_start = chrono::system_clock::now();
take_top_n_by_sort(indata,outdata);
auto chrono_end = chrono::system_clock::now();
cout << "Elapsed time for take_top_n_by_sort() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl;
uint c = TOPN;
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){
if(c--==0) break;
cout << *itr << endl;
}
cout << endl;
//////////////////////////
// take_top_n_no_sort() //
//////////////////////////
chrono_start = chrono::system_clock::now();
take_top_n_no_sort(indata,outdata,TOPN);
chrono_end = chrono::system_clock::now();
cout << "Elapsed time for take_top_n_no_sort() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl;
c = TOPN;
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){
if(c--==0) break;
cout << *itr << endl;
}
cout << endl;
///////////////////////////
// take_top_n_no_sort_with_flag() //
///////////////////////////
chrono_start = chrono::system_clock::now();
take_top_n_no_sort_with_flag(indata,outdata,TOPN);
chrono_end = chrono::system_clock::now();
cout << "Elapsed time for take_top_n_no_sort_with_flag() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl;
c = TOPN;
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){
if(c--==0) break;
cout << *itr << endl;
}
return 0;
}
@nishidy
Copy link
Author

nishidy commented Mar 5, 2017

$ sw_vers 
ProductName:	Mac OS X
ProductVersion:	10.10.5
BuildVersion:	14F1909
$ g++ take_top_n.cpp -std=c++11 -o take_top_n -O3
$ ./take_top_n 10000000 5 5
Parameters:
  dataset size = 10000000
  character length = 5
  # of top data to show = 5

Show top 10 data in take_top_n_by_sort().
LMKNK:27
LMOOM:27
HLLMM:26
MOKOO:25
NPNKH:24
MMPLM:24
NQMPM:23
NOPLK:23
MNPNP:23
NJOPN:22

(Elapsed time for sort() in take_top_n_by_sort() with 4787637 keys : 327ms)
Elapsed time for take_top_n_by_sort() : 7835ms
LMKNK
LMOOM
HLLMM
MOKOO
NPNKH

Elapsed time for take_top_n_no_sort() : 18567ms
NPNKH
HLLMM
LMKNK
LMOOM
MOKOO

Elapsed time for take_top_n_no_sort_with_flag() : 7490ms
NPNKH
HLLMM
LMKNK
LMOOM
MOKOO

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment