Last active
March 5, 2017 09:08
-
-
Save nishidy/498187de1415fea0c6f5b0636e08242f to your computer and use it in GitHub Desktop.
Take TOP N list from large amount of keys in C++11
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -std=c++11 | |
#include <iostream> | |
#include <vector> | |
#include <map> | |
// XXX keys in map are sorted and the time to search value by key is O(logn) | |
#include <unordered_map> | |
// XXX keys in unordered_map are hashed and the time search value by key is O(1) | |
#include <array> | |
#include <algorithm> | |
#include <random> | |
#include <chrono> | |
int SIZE; | |
int CHARS; | |
int TOPN; | |
using namespace std; | |
typedef unsigned int uint; | |
typedef pair<string, int> ass_arr; | |
bool sort_greater(const ass_arr& left, const ass_arr& right){ | |
return left.second > right.second; | |
} | |
int sort_greater_int(const void* left, const void* right){ | |
const ass_arr* l = (const ass_arr*)left; | |
const ass_arr* r = (const ass_arr*)right; | |
if( l->second > r->second) return 1; | |
if( l->second < r->second) return -1; | |
return 0; | |
} | |
void conv_map_to_vec(unordered_map<string,uint>* in, vector<pair<string,uint> >* out){ | |
for(auto itr = in->begin(); itr != in->end(); itr++){ | |
out->push_back(pair<string,int>(itr->first,itr->second)); | |
} | |
} | |
void conv_vec_pair_to_vec(vector<pair<string,uint> >* in, vector<string>& out){ | |
for(auto itr = in->begin(); itr != in->end(); itr++){ | |
out.push_back(itr->first); | |
//cout << itr->first << ":" << itr->second << endl; | |
} | |
} | |
void make_input_data(vector<string>& out){ | |
//random_device rnd; | |
// XXX random_device always get entropy from hardware. | |
// Thus, it is relatively slow. (Or blocked sometimes) | |
mt19937 mt(1); | |
normal_distribution<double> dist(13.0,6.0); | |
for(auto i=0;i<SIZE;i++){ | |
string s; | |
int m; | |
for(auto j=0;j<CHARS;j++){ | |
int m = ((int)dist(mt))%26+65; | |
if(m<65){ | |
m=65; | |
}else if(90<m){ | |
m=90; | |
} | |
s.push_back((char)m); | |
} | |
out.push_back(s); | |
//cout<< s << endl; | |
} | |
//for(auto itr = out->begin(); itr != out->end(); itr++){ | |
// cout<< *itr << endl; | |
//} | |
} | |
void take_top_n_by_sort(vector<string>& indata, vector<string>& refoutdata){ | |
unordered_map<string,uint> outdata; | |
for(auto itr = indata.begin(); itr != indata.end(); ++itr){ | |
if( outdata.find(*itr) != outdata.end() ){ | |
outdata[*itr] += 1; | |
}else{ | |
outdata[*itr] = 1; | |
} | |
} | |
vector<pair<string,uint> > maptovec; | |
conv_map_to_vec(&outdata,&maptovec); | |
auto chrono_start = chrono::system_clock::now(); | |
sort(maptovec.begin(),maptovec.end(),sort_greater); | |
//qsort(&maptovec,maptovec.size(),sizeof(vector<pair<string,uint> >),sort_greater_int); | |
auto chrono_end = chrono::system_clock::now(); | |
/* debug */ | |
int n=TOPN*2; | |
cout << "Show top " << n << " data in take_top_n_by_sort()." << endl; | |
for(auto itr = maptovec.begin(); itr !=maptovec.end(); itr++){ | |
if(n--==0) break; | |
cout << itr->first << ":" << itr->second << endl; | |
} | |
cout << endl; | |
cout << "(Elapsed time for sort() in take_top_n_by_sort() with " << outdata.size() << " keys : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms)" << endl; | |
conv_vec_pair_to_vec(&maptovec,refoutdata); | |
} | |
string take_min_key(unordered_map<string,uint> in){ | |
string k=""; | |
uint v=1<<31; | |
for(auto itr = in.begin(); itr != in.end(); itr++){ | |
if(itr->second<v){ | |
k=itr->first; | |
v=itr->second; | |
} | |
} | |
return k; | |
} | |
void take_top_n_no_sort(vector<string>& indata, vector<string>& refoutdata, uint maxnum){ | |
unordered_map<string,uint> outdata; | |
unordered_map<string,uint> savedata; | |
for(auto itr = indata.begin(); itr != indata.end(); ++itr){ | |
if( outdata.find(*itr) != outdata.end() ){ | |
outdata[*itr] += 1; | |
}else{ | |
outdata[*itr] = 1; | |
} | |
if( savedata.size() < maxnum ){ | |
auto sitr = savedata.find(*itr); | |
if( sitr != savedata.end() ){ | |
sitr->second = outdata[*itr]; | |
}else{ | |
savedata[*itr] = outdata[*itr]; | |
} | |
}else{ | |
string minkey = take_min_key(savedata); | |
if( savedata[minkey] < outdata[*itr] ){ | |
auto sitr = savedata.find(*itr); | |
if( (sitr = savedata.find(*itr)) != savedata.end() ){ | |
sitr->second = outdata[*itr]; | |
}else{ | |
savedata.erase(minkey); | |
savedata[*itr] = outdata[*itr]; | |
} | |
} | |
} | |
} | |
uint i=0; | |
for(auto itr = savedata.begin(); itr != savedata.end(); itr++){ | |
refoutdata[i++] = itr->first; | |
} | |
} | |
void take_top_n_no_sort_with_flag(vector<string>& indata, vector<string>& refoutdata, uint maxnum){ | |
unordered_map<string,uint> outdata; | |
unordered_map<string,uint> savedata; | |
string minkey = ""; | |
bool mincheckflag = true; | |
for(auto itr = indata.begin(); itr != indata.end(); ++itr){ | |
if( outdata.find(*itr) != outdata.end() ){ | |
outdata[*itr] += 1; | |
}else{ | |
outdata[*itr] = 1; | |
} | |
if( savedata.size() < maxnum ){ | |
auto sitr = savedata.find(*itr); | |
if( sitr != savedata.end() ){ | |
sitr->second = outdata[*itr]; | |
}else{ | |
savedata[*itr] = outdata[*itr]; | |
} | |
}else{ | |
if( mincheckflag ) minkey = take_min_key(savedata); | |
if( savedata[minkey] < outdata[*itr] ){ | |
auto sitr = savedata.find(*itr); | |
if( (sitr = savedata.find(*itr)) != savedata.end() ){ | |
sitr->second = outdata[*itr]; | |
}else{ | |
savedata.erase(minkey); | |
savedata[*itr] = outdata[*itr]; | |
} | |
mincheckflag = true; | |
}else{ | |
mincheckflag = false; | |
} | |
} | |
} | |
//cout << "size=" << savedata.size() << endl; | |
uint i=0; | |
for(auto itr = savedata.begin(); itr != savedata.end(); itr++){ | |
refoutdata[i++] = itr->first; | |
} | |
} | |
int main(int argc, char* argv[]){ | |
//vector<string>* indata = new vector<string>(1000); | |
// 1000 elements are already allocated. | |
// Thus, push_back will add an element after 1000 elements. | |
if(argc!=4) exit(1); | |
SIZE = atoi(argv[1]); | |
CHARS = atoi(argv[2]); | |
TOPN = atoi(argv[3]); | |
vector<string> indata; | |
make_input_data(indata); | |
vector<string> outdata; | |
cout << "Parameters:" << endl | |
<< " dataset size = " << SIZE << endl | |
<< " character length = " << CHARS << endl | |
<< " # of top data to show = " << TOPN << endl << endl; | |
////////////////////////// | |
// take_top_n_by_sort() // | |
////////////////////////// | |
auto chrono_start = chrono::system_clock::now(); | |
take_top_n_by_sort(indata,outdata); | |
auto chrono_end = chrono::system_clock::now(); | |
cout << "Elapsed time for take_top_n_by_sort() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl; | |
uint c = TOPN; | |
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){ | |
if(c--==0) break; | |
cout << *itr << endl; | |
} | |
cout << endl; | |
////////////////////////// | |
// take_top_n_no_sort() // | |
////////////////////////// | |
chrono_start = chrono::system_clock::now(); | |
take_top_n_no_sort(indata,outdata,TOPN); | |
chrono_end = chrono::system_clock::now(); | |
cout << "Elapsed time for take_top_n_no_sort() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl; | |
c = TOPN; | |
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){ | |
if(c--==0) break; | |
cout << *itr << endl; | |
} | |
cout << endl; | |
/////////////////////////// | |
// take_top_n_no_sort_with_flag() // | |
/////////////////////////// | |
chrono_start = chrono::system_clock::now(); | |
take_top_n_no_sort_with_flag(indata,outdata,TOPN); | |
chrono_end = chrono::system_clock::now(); | |
cout << "Elapsed time for take_top_n_no_sort_with_flag() : " << chrono::duration_cast<chrono::milliseconds>(chrono_end-chrono_start).count() << "ms" << endl; | |
c = TOPN; | |
for(auto itr = outdata.begin(); itr != outdata.end(); itr++){ | |
if(c--==0) break; | |
cout << *itr << endl; | |
} | |
return 0; | |
} | |
Author
nishidy
commented
Mar 5, 2017
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment