Last active
October 7, 2021 03:18
-
-
Save duruyao/a3f9363ca011e771983a5e0e758b9139 to your computer and use it in GitHub Desktop.
Selecting Top-K elements algorithm implemented by C++.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* @author duruyao | |
* @file top_k_by_BFPRT.cpp | |
* @desc | |
* @date 2021-10-02 | |
* @version 1.0 | |
*/ | |
#include <vector> | |
#include <iostream> | |
/** | |
* insertionSort sorts data set by insertion sorting algorithm. | |
* | |
* ---time complexity--- | |
* | |
* best case: O(1) | |
* average case: O(n ^ 2) | |
* worst case: O(n ^ 2) | |
* | |
* ---when to use--- | |
* | |
* (1) data set is small OR | |
* (2) elements of data set are almost ordered OR | |
* (3) less code is required. | |
* | |
* ---parameters and return--- | |
* | |
* @tparam T is a certain data type | |
* @param input is a data set to be sorted | |
* @param begin is the first index | |
* @param end is the index past the last index | |
* @param cmpFunc is a function for comparing elements of data set | |
*/ | |
template<typename T> | |
void insertionSort(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &)) { | |
for (auto i = begin + 1; i < end; i++) { | |
for (auto j = i - 1; j >= begin && cmpFunc(input[j], input[j + 1]) > 0; j--) { | |
std::swap(input[j], input[j + 1]); | |
} | |
} | |
} | |
/** | |
* partition partitions input data set into two parts according to pivot. | |
* | |
* ---time complexity--- | |
* | |
* always: O(n) | |
* | |
* ---parameters and return--- | |
* | |
* @tparam T is a certain data type | |
* @param input is a data set to be partitioned | |
* @param begin is the first index | |
* @param end is the index past the last index | |
* @param cmpFunc is a function for comparing elements of data set | |
* @param idx index of pivot before partitioning | |
* @return index of pivot after partitioning | |
*/ | |
template<typename T> | |
int partition(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &), int idx) { | |
std::swap(input[begin], input[idx]); | |
auto pivotIdx = begin + 1; | |
for (int i = pivotIdx; i < end; i++) { | |
if (cmpFunc(input[i], input[begin]) < 0) { | |
std::swap(input[i], input[pivotIdx]); | |
pivotIdx++; | |
} | |
} | |
pivotIdx--; | |
std::swap(input[begin], input[pivotIdx]); | |
return pivotIdx; | |
} | |
/** | |
* medianOfMedians selects median of medians. | |
* | |
* ---logic--- | |
* | |
* (1) partition data set into multiple groups, every group has groupSize elements | |
* (2) build a new data set contains of median of every group | |
* (3) repeat (1) (2) until number of data set elements <= groupSize | |
* | |
* ---time complexity--- | |
* | |
* worst case: O(n) | |
* | |
* ---parameters and return--- | |
* | |
* @tparam T is a certain data type | |
* @param input is a data set to be searched | |
* @param begin is the first index | |
* @param end is the index past the last index | |
* @param cmpFunc is a function for comparing elements of data set | |
* @param groupSize is number of group elements (default 5) | |
* @return index of median of medians | |
*/ | |
template<typename T> | |
int medianOfMedians(std::vector<T> &input, int begin, int end, | |
int (*cmpFunc)(const T &, const T &), int groupSize = 5) { | |
if (end - begin <= groupSize) { | |
insertionSort(input, begin, end, cmpFunc); | |
return (begin + end - 1) / 2; | |
} | |
int left = begin; | |
for (int i = begin; i < end - groupSize + 1; i += groupSize) { | |
insertionSort(input, i, i + 5, cmpFunc); | |
std::swap(input[left], input[i + groupSize / 2]); | |
left++; | |
} | |
return medianOfMedians(input, begin, left, cmpFunc, groupSize); | |
} | |
/** | |
* BFPRT selects the No.K element, and moves Top K elements to [0, index(No.K)]. | |
* | |
* ---time complexity--- | |
* | |
* worst case: O(n) | |
* | |
* ---when to use--- | |
* | |
* (1) no extra storage space is allowed. | |
* | |
* @tparam T is a certain data type | |
* @param input is a data set to be searched | |
* @param begin is the first index | |
* @param end is the index past the last index | |
* @param cmpFunc is a function for comparing elements of data set | |
* @param k is the rank of the element to be selected | |
* @return index of the No.K element if selecting successfully, otherwise return -1 | |
*/ | |
template<typename T> | |
int BFPRT(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &), int k) { | |
if (k <= 0 || k > end || end - begin <= 1) { | |
return (k - 1 == begin) ? begin : -1; | |
} | |
auto pivotIdx = medianOfMedians(input, begin, end, cmpFunc); | |
pivotIdx = partition(input, begin, end, cmpFunc, pivotIdx); | |
if (k - 1 < pivotIdx) { | |
return BFPRT(input, begin, pivotIdx, cmpFunc, k); | |
} else if (k - 1 > pivotIdx) { | |
return BFPRT(input, pivotIdx + 1, end, cmpFunc, k); | |
} | |
return pivotIdx; | |
} | |
int cmpInt(const int &n1, const int &n2) { | |
return n1 - n2; | |
} | |
template<typename T> | |
std::ostream &operator<<(std::ostream &os, const std::vector<T> &input) { | |
for (auto &it : input) { os << it << " "; } | |
return os; | |
} | |
int main(int argc, char **argv) { | |
std::vector<int> array; | |
for (int i = 0; i < 40; array.push_back(random() % 100), i++); | |
std::cout << array << std::endl; | |
auto k = 7; | |
auto idx = BFPRT(array, 0, (int) array.size(), cmpInt, k); // index of the No.K element | |
std::cout << array << std::endl; | |
std::cout << "the No." << k << " element is array[" << idx << "]" << std::endl; | |
std::cout << std::vector<int>(array.begin(), array.begin() + idx + 1) << std::endl; // the Top K elements | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* @author duruyao | |
* @file top_k_by_STL_heap.cpp | |
* @desc | |
* @date 2021-10-02 | |
* @version 1.0 | |
*/ | |
#include <vector> | |
#include <iostream> | |
#include <algorithm> | |
/** | |
* selectTopK selects the Top K elements by STL provided heap. | |
* | |
* ---time complexity--- | |
* | |
* best case: O(n * log k) | |
* average case: O(n * log k) | |
* worst case: O(n * log k) | |
* | |
* ---space complexity--- | |
* | |
* always: O(k) | |
* | |
* ---when to use--- | |
* | |
* (1) extra storage space is allowed AND | |
* (2) order of input data set is not allowed changed. | |
* | |
* @tparam T is a certain data type | |
* @param input is a data set to be searched | |
* @param begin is the first index | |
* @param end is the index past the last index | |
* @param cmpFunc is a function for comparing elements of data set | |
* @param k is the rank of the element to be selected | |
* @param output is a data set as selecting result | |
*/ | |
template<typename T> | |
void selectTopK(const std::vector<T> &input, int begin, int end, | |
int (*cmpFunc)(const T &, const T &), int k, std::vector<T> &output) { | |
output.clear(); | |
std::make_heap(output.begin(), output.end()); // build a big top heap | |
if (k >= end - begin) { | |
output = input; | |
} else if (k > 0) { | |
for (auto &it : input) { | |
if (output.size() < k) { | |
output.push_back(it); | |
std::push_heap(output.begin(), output.end()); | |
} else if (cmpFunc(it, output.front()) < 0) { | |
std::pop_heap(output.begin(), output.end()); | |
output.pop_back(); | |
output.push_back(it); | |
std::push_heap(output.begin(), output.end()); | |
} | |
} | |
} | |
} | |
int cmpInt(const int &n1, const int &n2) { | |
return n1 - n2; | |
} | |
template<typename T> | |
std::ostream &operator<<(std::ostream &os, const std::vector<T> &input) { | |
for (auto &it : input) { os << it << " "; } | |
return os; | |
} | |
int main(int argc, char **argv) { | |
std::vector<int> array, result; | |
for (int i = 0; i < 40; array.push_back(random() % 100), i++); | |
std::cout << array << std::endl; | |
auto k = 7; | |
selectTopK(array, 0, (int) array.size(), cmpInt, k, result); // select the Top K elements | |
std::cout << result << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment