Skip to content

Instantly share code, notes, and snippets.

@duruyao
Last active October 7, 2021 03:18
Show Gist options
  • Save duruyao/a3f9363ca011e771983a5e0e758b9139 to your computer and use it in GitHub Desktop.
Save duruyao/a3f9363ca011e771983a5e0e758b9139 to your computer and use it in GitHub Desktop.
Selecting Top-K elements algorithm implemented by C++.
/*
* @author duruyao
* @file top_k_by_BFPRT.cpp
* @desc
* @date 2021-10-02
* @version 1.0
*/
#include <vector>
#include <iostream>
/**
* insertionSort sorts data set by insertion sorting algorithm.
*
* ---time complexity---
*
* best case: O(1)
* average case: O(n ^ 2)
* worst case: O(n ^ 2)
*
* ---when to use---
*
* (1) data set is small OR
* (2) elements of data set are almost ordered OR
* (3) less code is required.
*
* ---parameters and return---
*
* @tparam T is a certain data type
* @param input is a data set to be sorted
* @param begin is the first index
* @param end is the index past the last index
* @param cmpFunc is a function for comparing elements of data set
*/
template<typename T>
void insertionSort(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &)) {
for (auto i = begin + 1; i < end; i++) {
for (auto j = i - 1; j >= begin && cmpFunc(input[j], input[j + 1]) > 0; j--) {
std::swap(input[j], input[j + 1]);
}
}
}
/**
* partition partitions input data set into two parts according to pivot.
*
* ---time complexity---
*
* always: O(n)
*
* ---parameters and return---
*
* @tparam T is a certain data type
* @param input is a data set to be partitioned
* @param begin is the first index
* @param end is the index past the last index
* @param cmpFunc is a function for comparing elements of data set
* @param idx index of pivot before partitioning
* @return index of pivot after partitioning
*/
template<typename T>
int partition(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &), int idx) {
std::swap(input[begin], input[idx]);
auto pivotIdx = begin + 1;
for (int i = pivotIdx; i < end; i++) {
if (cmpFunc(input[i], input[begin]) < 0) {
std::swap(input[i], input[pivotIdx]);
pivotIdx++;
}
}
pivotIdx--;
std::swap(input[begin], input[pivotIdx]);
return pivotIdx;
}
/**
* medianOfMedians selects median of medians.
*
* ---logic---
*
* (1) partition data set into multiple groups, every group has groupSize elements
* (2) build a new data set contains of median of every group
* (3) repeat (1) (2) until number of data set elements <= groupSize
*
* ---time complexity---
*
* worst case: O(n)
*
* ---parameters and return---
*
* @tparam T is a certain data type
* @param input is a data set to be searched
* @param begin is the first index
* @param end is the index past the last index
* @param cmpFunc is a function for comparing elements of data set
* @param groupSize is number of group elements (default 5)
* @return index of median of medians
*/
template<typename T>
int medianOfMedians(std::vector<T> &input, int begin, int end,
int (*cmpFunc)(const T &, const T &), int groupSize = 5) {
if (end - begin <= groupSize) {
insertionSort(input, begin, end, cmpFunc);
return (begin + end - 1) / 2;
}
int left = begin;
for (int i = begin; i < end - groupSize + 1; i += groupSize) {
insertionSort(input, i, i + 5, cmpFunc);
std::swap(input[left], input[i + groupSize / 2]);
left++;
}
return medianOfMedians(input, begin, left, cmpFunc, groupSize);
}
/**
* BFPRT selects the No.K element, and moves Top K elements to [0, index(No.K)].
*
* ---time complexity---
*
* worst case: O(n)
*
* ---when to use---
*
* (1) no extra storage space is allowed.
*
* @tparam T is a certain data type
* @param input is a data set to be searched
* @param begin is the first index
* @param end is the index past the last index
* @param cmpFunc is a function for comparing elements of data set
* @param k is the rank of the element to be selected
* @return index of the No.K element if selecting successfully, otherwise return -1
*/
template<typename T>
int BFPRT(std::vector<T> &input, int begin, int end, int (*cmpFunc)(const T &, const T &), int k) {
if (k <= 0 || k > end || end - begin <= 1) {
return (k - 1 == begin) ? begin : -1;
}
auto pivotIdx = medianOfMedians(input, begin, end, cmpFunc);
pivotIdx = partition(input, begin, end, cmpFunc, pivotIdx);
if (k - 1 < pivotIdx) {
return BFPRT(input, begin, pivotIdx, cmpFunc, k);
} else if (k - 1 > pivotIdx) {
return BFPRT(input, pivotIdx + 1, end, cmpFunc, k);
}
return pivotIdx;
}
int cmpInt(const int &n1, const int &n2) {
return n1 - n2;
}
template<typename T>
std::ostream &operator<<(std::ostream &os, const std::vector<T> &input) {
for (auto &it : input) { os << it << " "; }
return os;
}
int main(int argc, char **argv) {
std::vector<int> array;
for (int i = 0; i < 40; array.push_back(random() % 100), i++);
std::cout << array << std::endl;
auto k = 7;
auto idx = BFPRT(array, 0, (int) array.size(), cmpInt, k); // index of the No.K element
std::cout << array << std::endl;
std::cout << "the No." << k << " element is array[" << idx << "]" << std::endl;
std::cout << std::vector<int>(array.begin(), array.begin() + idx + 1) << std::endl; // the Top K elements
return 0;
}
/*
* @author duruyao
* @file top_k_by_STL_heap.cpp
* @desc
* @date 2021-10-02
* @version 1.0
*/
#include <vector>
#include <iostream>
#include <algorithm>
/**
* selectTopK selects the Top K elements by STL provided heap.
*
* ---time complexity---
*
* best case: O(n * log k)
* average case: O(n * log k)
* worst case: O(n * log k)
*
* ---space complexity---
*
* always: O(k)
*
* ---when to use---
*
* (1) extra storage space is allowed AND
* (2) order of input data set is not allowed changed.
*
* @tparam T is a certain data type
* @param input is a data set to be searched
* @param begin is the first index
* @param end is the index past the last index
* @param cmpFunc is a function for comparing elements of data set
* @param k is the rank of the element to be selected
* @param output is a data set as selecting result
*/
template<typename T>
void selectTopK(const std::vector<T> &input, int begin, int end,
int (*cmpFunc)(const T &, const T &), int k, std::vector<T> &output) {
output.clear();
std::make_heap(output.begin(), output.end()); // build a big top heap
if (k >= end - begin) {
output = input;
} else if (k > 0) {
for (auto &it : input) {
if (output.size() < k) {
output.push_back(it);
std::push_heap(output.begin(), output.end());
} else if (cmpFunc(it, output.front()) < 0) {
std::pop_heap(output.begin(), output.end());
output.pop_back();
output.push_back(it);
std::push_heap(output.begin(), output.end());
}
}
}
}
int cmpInt(const int &n1, const int &n2) {
return n1 - n2;
}
template<typename T>
std::ostream &operator<<(std::ostream &os, const std::vector<T> &input) {
for (auto &it : input) { os << it << " "; }
return os;
}
int main(int argc, char **argv) {
std::vector<int> array, result;
for (int i = 0; i < 40; array.push_back(random() % 100), i++);
std::cout << array << std::endl;
auto k = 7;
selectTopK(array, 0, (int) array.size(), cmpInt, k, result); // select the Top K elements
std::cout << result << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment