Skip to content

Instantly share code, notes, and snippets.

@jaseemabid
Created May 30, 2024 13:25
Show Gist options
  • Save jaseemabid/c664a18d42935e125a79c8a99907347b to your computer and use it in GitHub Desktop.
Save jaseemabid/c664a18d42935e125a79c8a99907347b to your computer and use it in GitHub Desktop.
/*! Top K elements of an unordered list
There are 3 ways to get the top k elements from an unsorted list.
1. Build a heap with all the elements and pop k times
2. Sort the whole list and return a slice from the end
3. Quick select k elements
Naively, I would expect 1 to be slowest, 2 to be faster and 3 the fastest.
Results:
```ignore
$ cargo bench topk
test topk::test::bench_heap ... bench: 1,434,903 ns/iter (+/- 379,674)
test topk::test::bench_qselect ... bench: 94,129 ns/iter (+/- 27,035)
test topk::test::bench_sort ... bench: 60,324 ns/iter (+/- 12,335)
```
🤯🤯🤯🤯🤯
A slow heap is understandable since its doing a lot of additional work under the
hood. My implementation of quick select seems to be doing very close to the
minimal work required, but the stdlib sort is SO MUCH FASTER and well optimized,
that sorting beats both by a very good margin.
Great TIL.
*/
use std::collections::BinaryHeap;
// Top k elements with a heap
pub fn top_k_heap(nums: &[i32], k: i32) -> Vec<i32> {
let mut heap = BinaryHeap::<i32>::new();
for n in nums {
heap.push(*n);
}
// NOTE: Heap iter() order is random, so that's not what we want here.
(0..k).map(|_| heap.pop().unwrap()).collect()
}
// Top k elements with quick select
pub fn quickselect(nums: &mut [i32], k: i32) -> &[i32] {
qselect(nums, 0, nums.len() - 1, k as usize)
}
// Sorts a (portion of an) array, divides it into partitions, then sorts those
//
// <https://en.wikipedia.org/wiki/Quicksort#Hoare_partition_scheme>
pub fn quicksort(nums: &mut [i32]) {
qsort(nums, 0, nums.len() - 1)
}
fn qselect(nums: &mut [i32], lo: usize, hi: usize, k: usize) -> &[i32] {
let boundary = nums.len() - k; // nums[boundary..] is the final result
if lo >= hi {
return &nums[nums.len() - k..];
}
let p = partition(nums, lo, hi);
// Instead of partitioning inside the boundary, sort the last known subsection
// with top k elements (lo..hi) inclusive and return sub range.
if p >= boundary {
qsort(nums, lo, nums.len() - 1);
&nums[boundary..]
} else if boundary < p {
qselect(nums, lo, p - 1, k) // Go left
} else {
qselect(nums, p + 1, hi, k) // Go right
}
}
fn qsort(nums: &mut [i32], lo: usize, hi: usize) {
if lo < hi {
let p = partition(nums, lo, hi);
qsort(nums, lo, p); // Note: the pivot is now included
qsort(nums, p + 1, hi);
}
}
// Divides array into two partitions
fn partition(nums: &mut [i32], lo: usize, hi: usize) -> usize {
let pivot = nums[(hi + lo).div_euclid(2)]; // The value in the middle of the array
// The original partition scheme described by Tony Hoare uses two pointers
// (indices into the range) that start at both ends of the array being
// partitioned, then move toward each other, until they detect an inversion: a
// pair of elements, one greater than the bound (Hoare's terms for the pivot
// value) at the first pointer, and one less than the bound at the second
// pointer.
let mut i: i32 = lo as i32 - 1; // Left index. NOTE: this could be negative since i32 instead of usize
let mut j: i32 = hi as i32 + 1; // Right index.
loop {
// Move the left index to the right at least once and while the element at
// the left index is less than the pivot
i += 1;
while nums[i as usize] < pivot {
i += 1;
}
// Move the right index to the left at least once and while the element at
// the right index is greater than the pivot
j -= 1;
while nums[j as usize] > pivot {
j -= 1;
}
// If the indices crossed, return
if i >= j {
return j as usize;
}
// Swap the elements at the left and right indices
nums.swap(i as usize, j as usize)
}
}
#[cfg(test)]
mod test {
extern crate test;
use {super::*, rand::prelude::random, std::i32, test::Bencher};
fn rand(n: usize) -> Vec<i32> {
(0..n).map(|_| random()).collect()
}
#[test]
fn sort_simple() {
let mut a = vec![0, 1, 2, 4, 1, 4, 9, 7, 0];
let b = vec![0, 0, 1, 1, 2, 4, 4, 7, 9];
quicksort(&mut a);
assert_eq!(b, a);
}
#[test]
fn sort_random() {
let mut a = rand(100);
let mut b = a.clone();
b.sort_unstable();
quicksort(&mut a);
assert_eq!(b, a);
}
#[test]
fn select_simple() {
let mut a = vec![9, 0, 1, 2, 4, 1, 4, 9, 7, 0, 1, 2, 3];
let len = a.len();
assert_eq!(vec![7, 9, 9], qselect(&mut a, 0, len - 1, 3));
}
#[test]
fn select_random() {
let mut a = rand(100);
let mut b = a.clone();
let len = a.len();
b.sort_unstable();
assert_eq!(&b[95..100], qselect(&mut a, 0, len - 1, 5));
}
#[bench]
fn bench_sort(b: &mut Bencher) {
let mut a = rand(100_000);
b.iter(|| {
let l = a.len();
assert_eq!(l, 100_000);
a.sort_unstable();
// Sum up the results to make sure its used. i32 would overflow and panic, so
// cast to i64. Would be nice if I could do this without an extra
// map.
a[l - 5..].iter().map(|i| *i as i64).sum::<i64>()
});
test::black_box(());
}
#[bench]
fn bench_qselect(b: &mut Bencher) {
let mut a = rand(100_000);
b.iter(|| {
let r = quickselect(&mut a, 5);
r.iter().map(|i| *i as i64).sum::<i64>()
});
test::black_box(());
}
#[bench]
fn bench_heap(b: &mut Bencher) {
let a = rand(100_000);
b.iter(|| {
let r = top_k_heap(&a, 5);
r.iter().map(|i| *i as i64).sum::<i64>()
});
test::black_box(());
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment