Last active
February 14, 2020 00:20
-
-
Save markpapadakis/10960554 to your computer and use it in GitHub Desktop.
Fast sort by elimination
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Assuming you, for example, are going through dates and for each date you collect ids and for each id the number of times | |
// the object identified by that id (e.g an ad banner, a product, anything) was clicked, or anything like that. | |
// | |
// That is, assuming that you are going to have to update the value of an object 1+ times, as opposed to just once (you can use other | |
// more efficient algorithms to deal with that -- e.g maintain a top-K list as you go, etc), how do you then sort the final list that holds | |
// (id, value) - that is, for each distinct id, the sum of times it was clicked (or anything)? | |
// | |
// Sorting a list that is comprised of 1million distinct IDs is just really slow. | |
// This is one way to do it, which is particularly efficient if the distribution of values is large. | |
// | |
// Say you want to get the top 10 items from that Vector<keywithvalue> | |
// This will filter kv so that Vector<>::Sort() will only end up sorting, usually, a few dozen values | |
FilterForSort(10, kv); | |
// Sort by value DESC | |
kv.Sort([](const keywithvalue &a, const keywithvalue &b) | |
{ | |
return TrivialCmp(b.value, a.value); | |
}); | |
// And now you have the top 10 most clicked (anything) objects in kv. | |
// | |
// This came in handy last night, maybe it can help you too one way or another. | |
// @upto == offset * limit + limit | |
// @kv is a vector that holds {key, value}, where key is e.g an identifier and value whatever we are using to sort (e.g total clicks) | |
static void FilterForSort(const uint32_t upto, Vector<keywithvalue> &kv) | |
{ | |
if (unlikely(upto == 0)) | |
{ | |
kv.SetNoElements(); | |
return; | |
} | |
Uniques seenTimes; // Could have jused a Set<uint32_t> distinctTimes | |
Vector<uint32_t> distinctTimes; | |
const uint32_t n = kv.Size(); | |
const keywithvalue *const all = kv.Values(); | |
// As @gmosx suggested, we could have used a prio-queue here instead to collect the top `upto` distinct | |
// times (or a similar algorithm). It should squeeze another few ms worth of performance out of this. | |
for (uint32_t i = 0; i != n; ++i) | |
{ | |
if (seenTimes.SetUnique(all[i].value)) | |
distinctTimes.Append(all[i].value); | |
} | |
if (distinctTimes.Size() >= upto) | |
{ | |
distinctTimes.Sort([](const uint32_t a, const uint32_t b) | |
{ | |
return TrivialCmp(b, a); | |
}); | |
const uint32_t limit = distinctTimes[upto - 1]; | |
const keywithvalue *in = kv.Values(), *const end = in + n; | |
keywithvalue *out = kv.Values(); | |
while (unlikely(in != end)) | |
{ | |
if (in->value >= limit) | |
*(out++) = *in; | |
++in; | |
} | |
kv.SetTotal(out - kv.Values()); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment