markpapadakis/narrowedsort.cpp

## narrowedsort.cpp
// Assuming you, for example, are going through dates and for each date you collect ids and for each id the number of times
// the object identified by that id (e.g an ad banner, a product, anything) was clicked, or anything like that.
//
// That is, assuming that you are going to have to update the value of an object 1+ times, as opposed to just once (you can use other
// more efficient algorithms to deal with that -- e.g maintain a top-K list as you go, etc), how do you then sort the final list that holds
// (id, value) - that is, for each distinct id, the sum of times it was clicked (or anything)?
//
// Sorting a list that is comprised of 1million distinct IDs is just really slow.
// This is one way to do it, which is particularly efficient if the distribution of values is large.
//
// Say you want to get the top 10 items from that Vector<keywithvalue>


// This will filter kv so that Vector<>::Sort() will only end up sorting, usually, a few dozen values
FilterForSort(10, kv);

// Sort by value DESC
kv.Sort([](const keywithvalue &a, const keywithvalue &b)
{
        return TrivialCmp(b.value, a.value);
});

// And now you have the top 10 most clicked (anything) objects in kv.
//
// This came in handy last night, maybe it can help you too one way or another.


// @upto == offset * limit + limit
// @kv is a vector that holds {key, value}, where key is e.g an identifier and value whatever we are using to sort (e.g total clicks)
static void FilterForSort(const uint32_t upto, Vector<keywithvalue> &kv)
{
        if (unlikely(upto == 0))
        {
                kv.SetNoElements();
                return;
        }

        Uniques seenTimes; // Could have jused a Set<uint32_t> distinctTimes
        Vector<uint32_t> distinctTimes;
        const uint32_t n                = kv.Size();
        const keywithvalue *const all   = kv.Values();

        // As @gmosx suggested, we could have used a prio-queue here instead to collect the top `upto` distinct
        // times (or a similar algorithm). It should squeeze another few ms worth of performance out of this.
        for (uint32_t i = 0; i != n; ++i)
        {
                if (seenTimes.SetUnique(all[i].value))
                        distinctTimes.Append(all[i].value);
        }

        if (distinctTimes.Size() >= upto)
        {
                distinctTimes.Sort([](const uint32_t a, const uint32_t b)
                                {
                                return TrivialCmp(b, a);
                                });

                const uint32_t limit    = distinctTimes[upto - 1];
                const keywithvalue *in  = kv.Values(), *const end = in + n;
                keywithvalue *out       = kv.Values();

                while (unlikely(in != end))
                {
                        if (in->value >= limit)
                                *(out++) = *in;
                        ++in;
                }

                kv.SetTotal(out - kv.Values());
        }
}
	// Assuming you, for example, are going through dates and for each date you collect ids and for each id the number of times
	// the object identified by that id (e.g an ad banner, a product, anything) was clicked, or anything like that.
	//
	// That is, assuming that you are going to have to update the value of an object 1+ times, as opposed to just once (you can use other
	// more efficient algorithms to deal with that -- e.g maintain a top-K list as you go, etc), how do you then sort the final list that holds
	// (id, value) - that is, for each distinct id, the sum of times it was clicked (or anything)?
	//
	// Sorting a list that is comprised of 1million distinct IDs is just really slow.
	// This is one way to do it, which is particularly efficient if the distribution of values is large.
	//
	// Say you want to get the top 10 items from that Vector<keywithvalue>


	// This will filter kv so that Vector<>::Sort() will only end up sorting, usually, a few dozen values
	FilterForSort(10, kv);

	// Sort by value DESC
	kv.Sort([](const keywithvalue &a, const keywithvalue &b)
	{
	return TrivialCmp(b.value, a.value);
	});

	// And now you have the top 10 most clicked (anything) objects in kv.
	//
	// This came in handy last night, maybe it can help you too one way or another.










	// @upto == offset * limit + limit
	// @kv is a vector that holds {key, value}, where key is e.g an identifier and value whatever we are using to sort (e.g total clicks)
	static void FilterForSort(const uint32_t upto, Vector<keywithvalue> &kv)
	{
	if (unlikely(upto == 0))
	{
	kv.SetNoElements();
	return;
	}

	Uniques seenTimes; // Could have jused a Set<uint32_t> distinctTimes
	Vector<uint32_t> distinctTimes;
	const uint32_t n = kv.Size();
	const keywithvalue *const all = kv.Values();

	// As @gmosx suggested, we could have used a prio-queue here instead to collect the top `upto` distinct
	// times (or a similar algorithm). It should squeeze another few ms worth of performance out of this.
	for (uint32_t i = 0; i != n; ++i)
	{
	if (seenTimes.SetUnique(all[i].value))
	distinctTimes.Append(all[i].value);
	}

	if (distinctTimes.Size() >= upto)
	{
	distinctTimes.Sort([](const uint32_t a, const uint32_t b)
	{
	return TrivialCmp(b, a);
	});

	const uint32_t limit = distinctTimes[upto - 1];
	const keywithvalue in = kv.Values(), const end = in + n;
	keywithvalue *out = kv.Values();

	while (unlikely(in != end))
	{
	if (in->value >= limit)
	(out++) = in;
	++in;
	}

	kv.SetTotal(out - kv.Values());
	}
	}