Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created April 3, 2019 19:14
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rygorous/422f8d8382eb61a55c087714c92f0d4e to your computer and use it in GitHub Desktop.
Save rygorous/422f8d8382eb61a55c087714c92f0d4e to your computer and use it in GitHub Desktop.
Binary search variants
// Baseline version without prefetch
static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr)
{
LRM_hash_t hash = lrm_hash8(ptr);
// Jump-in: narrow down the search interval using the jump table
LRM_hash_t ji = hash >> lrm->jumpInShift;
S32 jump1 = lrm->jumpIn[ji];
S32 jump2 = lrm->jumpIn[ji + 1];
// Run the binary search, using the code from the LRM searcher
const LRMEntry * it = lrm->entries.data() + jump1;
size_t nleft = jump2 - jump1;
if (nleft > 0)
{
while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
{
const LRMEntry * mid = &it[half];
// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
// as quickly as it could but it has the advantage of being a very simple update.
//
// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
// update rule slightly cheaper, even though it's a bit sloppy.
it = (mid->hash < hash) ? mid : it; // conditional move
nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
}
// The above loop shrank nleft down to 1. Do the final step.
RR_ASSERT(nleft == 1);
it += (it->hash < hash);
}
return it;
}
// Baseline version with prefetch
static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr)
{
LRM_hash_t hash = lrm_hash8(ptr);
// Jump-in: narrow down the search interval using the jump table
LRM_hash_t ji = hash >> lrm->jumpInShift;
S32 jump1 = lrm->jumpIn[ji];
S32 jump2 = lrm->jumpIn[ji + 1];
// Run the binary search, using the code from the LRM searcher
const LRMEntry * it = lrm->entries.data() + jump1;
size_t nleft = jump2 - jump1;
if (nleft > 0)
{
while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
{
const LRMEntry * mid = &it[half];
// Prefetch both possible paths for the next step
RR_PREFETCHR_CL(&it[half >> 1]);
RR_PREFETCHR_CL(&mid[half >> 1]);
// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
// as quickly as it could but it has the advantage of being a very simple update.
//
// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
// update rule slightly cheaper, even though it's a bit sloppy.
it = (mid->hash < hash) ? mid : it; // conditional move
nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
}
// The above loop shrank nleft down to 1. Do the final step.
RR_ASSERT(nleft == 1);
it += (it->hash < hash);
}
return it;
}
// Memory-pipelined: look up in large batches, getting as many independent
// cache misses into the pipe as possible before actually doing any of the
// loads.
template<int N>
static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it)
{
const LRMEntry * entries = lrm->entries.data();
LRM_hash_t hash[N];
size_t nleft[N];
size_t nleft_max = 0;
for (int i = 0; i < N; i++)
{
hash[i] = lrm_hash8(ptr + i);
LRM_hash_t ji = hash[i] >> lrm->jumpInShift;
S32 jump1 = lrm->jumpIn[ji];
S32 jump2 = lrm->jumpIn[ji + 1];
it[i] = entries + jump1;
nleft[i] = jump2 - jump1;
nleft_max = RR_MAX(nleft_max, nleft[i]);
RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
}
// Run binary search iterations while at least one search has a
// non-trivial range left
while (nleft_max > 1)
{
for (int i = 0; i < N; i++)
{
size_t half = nleft[i] >> 1;
const LRMEntry * cur = it[i];
const LRMEntry * mid = cur + half;
it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move
nleft[i] -= half;
RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
}
nleft_max -= nleft_max >> 1;
}
// Finalize the searches
for (int i = 0; i < N; i++)
{
// Final step
RR_ASSERT(nleft[i] <= 1);
it[i] += (it[i]->hash < hash[i]);
}
}
// Results:
// SimpleProf :seconds calls count : clk/call clk/count min
// search_one : 1.9221 1 8388480 : 5581904066.0 665.42 665.42 <- baseline without PF
// search_one_pf : 1.4311 1 8388480 : 4155978905.0 495.44 495.44 <- baseline with PF
// search_multi8_pf2 : 0.8201 1 8388480 : 2381551257.0 283.91 283.91
// search_multi16_pf2 : 0.6135 1 8388480 : 1781563475.0 212.38 212.38
// search_multi32_pf2 : 0.5709 1 8388480 : 1657999650.0 197.65 197.65
// search_multi64_pf2 : 0.5591 1 8388480 : 1623743724.0 193.57 193.57
// search_multi128_pf2: 0.5739 1 8388480 : 1666734087.0 198.69 198.69
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment