rygorous/gist:422f8d8382eb61a55c087714c92f0d4e

## gistfile1.txt
// Baseline version without prefetch
static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr)
{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
		while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
		{
			const LRMEntry * mid = &it[half];

			// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
			// as quickly as it could but it has the advantage of being a very simple update.
			//
			// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
			// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
			// update rule slightly cheaper, even though it's a bit sloppy.
			it = (mid->hash < hash) ? mid : it; // conditional move
			nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
		}

		// The above loop shrank nleft down to 1. Do the final step.
		RR_ASSERT(nleft == 1);
		it += (it->hash < hash);
	}

	return it;
}

// Baseline version with prefetch
static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr)
{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
		while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
		{
			const LRMEntry * mid = &it[half];

			// Prefetch both possible paths for the next step
			RR_PREFETCHR_CL(&it[half >> 1]);
			RR_PREFETCHR_CL(&mid[half >> 1]);

			// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
			// as quickly as it could but it has the advantage of being a very simple update.
			//
			// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
			// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
			// update rule slightly cheaper, even though it's a bit sloppy.
			it = (mid->hash < hash) ? mid : it; // conditional move
			nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
		}

		// The above loop shrank nleft down to 1. Do the final step.
		RR_ASSERT(nleft == 1);
		it += (it->hash < hash);
	}

	return it;
}

// Memory-pipelined: look up in large batches, getting as many independent
// cache misses into the pipe as possible before actually doing any of the
// loads.
template<int N>
static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it)
{
	const LRMEntry * entries = lrm->entries.data();

	LRM_hash_t hash[N];
	size_t nleft[N];
	size_t nleft_max = 0;

	for (int i = 0; i < N; i++)
	{
		hash[i] = lrm_hash8(ptr + i);
		LRM_hash_t ji = hash[i] >> lrm->jumpInShift;

		S32 jump1 = lrm->jumpIn[ji];
		S32 jump2 = lrm->jumpIn[ji + 1];

		it[i] = entries + jump1;
		nleft[i] = jump2 - jump1;
		nleft_max = RR_MAX(nleft_max, nleft[i]);

		RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
	}

	// Run binary search iterations while at least one search has a
	// non-trivial range left
	while (nleft_max > 1)
	{
		for (int i = 0; i < N; i++)
		{
			size_t half = nleft[i] >> 1;
			const LRMEntry * cur = it[i];
			const LRMEntry * mid = cur + half;

			it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move
			nleft[i] -= half;

			RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
		}

		nleft_max -= nleft_max >> 1;
	}

	// Finalize the searches
	for (int i = 0; i < N; i++)
	{
		// Final step
		RR_ASSERT(nleft[i] <= 1);
		it[i] += (it[i]->hash < hash[i]);
	}
}

// Results:
// SimpleProf         :seconds  calls     count :     clk/call    clk/count      min
// search_one         : 1.9221      1   8388480 : 5581904066.0       665.42   665.42 <- baseline without PF
// search_one_pf      : 1.4311      1   8388480 : 4155978905.0       495.44   495.44 <- baseline with PF
// search_multi8_pf2  : 0.8201      1   8388480 : 2381551257.0       283.91   283.91
// search_multi16_pf2 : 0.6135      1   8388480 : 1781563475.0       212.38   212.38
// search_multi32_pf2 : 0.5709      1   8388480 : 1657999650.0       197.65   197.65
// search_multi64_pf2 : 0.5591      1   8388480 : 1623743724.0       193.57   193.57
// search_multi128_pf2: 0.5739      1   8388480 : 1666734087.0       198.69   198.69
	// Baseline version without prefetch
	static const LRMEntry * lrm_search_one_basic(const LRM * lrm, const U8 * ptr)
	{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
	while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
	{
	const LRMEntry * mid = &it[half];

	// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
	// as quickly as it could but it has the advantage of being a very simple update.
	//
	// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
	// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
	// update rule slightly cheaper, even though it's a bit sloppy.
	it = (mid->hash < hash) ? mid : it; // conditional move
	nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
	}

	// The above loop shrank nleft down to 1. Do the final step.
	RR_ASSERT(nleft == 1);
	it += (it->hash < hash);
	}

	return it;
	}

	// Baseline version with prefetch
	static const LRMEntry * lrm_search_one_prefetch(const LRM * lrm, const U8 * ptr)
	{
	LRM_hash_t hash = lrm_hash8(ptr);

	// Jump-in: narrow down the search interval using the jump table
	LRM_hash_t ji = hash >> lrm->jumpInShift;
	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	// Run the binary search, using the code from the LRM searcher
	const LRMEntry * it = lrm->entries.data() + jump1;
	size_t nleft = jump2 - jump1;

	if (nleft > 0)
	{
	while (size_t half = nleft >> 1) // Loop while >1 elements properly inside [it, it + nleft)
	{
	const LRMEntry * mid = &it[half];

	// Prefetch both possible paths for the next step
	RR_PREFETCHR_CL(&it[half >> 1]);
	RR_PREFETCHR_CL(&mid[half >> 1]);

	// This reduction guarantees the two invariants above. It doesn't shrink nleft quite
	// as quickly as it could but it has the advantage of being a very simple update.
	//
	// Specifically, if (mid->hash < hash), we _could_ set it = mid + 1 and subtract
	// (half + 1) from nleft, but keeping "mid" itself inside the interval makes the
	// update rule slightly cheaper, even though it's a bit sloppy.
	it = (mid->hash < hash) ? mid : it; // conditional move
	nleft -= half; // nleft = (nleft >> 1) + ((nleft + 1) >> 1), so nleft - half = (nleft + 1) >> 1
	}

	// The above loop shrank nleft down to 1. Do the final step.
	RR_ASSERT(nleft == 1);
	it += (it->hash < hash);
	}

	return it;
	}

	// Memory-pipelined: look up in large batches, getting as many independent
	// cache misses into the pipe as possible before actually doing any of the
	// loads.
	template<int N>
	static void lrm_search_multi_pf2(const LRM * lrm, const U8 * ptr, LRMEntry const ** it)
	{
	const LRMEntry * entries = lrm->entries.data();

	LRM_hash_t hash[N];
	size_t nleft[N];
	size_t nleft_max = 0;

	for (int i = 0; i < N; i++)
	{
	hash[i] = lrm_hash8(ptr + i);
	LRM_hash_t ji = hash[i] >> lrm->jumpInShift;

	S32 jump1 = lrm->jumpIn[ji];
	S32 jump2 = lrm->jumpIn[ji + 1];

	it[i] = entries + jump1;
	nleft[i] = jump2 - jump1;
	nleft_max = RR_MAX(nleft_max, nleft[i]);

	RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
	}

	// Run binary search iterations while at least one search has a
	// non-trivial range left
	while (nleft_max > 1)
	{
	for (int i = 0; i < N; i++)
	{
	size_t half = nleft[i] >> 1;
	const LRMEntry * cur = it[i];
	const LRMEntry * mid = cur + half;

	it[i] = (mid->hash < hash[i]) ? mid : cur; // conditional move
	nleft[i] -= half;

	RR_PREFETCHR_32B(&it[i][nleft[i] >> 1]);
	}

	nleft_max -= nleft_max >> 1;
	}

	// Finalize the searches
	for (int i = 0; i < N; i++)
	{
	// Final step
	RR_ASSERT(nleft[i] <= 1);
	it[i] += (it[i]->hash < hash[i]);
	}
	}

	// Results:
	// SimpleProf :seconds calls count : clk/call clk/count min
	// search_one : 1.9221 1 8388480 : 5581904066.0 665.42 665.42 <- baseline without PF
	// search_one_pf : 1.4311 1 8388480 : 4155978905.0 495.44 495.44 <- baseline with PF
	// search_multi8_pf2 : 0.8201 1 8388480 : 2381551257.0 283.91 283.91
	// search_multi16_pf2 : 0.6135 1 8388480 : 1781563475.0 212.38 212.38
	// search_multi32_pf2 : 0.5709 1 8388480 : 1657999650.0 197.65 197.65
	// search_multi64_pf2 : 0.5591 1 8388480 : 1623743724.0 193.57 193.57
	// search_multi128_pf2: 0.5739 1 8388480 : 1666734087.0 198.69 198.69