maxwellpirtle/simd_sample.metal

## simd_sample.metal
/// Integer ceiling division: the quotient `a / b`, rounded up
/// whenever the division leaves a remainder.
///
/// @param a The dividend
/// @param b The divisor; the code performs no zero check
///
/// @returns `a / b` when `b` divides `a` evenly, otherwise `a / b + 1`
ushort
divceil(ushort a, ushort b) {
    const ushort quotient = a / b;
    if (a % b != 0) {
        return quotient + 1;
    }
    return quotient;
}

/// Quickly samples a (2N + 1) x (2N + 1) region for each
/// thread
///
/// This is a generalization of the sampling
/// described in "Discover Advances in Metal for A15 Bionic"
/// to reduce the number of texture samples required to
/// apply a 5x5 compute kernel. The sampling uses the
/// simdgroup shuffle instructions introduced in Metal 2.4
/// to share sample data among threads. Note that with a chunk
/// width of four the corresponding quadgroup instructions could
/// be used, but for simplicity we always use the simdgroup
/// variant instead
///
/// @tparam sample_channel: Currently unused by the body; the red
/// channel (`.r`) of every sample is always the one read
/// @tparam xhalf: The scalar type of the texture and of the result
/// @tparam N: The "radius" of the sampled region (defaults to 3,
/// i.e. a 7 x 7 region)
///
/// @param threads_per_threadgroup: The threadgroup execution width;
/// only the `x` component is read
/// @param thread_position_in_grid: The position of the thread in
/// the dispatch
/// @param threads_per_simdgroup: The number of threads
/// in each simdgroup
/// @param texture: The texture to sample an (2N + 1) x (2N + 1)
/// region from
/// @param sampler: The sampler used to read values from the texture
///
/// @returns: A (2N + 1) x (2N + 1) matrix of `float` or `half` containing
/// the sampled texture value relative to the pixel at grid position
/// `thread_position_in_grid`, where the "center" (i.e. element at position
/// (N, N)) corresponds to the pixel at `thread_position_in_grid`, the value
/// at (N + 1, N) corresponds to the pixel to the right of
/// `thread_position_in_grid`, etc.
///
/// Invariant: The width of the threadgroup divides the number of threads in each
/// simdgroup and is at most one simdgroup wide; i.e.,
///
///     threads_per_simdgroup >= threads_per_threadgroup.x &&
///     threads_per_simdgroup % threads_per_threadgroup.x == 0
///
template<texture_channel sample_channel, typename xhalf, int N = 3>
matrix<xhalf, 2 * N + 1>
simd_fast_sample(ushort2 threads_per_threadgroup,
                 ushort2 thread_position_in_grid,
                 ushort threads_per_simdgroup,
                 texture2d<xhalf, access::sample> texture,
                 sampler sampler)
{
    constexpr int K = 2 * N + 1;
    matrix<xhalf, K> result;

    // Note that if 2N / chunksize.width > 5,
    // we'll have an index OOB. Ideally we'd
    // want to prevent this with a compilation error,
    // but a size of 5 suffices for most cases; e.g.,
    // with a simdgroup chunk size of 4, a 5x5
    // matrix would suffice up to N = 10, or a 21 x 21 read!
    //
    // Only the entries written in Step 1 hold defined values;
    // the remainder of the map is left uninitialized.
    matrix<xhalf, 5> sampleMap;

    constexpr int2 globalOffset = int2(N, N);

    // A "simd chunk" is one row of the threadgroup; the simdgroup is
    // treated as a stack of such rows.
    const ushort threads_per_simd_chunk = threads_per_threadgroup.x;
    const ushort simd_chunks_per_simdgroup = threads_per_simdgroup / threads_per_simd_chunk;

    // Number of samples each thread takes along each axis. Samples are
    // spaced one chunk width apart in x and one chunk count apart in y.
    const ushort simd_chunk_samplesX = 1 + divceil((ushort)2 * N, threads_per_simd_chunk);
    const ushort simd_chunk_samplesY = 1 + divceil((ushort)2 * N, simd_chunks_per_simdgroup);

    // Step 1: Fill the sample map based on the simd_chunk_size
    //
    // `i` walks the rows (y) and `j` walks the columns (x), so `i` is
    // bounded by the Y sample count and `j` by the X sample count.
    for (int i = 0; i < simd_chunk_samplesY; i++) {
        for (int j = 0; j < simd_chunk_samplesX; j++) {
            const int2 chunkSampleOffset = int2(j * threads_per_simd_chunk, i * simd_chunks_per_simdgroup);

            // Metal matrices are specified as column-major
            sampleMap[j][i] = texture.sample(sampler, float2(thread_position_in_grid), chunkSampleOffset - globalOffset).r;
        }
    }

    // Step 2. Fill the first row of the matrix
    // for subsequent processing by the remainder
    // of the algorithm
    //
    // NOTE(review): `2 * j / threads_per_simd_chunk + 1` can exceed the
    // highest column written in Step 1 (e.g. N = 3 with a chunk width
    // of 4 reads column 4 while Step 1 writes columns 0...2), and `j`
    // is passed as the shuffle delta without being reduced modulo the
    // chunk width. Confirm the intended indexing.
    for(int j = 0; j < K; j++) {
        const int sampleMapLocX = 2 * j / threads_per_simd_chunk;
        result[j][0] = simd_shuffle_and_fill_down(sampleMap[sampleMapLocX][0],
                                                  sampleMap[sampleMapLocX + 1][0],
                                                  j, threads_per_simd_chunk);
    }

    // Step 3. Derive every remaining row from the row above it: shuffle
    // the previous row down by one chunk width and fill the vacated
    // lanes from the sample map.
    for(int i = 1; i < K; i++) {
        for(int j = 0; j < K; j++) {
            const int sampleMapLocX = 2 * j / threads_per_simd_chunk;
            const int sampleMapLocY = 2 * i / simd_chunks_per_simdgroup;
            const xhalf fill = simd_shuffle_and_fill_down(sampleMap[sampleMapLocX][sampleMapLocY + 1],
                                                          sampleMap[sampleMapLocX + 1][sampleMapLocY + 1],
                                                          j, threads_per_simd_chunk);
            result[j][i] = simd_shuffle_and_fill_down(result[j][i - 1], fill, threads_per_simd_chunk);
        }
    }

    return result;
}
	/// Integer ceiling division: the quotient `a / b`, rounded up
	/// whenever the division leaves a remainder.
	///
	/// @param a The dividend
	/// @param b The divisor; the code performs no zero check
	///
	/// @returns `a / b` when `b` divides `a` evenly, otherwise `a / b + 1`
	ushort
	divceil(ushort a, ushort b) {
	    const ushort quotient = a / b;
	    if (a % b != 0) {
	        return quotient + 1;
	    }
	    return quotient;
	}

	/// Quickly samples a (2N + 1) x (2N + 1) region for each
	/// thread
	///
	/// This is a generalization of the sampling
	/// described in "Discover Advances in Metal for A15 Bionic"
	/// to reduce the number of texture samples required to
	/// apply a 5x5 compute kernel. The sampling uses the
	/// simdgroup shuffle instructions introduced in Metal 2.4
	/// to share sample data among threads. Note that with a chunk
	/// width of four the corresponding quadgroup instructions could
	/// be used, but for simplicity we always use the simdgroup
	/// variant instead
	///
	/// @tparam sample_channel: Currently unused by the body; the red
	/// channel (`.r`) of every sample is always the one read
	/// @tparam xhalf: The scalar type of the texture and of the result
	/// @tparam N: The "radius" of the sampled region (defaults to 3,
	/// i.e. a 7 x 7 region)
	///
	/// @param threads_per_threadgroup: The threadgroup execution width;
	/// only the `x` component is read
	/// @param thread_position_in_grid: The position of the thread in
	/// the dispatch
	/// @param threads_per_simdgroup: The number of threads
	/// in each simdgroup
	/// @param texture: The texture to sample an (2N + 1) x (2N + 1)
	/// region from
	/// @param sampler: The sampler used to read values from the texture
	///
	/// @returns: A (2N + 1) x (2N + 1) matrix of `float` or `half` containing
	/// the sampled texture value relative to the pixel at grid position
	/// `thread_position_in_grid`, where the "center" (i.e. element at position
	/// (N, N)) corresponds to the pixel at `thread_position_in_grid`, the value
	/// at (N + 1, N) corresponds to the pixel to the right of
	/// `thread_position_in_grid`, etc.
	///
	/// Invariant: The width of the threadgroup divides the number of threads in each
	/// simdgroup and is at most one simdgroup wide; i.e.,
	///
	///     threads_per_simdgroup >= threads_per_threadgroup.x &&
	///     threads_per_simdgroup % threads_per_threadgroup.x == 0
	///
	template<texture_channel sample_channel, typename xhalf, int N = 3>
	matrix<xhalf, 2 * N + 1>
	simd_fast_sample(ushort2 threads_per_threadgroup,
	                 ushort2 thread_position_in_grid,
	                 ushort threads_per_simdgroup,
	                 texture2d<xhalf, access::sample> texture,
	                 sampler sampler)
	{
	    constexpr int K = 2 * N + 1;
	    matrix<xhalf, K> result;

	    // Note that if 2N / chunksize.width > 5,
	    // we'll have an index OOB. Ideally we'd
	    // want to prevent this with a compilation error,
	    // but a size of 5 suffices for most cases; e.g.,
	    // with a simdgroup chunk size of 4, a 5x5
	    // matrix would suffice up to N = 10, or a 21 x 21 read!
	    //
	    // Only the entries written in Step 1 hold defined values;
	    // the remainder of the map is left uninitialized.
	    matrix<xhalf, 5> sampleMap;

	    constexpr int2 globalOffset = int2(N, N);

	    // A "simd chunk" is one row of the threadgroup; the simdgroup is
	    // treated as a stack of such rows.
	    const ushort threads_per_simd_chunk = threads_per_threadgroup.x;
	    const ushort simd_chunks_per_simdgroup = threads_per_simdgroup / threads_per_simd_chunk;

	    // Number of samples each thread takes along each axis. Samples are
	    // spaced one chunk width apart in x and one chunk count apart in y.
	    const ushort simd_chunk_samplesX = 1 + divceil((ushort)2 * N, threads_per_simd_chunk);
	    const ushort simd_chunk_samplesY = 1 + divceil((ushort)2 * N, simd_chunks_per_simdgroup);

	    // Step 1: Fill the sample map based on the simd_chunk_size
	    //
	    // `i` walks the rows (y) and `j` walks the columns (x), so `i` is
	    // bounded by the Y sample count and `j` by the X sample count.
	    for (int i = 0; i < simd_chunk_samplesY; i++) {
	        for (int j = 0; j < simd_chunk_samplesX; j++) {
	            const int2 chunkSampleOffset = int2(j * threads_per_simd_chunk, i * simd_chunks_per_simdgroup);

	            // Metal matrices are specified as column-major
	            sampleMap[j][i] = texture.sample(sampler, float2(thread_position_in_grid), chunkSampleOffset - globalOffset).r;
	        }
	    }

	    // Step 2. Fill the first row of the matrix
	    // for subsequent processing by the remainder
	    // of the algorithm
	    //
	    // NOTE(review): `2 * j / threads_per_simd_chunk + 1` can exceed the
	    // highest column written in Step 1 (e.g. N = 3 with a chunk width
	    // of 4 reads column 4 while Step 1 writes columns 0...2), and `j`
	    // is passed as the shuffle delta without being reduced modulo the
	    // chunk width. Confirm the intended indexing.
	    for(int j = 0; j < K; j++) {
	        const int sampleMapLocX = 2 * j / threads_per_simd_chunk;
	        result[j][0] = simd_shuffle_and_fill_down(sampleMap[sampleMapLocX][0],
	                                                  sampleMap[sampleMapLocX + 1][0],
	                                                  j, threads_per_simd_chunk);
	    }

	    // Step 3. Derive every remaining row from the row above it: shuffle
	    // the previous row down by one chunk width and fill the vacated
	    // lanes from the sample map.
	    for(int i = 1; i < K; i++) {
	        for(int j = 0; j < K; j++) {
	            const int sampleMapLocX = 2 * j / threads_per_simd_chunk;
	            const int sampleMapLocY = 2 * i / simd_chunks_per_simdgroup;
	            const xhalf fill = simd_shuffle_and_fill_down(sampleMap[sampleMapLocX][sampleMapLocY + 1],
	                                                          sampleMap[sampleMapLocX + 1][sampleMapLocY + 1],
	                                                          j, threads_per_simd_chunk);
	            result[j][i] = simd_shuffle_and_fill_down(result[j][i - 1], fill, threads_per_simd_chunk);
	        }
	    }

	    return result;
	}