Skip to content

Instantly share code, notes, and snippets.

@masahi
Created March 16, 2021 19:24
Show Gist options
  • Save masahi/c0979c61907af15f9924b3b3d72fe6a7 to your computer and use it in GitHub Desktop.
Save masahi/c0979c61907af15f9924b3b3d72fe6a7 to your computer and use it in GitHub Desktop.
PrimFunc([argsort_gpu.v0, argsort_gpu.v2, argsort_gpu.v3, argsort_gpu.v1, i_0, any_dim]) attrs={"target": vulkan -keys=vulkan,gpu -max_num_threads=256, "tir.noalias": 1, "global_symbol": "fused_argsort_kernel2", "tir.device_thread_axis": [iter_var(threadIdx.x, , threadIdx.x), iter_var(blockIdx.x, , blockIdx.x), iter_var(blockIdx.y, , blockIdx.y), iter_var(blockIdx.z, , blockIdx.z)], "calling_conv": 2} {
// attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 256
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [first] storage_scope = "local"
allocate first[int64 * 1]
// attr [last] storage_scope = "local"
allocate last[int64 * 1]
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = max(1, int32(tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)1023), (int64)10)))
// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 1
// attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = max(1, int32((let rmod = (((int64(any_dim) + tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))) - (int64)1) % tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))) in (let rdiv = (((int64(any_dim) + tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))) - (int64)1)/tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))) in select((((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) >= (int64)0) && (rmod >= (int64)0)) || ((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) < (int64)0) && (rmod <= (int64)0))), rdiv, (rdiv - (int64)1))))))
if (((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) < int64(any_dim))) {
if ((int32(tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)1023), (int64)10)) == 1)) {
if (((i_0 % (int64)2) == (int64)0)) {
first[0] = max((int64)0, ((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))))
last[0] = min((int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8)), min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))))
while((first[0] < last[0])){
if ((argsort_gpu.v0[((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v0[(((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = ((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0])
last[0] = ((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - last[0])
for (i_1, (int64)0, min((((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) + min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))), tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) {
if ((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && ((last[0] < (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1))) && (last[0] < int64(any_dim))))) {
if ((argsort_gpu.v0[first[0]] <= argsort_gpu.v0[last[0]])) {
argsort_gpu.v2[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
} else if (((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim)))) {
argsort_gpu.v2[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_1)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
}
} else {
first[0] = max((int64)0, ((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))))
last[0] = min((int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8)), min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))))
while((first[0] < last[0])){
if ((argsort_gpu.v2[((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v2[(((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = ((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0])
last[0] = ((min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) - last[0])
for (i_2, (int64)0, min((((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) + min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))), tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) {
if ((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && ((last[0] < (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1))) && (last[0] < int64(any_dim))))) {
if ((argsort_gpu.v2[first[0]] <= argsort_gpu.v2[last[0]])) {
argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v2[first[0]]
argsort_gpu.v1[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v3[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v2[last[0]]
argsort_gpu.v1[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v3[last[0]]
last[0] = (last[0] + (int64)1)
}
} else if (((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim)))) {
argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v2[first[0]]
argsort_gpu.v1[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v3[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v2[last[0]]
argsort_gpu.v1[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + (int64(threadIdx.x)*tir.shift_right((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))) + (int64)255), (int64)8))) + i_2)] = argsort_gpu.v3[last[0]]
last[0] = (last[0] + (int64)1)
}
}
}
} else if (((i_0 % (int64)2) == (int64)0)) {
first[0] = max((int64)0, (((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))))
last[0] = min((int64(blockIdx.x)*(int64)1024), min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))))
while((first[0] < last[0])){
if ((argsort_gpu.v0[((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v0[((((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = max((int64)0, ((int64(threadIdx.x)*(int64)4) - min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)))
last[0] = min((int64(threadIdx.x)*(int64)4), min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024))
while((first[0] < last[0])){
if ((argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v0[((((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = (((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + first[0])
last[0] = (((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - last[0])
for (i_3, (int64)0, min(((min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024) + min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)) - (int64(threadIdx.x)*(int64)4)), (int64)4)) {
if (((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024)) && ((last[0] < ((((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) + (int64)1024) - last[0])) && ((last[0] < (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1))) && (last[0] < int64(any_dim)))))) {
if ((argsort_gpu.v0[first[0]] <= argsort_gpu.v0[last[0]])) {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
} else if ((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024))) {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_3)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
}
} else {
first[0] = max((int64)0, (((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim))))
last[0] = min((int64(blockIdx.x)*(int64)1024), min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))))
while((first[0] < last[0])){
if ((argsort_gpu.v2[((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
if (((i_0 % (int64)2) == (int64)0)) {
first[0] = max((int64)0, ((int64(threadIdx.x)*(int64)4) - min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)))
last[0] = min((int64(threadIdx.x)*(int64)4), min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024))
while((first[0] < last[0])){
if ((argsort_gpu.v0[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v0[((((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = (((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + first[0])
last[0] = (((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - last[0])
for (i_5, (int64)0, min(((min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024) + min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)) - (int64(threadIdx.x)*(int64)4)), (int64)4)) {
if (((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024)) && ((last[0] < ((((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) + (int64)1024) - last[0])) && ((last[0] < (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1))) && (last[0] < int64(any_dim)))))) {
if ((argsort_gpu.v0[first[0]] <= argsort_gpu.v0[last[0]])) {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
} else if ((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024))) {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v0[first[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v1[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v2[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v0[last[0]]
argsort_gpu.v3[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_5)] = argsort_gpu.v1[last[0]]
last[0] = (last[0] + (int64)1)
}
}
} else {
first[0] = max((int64)0, ((int64(threadIdx.x)*(int64)4) - min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)))
last[0] = min((int64(threadIdx.x)*(int64)4), min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024))
while((first[0] < last[0])){
if ((argsort_gpu.v2[(((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + tir.shift_right((first[0] + last[0]), (int64)1))] <= argsort_gpu.v2[((((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - tir.shift_right((first[0] + last[0]), (int64)1)) - (int64)1)])) {
first[0] = (tir.shift_right((first[0] + last[0]), (int64)1) + (int64)1)
} else {
last[0] = tir.shift_right((first[0] + last[0]), (int64)1)
}
}
first[0] = (((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)) + first[0]) + first[0])
last[0] = (((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - last[0]) - last[0])
for (i_6, (int64)0, min(((min((min(tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1), (int64(any_dim) - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) - first[0]), (int64)1024) + min((((min((tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1)), int64(any_dim)) + last[0]) - min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) - (int64(blockIdx.x)*(int64)1024)), (int64)1024)) - (int64(threadIdx.x)*(int64)4)), (int64)4)) {
if (((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024)) && ((last[0] < ((((int64(blockIdx.x)*(int64)1024) + min((tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))), int64(any_dim))) + (int64)1024) - last[0])) && ((last[0] < (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*(int64(blockIdx.z) + (int64)1))) && (last[0] < int64(any_dim)))))) {
if ((argsort_gpu.v2[first[0]] <= argsort_gpu.v2[last[0]])) {
argsort_gpu.v0[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v2[first[0]]
argsort_gpu.v1[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v3[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v0[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v2[last[0]]
argsort_gpu.v1[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v3[last[0]]
last[0] = (last[0] + (int64)1)
}
} else if ((((first[0] < (tir.shift_right(tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128))))), (int64)1) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z)))) && (first[0] < int64(any_dim))) && (((first[0] - (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) - first[0]) < (int64)1024))) {
argsort_gpu.v0[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v2[first[0]]
argsort_gpu.v1[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v3[first[0]]
first[0] = (first[0] + (int64)1)
} else {
argsort_gpu.v0[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v2[last[0]]
argsort_gpu.v1[((((int64(blockIdx.x)*(int64)1024) + (int64(threadIdx.x)*(int64)4)) + (tir.shift_left((int64)2, (i_0 + int64(tir.call_spirv_pure_glsl450((uint32)9, tir.call_spirv_pure_glsl450((uint32)30, 128)))))*int64(blockIdx.z))) + i_6)] = argsort_gpu.v3[last[0]]
last[0] = (last[0] + (int64)1)
}
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment