@jdavidberger
Created February 5, 2023 23:23
Status: Using GLEW 2.2.0
GL version: 3.1
GL Shading language version: OpenGL ES GLSL ES 3.10
Vendor: Panfrost
Renderer: Mali-G52 r1 (Panfrost) OpenGL ES 3.1 Mesa 22.3.4
Invocations size: 256
Work group size: 256 256 256
Work group count: 65535 65535 65535
Local size: 256
compute shader ----------
#define KERNEL compute_sp_v1
#define LOCAL_SIZE_X 256
#define DATATYPE float
#line 64
// Avoid auto-vectorization by using vector-width-locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 111
void compute_sp_v1()
{
    uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
    DATATYPE x = _A;
    DATATYPE y = DATATYPE(float(id) * SCALE);
    for(int i=0; i<128; i++)
    {
        MAD_16(x, y);
    }
    ptr[id] = y;
}
void main() { compute_sp_v1(); }
----------
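For reference (not part of the dump): after preprocessing, each MAD_4(x, y) in the kernel above expands into four chained mads, each one consuming the result of the previous, so there is nothing the compiler can fuse into a wider vector operation. A sketch of the expansion for this scalar variant, with mad(a,b,c) defined as (a*b+c):

    x = (y * x + y);
    y = (x * y + x);
    x = (y * x + y);
    y = (x * y + x);
    // MAD_16 repeats this 4x, so each loop iteration issues 16 dependent mads,
    // matching the chain of 16 back-to-back FMA.f32 instructions in the IR below.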
shader: MESA_SHADER_COMPUTE
source_sha1: {0xa4c5306d, 0xa0569abe, 0xd12df6c4, 0xb7754e85, 0x74e5b882}
name: GLSL2
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp float[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp float _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_4 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_5 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_6 = ishl ssa_4.y, ssa_5
vec1 32 ssa_7 = iadd ssa_4.x, ssa_6
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_78 = insert_u16 ssa_4.z, ssa_1
vec1 32 ssa_10 = iadd ssa_7, ssa_78
vec1 32 ssa_12 = u2f32 ssa_10
vec1 32 ssa_2 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_13 = fmul ssa_12, ssa_2
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_3 = load_const (0x00000080 = 0.000000)
vec1 32 ssa_11 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_14 = phi block_0: ssa_13, block_4: ssa_61
vec1 32 ssa_15 = phi block_0: ssa_11, block_4: ssa_62
vec1 32 ssa_16 = phi block_0: ssa_0, block_4: ssa_50
vec1 32 ssa_17 = ige32 ssa_16, ssa_3
/* succs: block_2 block_3 */
if ssa_17 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_76 = ffma ssa_14, ssa_15, ssa_14
vec1 32 ssa_75 = ffma ssa_76, ssa_14, ssa_76
vec1 32 ssa_74 = ffma ssa_75, ssa_76, ssa_75
vec1 32 ssa_73 = ffma ssa_74, ssa_75, ssa_74
vec1 32 ssa_72 = ffma ssa_73, ssa_74, ssa_73
vec1 32 ssa_71 = ffma ssa_72, ssa_73, ssa_72
vec1 32 ssa_70 = ffma ssa_71, ssa_72, ssa_71
vec1 32 ssa_69 = ffma ssa_70, ssa_71, ssa_70
vec1 32 ssa_68 = ffma ssa_69, ssa_70, ssa_69
vec1 32 ssa_67 = ffma ssa_68, ssa_69, ssa_68
vec1 32 ssa_66 = ffma ssa_67, ssa_68, ssa_67
vec1 32 ssa_65 = ffma ssa_66, ssa_67, ssa_66
vec1 32 ssa_64 = ffma ssa_65, ssa_66, ssa_65
vec1 32 ssa_63 = ffma ssa_64, ssa_65, ssa_64
vec1 32 ssa_62 = ffma ssa_63, ssa_64, ssa_63
vec1 32 ssa_61 = ffma ssa_62, ssa_63, ssa_62
vec1 32 ssa_50 = iadd ssa_16, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_51 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_52 = ishl ssa_10, ssa_51
vec1 64 ssa_53 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_54 = unpack_64_2x32_split_x ssa_53
vec1 32 ssa_55 = unpack_64_2x32_split_y ssa_53
vec1 32 ssa_56 = iadd ssa_54, ssa_52
vec1 32 ssa_57 = ult32 ssa_56, ssa_54
vec1 32 ssa_58 = b2i32 ssa_57
vec1 32 ssa_59 = iadd ssa_58, ssa_55
vec1 64 ssa_60 = pack_64_2x32_split ssa_56, ssa_59
intrinsic store_global (ssa_14, ssa_60) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
81 = MOV.i32 r62
80 = MOV.i32 r61
79 = MOV.i32 r60
6 = LSHIFT_OR.i32 80, #0x0, #0x8.b0
7 = IADD.s32 79, 6
78 = MKVEC.v2i16 #0x0.h00, 81.h00
10 = IADD.s32 7, 78
12 = U32_TO_F32 10
13 = FMA.f32 12, #0x2edbe6ff, #0x0.neg
} -> block1
block1 {
14 = PHI 13, 61
15 = PHI u1, 62
16 = PHI #0x0, 50
17 = ICMP.s32.m1.ge 16, #0x80
BRANCHZ.i16.eq 17.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
76 = FMA.f32 14, 15, 14
75 = FMA.f32 76, 14, 76
74 = FMA.f32 75, 76, 75
73 = FMA.f32 74, 75, 74
72 = FMA.f32 73, 74, 73
71 = FMA.f32 72, 73, 72
70 = FMA.f32 71, 72, 71
69 = FMA.f32 70, 71, 70
68 = FMA.f32 69, 70, 69
67 = FMA.f32 68, 69, 68
66 = FMA.f32 67, 68, 67
65 = FMA.f32 66, 67, 66
64 = FMA.f32 65, 66, 65
63 = FMA.f32 64, 65, 64
62 = FMA.f32 63, 64, 63
61 = FMA.f32 62, 63, 62
50 = IADD.s32 16, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
52 = LSHIFT_OR.i32 10, #0x0, #0x2.b0
56 = IADD.s32 u0, 52
58 = ICMP.u32.i1.lt 56, u0
59 = IADD.s32 58, u0[1]
STORE.i32 14, 56, 59, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = U32_TO_F32 r0
r1 = FMA.f32 r1, #0x2edbe6ff, #0x0.neg
r2 = MOV.i32 u1
r3 = MOV.i32 #0x0
} -> block1
block1 {
r4 = ICMP.s32.m1.ge r3, #0x80
BRANCHZ.i16.eq r4.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r2 = FMA.f32 r1, r2, r1
r1 = FMA.f32 r2, r1, r2
r3 = IADD.s32 r3, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r61, t, fau.x.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* NOP
+ _.h00 = U32_TO_F32 t1
* r1 = FMA.f32 t1, fau.y, t.neg
+ NOP
* NOP
+ r2 = MOV.i32 fau.x
* NOP
+ r3 = MOV.i32 fau.x
2edbe6ff00000008
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r3, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000080
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r2 = FMA.f32 r1, r2, r1
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
id(0) nbb r_uncond no_prefetch pcrel(1)
* r2 = FMA.f32 r1, r2, r1
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ NOP
* r1 = FMA.f32 t0, r1, t0
+ NOP
* r2 = FMA.f32 t0, r2, t0
+ r3 = IADD.s32 r3, fau.x
* r1 = FMA.f32 t0, r1, t0
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ NOP
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb r_uncond ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*NOP t0
+U32_TO_F32 t1, t1
*FMA.f32 r1:t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+NOP t1
*NOP t0
+MOV.i32 r2:t1, u1.w0
*NOP t0
+MOV.i32 r3:t1, #0.x
}
clause_5:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r3, 0x00000080 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_10
}
clause_8:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_23
}
clause_10:
ds(0) nbb ncph
{
*FMA.f32 r2:t0, r1, r2, r1
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
}
clause_16:
ds(0) nbb r_uncond
{
*FMA.f32 r2:t0, r1, r2, r1
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+NOP t1
*FMA.f32 r1:t0, t0, r1, t0
+NOP t1
*FMA.f32 r2:t0, t0, r2, t0
+IADD.s32 r3:t1, r3, 0x00000001 /* 0.000000 */
*FMA.f32 r1:t0, t0, r1, t0
+JUMP t1, clause_5
}
clause_23:
ds(0) eos store
{
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+NOP t1
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
shader: MESA_SHADER_FRAGMENT
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
name: GLSL0
inputs: 0
outputs: 1
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE vec4 gl_CurrentAttribFrag2MESA (0, 0, 0)
decl_var shader_out INTERP_MODE_NONE vec4 gl_FragData[0] (FRAG_RESULT_DATA0.xyzw, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec4 32 ssa_1 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=16)
intrinsic store_output (ssa_1, ssa_0) (base=0, wrmask=xyzw /*15*/, component=0, src_type=float32 /*160*/, io location=4 slots=1 /*132*/, xfb() /*0*/, xfb2() /*0*/) /* gl_FragData[0] */
/* succs: block_1 */
block block_1:
}
block0 {
6 = MOV.i32 r60
1 = COLLECT.i32 u0, u0[1], u1, u1[1]
7 = ATEST 6, u1[1], atest-param
8 = BLEND.f32 1, 7, blend_descriptor_0, blend_descriptor_0[1], _.h00, sr_count:4, sr_count_2:2
}
block0 {
r3 = MOV.i32 u1[1]
r60 = ATEST r60, r3, atest-param
r2 = MOV.i32 u1
r0 = MOV.i32 u0
r1 = MOV.i32 u0[1]
r48 = BLEND.f32 r0, r60, blend_descriptor_0, blend_descriptor_0[1], _.h00, sr_count:4, sr_count_2:2
}
block0 {
id(0) wait(6 ) nbb td
* NOP
+ r3 = MOV.i32 fau.y
* NOP
+ r60 = ATEST r60, t1, fau.x
id(0) wait(0 6 7 ) nbb r_uncond td
* NOP
+ r2 = MOV.i32 fau.x
* r0 = MOV.i32 fau.x
+ r1 = MOV.i32 fau.y
* NOP
+ r48 = BLEND.f32 r0, r60, fau.x, fau.y, _.h00, sr_count:4, sr_count_2:2
}
slot 0 reads: r0 r1 r2 r3
clause_0:
ds(0) nbb atest td ncph next_blend dwb(0, 6, 7)
{
*NOP t0
+MOV.i32 r3:t1, u1.w1
*NOP t0
+ATEST r60:t1, r60, t1, @r60
}
clause_2:
ds(0) eos blend td
{
*NOP t0
+MOV.i32 r2:t1, u1.w0
*MOV.i32 r0:t0, u0.w0
+MOV.i32 r1:t1, u0.w1
*NOP t0
+BLEND r48:t1, r60, blend_descriptor_0.x, blend_descriptor_0.y, @r0
}
shader: MESA_SHADER_VERTEX
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
name: ARB0
inputs: 0
outputs: 1
uniforms: 5
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE vec4[5] state.matrix.mvp.transpose.row[0] (0, 0, 0)
decl_var shader_out INTERP_MODE_SMOOTH vec4 out_0 (VARYING_SLOT_POS.xyzw, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[5] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec4 32 ssa_1 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=4, align_offset=0, range_base=0, range=80)
vec1 32 ssa_2 = load_const (0x00000040 = 0.000000)
vec4 32 ssa_3 = intrinsic load_ubo (ssa_0, ssa_2) (access=0, align_mul=4, align_offset=0, range_base=0, range=80)
vec1 32 ssa_4 = fmul ssa_3.x, ssa_1.x
vec1 32 ssa_5 = fmul ssa_3.x, ssa_1.y
vec1 32 ssa_6 = fmul ssa_3.x, ssa_1.z
vec1 32 ssa_7 = fmul ssa_3.x, ssa_1.w
vec1 32 ssa_8 = load_const (0x00000010 = 0.000000)
vec4 32 ssa_9 = intrinsic load_ubo (ssa_0, ssa_8) (access=0, align_mul=4, align_offset=0, range_base=0, range=80)
vec1 32 ssa_10 = ffma ssa_3.y, ssa_9.x, ssa_4
vec1 32 ssa_11 = ffma ssa_3.y, ssa_9.y, ssa_5
vec1 32 ssa_12 = ffma ssa_3.y, ssa_9.z, ssa_6
vec1 32 ssa_13 = ffma ssa_3.y, ssa_9.w, ssa_7
vec1 32 ssa_14 = load_const (0x00000020 = 0.000000)
vec4 32 ssa_15 = intrinsic load_ubo (ssa_0, ssa_14) (access=0, align_mul=4, align_offset=0, range_base=0, range=80)
vec1 32 ssa_16 = ffma ssa_3.z, ssa_15.x, ssa_10
vec1 32 ssa_17 = ffma ssa_3.z, ssa_15.y, ssa_11
vec1 32 ssa_18 = ffma ssa_3.z, ssa_15.z, ssa_12
vec1 32 ssa_19 = ffma ssa_3.z, ssa_15.w, ssa_13
vec1 32 ssa_20 = load_const (0x00000030 = 0.000000)
vec4 32 ssa_21 = intrinsic load_ubo (ssa_0, ssa_20) (access=0, align_mul=4, align_offset=0, range_base=0, range=80)
vec1 32 ssa_22 = ffma ssa_3.w, ssa_21.x, ssa_16
vec1 32 ssa_23 = ffma ssa_3.w, ssa_21.y, ssa_17
vec1 32 ssa_24 = ffma ssa_3.w, ssa_21.z, ssa_18
vec1 32 ssa_25 = ffma ssa_3.w, ssa_21.w, ssa_19
vec3 32 ssa_26 = intrinsic load_viewport_scale () ()
vec3 32 ssa_27 = intrinsic load_viewport_offset () ()
vec1 32 ssa_28 = frcp ssa_25
vec1 32 ssa_29 = fmul ssa_22, ssa_28
vec1 32 ssa_30 = fmul ssa_23, ssa_28
vec1 32 ssa_31 = fmul ssa_24, ssa_28
vec1 32 ssa_32 = ffma ssa_29, ssa_26.x, ssa_27.x
vec1 32 ssa_33 = ffma ssa_30, ssa_26.y, ssa_27.y
vec1 32 ssa_34 = ffma ssa_31, ssa_26.z, ssa_27.z
vec4 32 ssa_35 = vec4 ssa_32, ssa_33, ssa_34, ssa_28
intrinsic store_output (ssa_35, ssa_0) (base=0, wrmask=xyzw /*15*/, component=0, src_type=float32 /*160*/, io location=0 slots=1 /*128*/, xfb() /*0*/, xfb2() /*0*/) /* out_0 */
/* succs: block_1 */
block block_1:
}
block0 {
63 = MOV.i32 r59
62 = MOV.i32 r58
4 = FMA.f32 u3[1], u3, #0x0.neg
5 = FMA.f32 u3[1], u4, #0x0.neg
6 = FMA.f32 u3[1], u4[1], #0x0.neg
7 = FMA.f32 u3[1], u11, #0x0.neg
10 = FMA.f32 u5[1], u5, 4
11 = FMA.f32 u5[1], u6, 5
12 = FMA.f32 u5[1], u6[1], 6
13 = FMA.f32 u5[1], u11[1], 7
16 = FMA.f32 u7[1], u7, 10
17 = FMA.f32 u7[1], u8, 11
18 = FMA.f32 u7[1], u8[1], 12
19 = FMA.f32 u7[1], u12, 13
22 = FMA.f32 u9[1], u9, 16
23 = FMA.f32 u9[1], u10, 17
24 = FMA.f32 u9[1], u10[1], 18
25 = FMA.f32 u9[1], u12[1], 19
28 = FRCP.f32 25
29 = FMA.f32 22, 28, #0x0.neg
30 = FMA.f32 23, 28, #0x0.neg
31 = FMA.f32 24, 28, #0x0.neg
32 = FMA.f32 29, u0, u0[1]
33 = FMA.f32 30, u1, u1[1]
34 = FMA.f32 31, u2, u2[1]
35 = COLLECT.i32 32, 33, 34, 28
ST_CVT.f32.v4 35, 62, 63, #0x105e000
}
block0 {
r0 = MOV.i32 u11
r0 = FMA.f32 u3[1], r0, #0x0.neg
r1 = MOV.i32 u11[1]
r0 = FMA.f32 u5[1], r1, r0
r1 = MOV.i32 u4[1]
r1 = FMA.f32 u3[1], r1, #0x0.neg
r2 = MOV.i32 u6[1]
r1 = FMA.f32 u5[1], r2, r1
r2 = MOV.i32 u4
r2 = FMA.f32 u3[1], r2, #0x0.neg
r3 = MOV.i32 u6
r2 = FMA.f32 u5[1], r3, r2
r3 = MOV.i32 u12
r0 = FMA.f32 u7[1], r3, r0
r3 = MOV.i32 u8[1]
r1 = FMA.f32 u7[1], r3, r1
r3 = MOV.i32 u8
r2 = FMA.f32 u7[1], r3, r2
r3 = MOV.i32 u12[1]
r0 = FMA.f32 u9[1], r3, r0
r3 = MOV.i32 u10[1]
r1 = FMA.f32 u9[1], r3, r1
r3 = MOV.i32 u10
r2 = FMA.f32 u9[1], r3, r2
r3 = FMA.f32 u3[1], u3, #0x0.neg
r3 = FMA.f32 u5[1], u5, r3
r3 = FMA.f32 u7[1], u7, r3
r0 = FRCP.f32 r0
r3 = FMA.f32 u9[1], u9, r3
r1 = FMA.f32 r1, r0, #0x0.neg
r2 = FMA.f32 r2, r0, #0x0.neg
r3 = FMA.f32 r3, r0, #0x0.neg
r1 = FMA.f32 r1, u2, u2[1]
r2 = FMA.f32 r2, u1, u1[1]
r3 = FMA.f32 r3, u0, u0[1]
r4 = MOV.i32 r2
r5 = MOV.i32 r1
r6 = MOV.i32 r0
ST_CVT.f32.v4 r3, r58, r59, #0x105e000
}
block0 {
id(0) nbb
* NOP
+ _.h00 = MOV.i32 fau.x
* r0 = FMA.f32 fau.y, t1, t.neg
+ NOP
* NOP
+ r1 = MOV.i32 fau.y
id(0) nbb
* r0 = FMA.f32 fau.y, r1, r0
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.y
* r1 = FMA.f32 fau.y, t1, t.neg
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.y
* r1 = FMA.f32 fau.y, t1, r1
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.x
* r2 = FMA.f32 fau.y, t1, t.neg
+ NOP
* NOP
+ r3 = MOV.i32 fau.x
id(0) nbb
* r2 = FMA.f32 fau.y, r3, r2
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.x
* r0 = FMA.f32 fau.y, t1, r0
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.y
* r1 = FMA.f32 fau.y, t1, r1
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.x
* r2 = FMA.f32 fau.y, t1, r2
+ NOP
* NOP
+ r3 = MOV.i32 fau.y
id(0) nbb
* r0 = FMA.f32 fau.y, r3, r0
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.y
* r1 = FMA.f32 fau.y, t1, r1
+ NOP
* NOP
+ _.h00 = MOV.i32 fau.x
* r2 = FMA.f32 fau.y, t1, r2
+ NOP
* _.h00 = FMA.f32 fau.y, fau.x, t.neg
+ NOP
* _.h00 = FMA.f32 fau.y, fau.x, t0
+ NOP
* r3 = FMA.f32 fau.y, fau.x, t0
+ NOP
id(0) wait(0 ) nbb r_uncond
* r3 = FMA.f32 fau.y, fau.x, r3
+ r0 = FRCP.f32 r0
* r1 = FMA.f32 r1, t1, t.neg
+ NOP
* r2 = FMA.f32 r2, r0, t.neg
+ NOP
* r3 = FMA.f32 r3, r0, t.neg
+ NOP
* _.h00 = FMA.f32 r2, fau.x, fau.y
+ r4 = MOV.i32 t
* _.h00 = FMA.f32 r1, fau.x, fau.y
+ r5 = MOV.i32 t
* r3 = FMA.f32 r3, fau.x, fau.y
+ r6 = MOV.i32 r0
* NOP
+ ST_CVT.f32.v4 r3, r58, r59, fau.y
105e00000000000
}
slot 0 reads: r3 r4 r5 r6
clause_0:
ds(0) nbb ncph
{
*NOP t0
+MOV.i32 t1, u11.w0
*FMA.f32 r0:t0, u3.w1, t1, #0.neg
+NOP t1
*NOP t0
+MOV.i32 r1:t1, u11.w1
}
clause_3:
ds(0) nbb ncph
{
*FMA.f32 r0:t0, u5.w1, r1, r0
+NOP t1
*NOP t0
+MOV.i32 t1, u4.w1
*FMA.f32 r1:t0, u3.w1, t1, #0.neg
+NOP t1
*NOP t0
+MOV.i32 t1, u6.w1
*FMA.f32 r1:t0, u5.w1, t1, r1
+NOP t1
*NOP t0
+MOV.i32 t1, u4.w0
*FMA.f32 r2:t0, u3.w1, t1, #0.neg
+NOP t1
*NOP t0
+MOV.i32 r3:t1, u6.w0
}
clause_9:
ds(0) nbb ncph
{
*FMA.f32 r2:t0, u5.w1, r3, r2
+NOP t1
*NOP t0
+MOV.i32 t1, u12.w0
*FMA.f32 r0:t0, u7.w1, t1, r0
+NOP t1
*NOP t0
+MOV.i32 t1, u8.w1
*FMA.f32 r1:t0, u7.w1, t1, r1
+NOP t1
*NOP t0
+MOV.i32 t1, u8.w0
*FMA.f32 r2:t0, u7.w1, t1, r2
+NOP t1
*NOP t0
+MOV.i32 r3:t1, u12.w1
}
clause_15:
ds(0) nbb ncph next_store dwb(0)
{
*FMA.f32 r0:t0, u9.w1, r3, r0
+NOP t1
*NOP t0
+MOV.i32 t1, u10.w1
*FMA.f32 r1:t0, u9.w1, t1, r1
+NOP t1
*NOP t0
+MOV.i32 t1, u10.w0
*FMA.f32 r2:t0, u9.w1, t1, r2
+NOP t1
*FMA.f32 t0, u3.w1, u3.w0, #0.neg
+NOP t1
*FMA.f32 t0, u5.w1, u5.w0, t0
+NOP t1
*FMA.f32 r3:t0, u7.w1, u7.w0, t0
+NOP t1
}
clause_21:
ds(0) eos store
{
*FMA.f32 r3:t0, u9.w1, u9.w0, r3
+FRCP.f32 r0:t1, r0
*FMA.f32 r1:t0, r1, t1, #0.neg
+NOP t1
*FMA.f32 r2:t0, r2, r0, #0.neg
+NOP t1
*FMA.f32 r3:t0, r3, r0, #0.neg
+NOP t1
*FMA.f32 t0, r2, u1.w0, u1.w1
+MOV.i32 r4:t1, t
*FMA.f32 t0, r1, u2.w0, u2.w1
+MOV.i32 r5:t1, t
*FMA.f32 r3:t0, r3, u0.w0, u0.w1
+MOV.i32 r6:t1, r0
*NOP t0
+ST_CVT.v4 t1, r58, r59, 0x0105e000 /* 0.000000 */, @r3
}
shader: MESA_SHADER_VERTEX
source_sha1: {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}
name: ARB0
inputs: 0
outputs: 1
uniforms: 5
ubos: 1
shared: 0
ray queries: 0
decl_var uniform INTERP_MODE_NONE vec4[5] state.matrix.mvp.transpose.row[0] (0, 0, 0)
decl_var shader_out INTERP_MODE_SMOOTH vec4 out_0 (VARYING_SLOT_POS.xyzw, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[5] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
/* succs: block_1 */
block block_1:
}
block0 {
}
block0 {
}
block0 {
}
ccdd1a79 compute_sp_v1_float 22.094 GFLOPs 12.150ms
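Rough sanity check of the figure above, assuming each mad counts as 2 FLOPs and the reported time covers the whole dispatch (the dispatch dimensions themselves are not shown in this dump):

    FLOPs per invocation = 16 mads per MAD_16 * 2 FLOPs per mad * 128 iterations = 4096
    GFLOPs ~= invocations * 4096 / time_in_seconds / 1e9

At 22.094 GFLOPs over 12.150 ms this works out to roughly 65,536 invocations (256 * 256), consistent with the 256-wide local size used above.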
compute shader ----------
#define KERNEL compute_sp_v2
#define LOCAL_SIZE_X 256
#define DATATYPE float
#line 64
// Avoid auto-vectorization by using vector-width-locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 128
void compute_sp_v2()
{
    uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
    vec2 x = vec2(_A, (_A+DATATYPE(1)));
    vec2 y = vec2((float(id) * SCALE), (float(id) * SCALE));
    for(int i=0; i<64; i++)
    {
        MAD_16(x, y);
    }
    ptr[id] = (y.x) + (y.y);
}
void main() { compute_sp_v2(); }
----------
shader: MESA_SHADER_COMPUTE
source_sha1: {0xc009c35c, 0x704fbf5f, 0x33c55587, 0x620877d8, 0x4a008af3}
name: GLSL4
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp float[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp float _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_5 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_6 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_7 = ishl ssa_5.y, ssa_6
vec1 32 ssa_8 = iadd ssa_5.x, ssa_7
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_131 = insert_u16 ssa_5.z, ssa_1
vec1 32 ssa_11 = iadd ssa_8, ssa_131
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_12 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_2 = load_const (0x3f800000 = 1.000000)
vec1 32 ssa_13 = fadd ssa_12, ssa_2
vec1 32 ssa_14 = u2f32 ssa_11
vec1 32 ssa_3 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_15 = fmul ssa_14, ssa_3
vec1 32 ssa_4 = load_const (0x00000040 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_16 = phi block_0: ssa_15, block_4: ssa_99
vec1 32 ssa_17 = phi block_0: ssa_15, block_4: ssa_98
vec1 32 ssa_18 = phi block_0: ssa_12, block_4: ssa_101
vec1 32 ssa_19 = phi block_0: ssa_13, block_4: ssa_100
vec1 32 ssa_20 = phi block_0: ssa_0, block_4: ssa_86
vec1 32 ssa_21 = ige32 ssa_20, ssa_4
/* succs: block_2 block_3 */
if ssa_21 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_129 = ffma ssa_16, ssa_18, ssa_16
vec1 32 ssa_128 = ffma ssa_17, ssa_19, ssa_17
vec1 32 ssa_127 = ffma ssa_129, ssa_16, ssa_129
vec1 32 ssa_126 = ffma ssa_128, ssa_17, ssa_128
vec1 32 ssa_125 = ffma ssa_127, ssa_129, ssa_127
vec1 32 ssa_124 = ffma ssa_126, ssa_128, ssa_126
vec1 32 ssa_123 = ffma ssa_125, ssa_127, ssa_125
vec1 32 ssa_122 = ffma ssa_124, ssa_126, ssa_124
vec1 32 ssa_121 = ffma ssa_123, ssa_125, ssa_123
vec1 32 ssa_120 = ffma ssa_122, ssa_124, ssa_122
vec1 32 ssa_119 = ffma ssa_121, ssa_123, ssa_121
vec1 32 ssa_118 = ffma ssa_120, ssa_122, ssa_120
vec1 32 ssa_117 = ffma ssa_119, ssa_121, ssa_119
vec1 32 ssa_116 = ffma ssa_118, ssa_120, ssa_118
vec1 32 ssa_115 = ffma ssa_117, ssa_119, ssa_117
vec1 32 ssa_114 = ffma ssa_116, ssa_118, ssa_116
vec1 32 ssa_113 = ffma ssa_115, ssa_117, ssa_115
vec1 32 ssa_112 = ffma ssa_114, ssa_116, ssa_114
vec1 32 ssa_111 = ffma ssa_113, ssa_115, ssa_113
vec1 32 ssa_110 = ffma ssa_112, ssa_114, ssa_112
vec1 32 ssa_109 = ffma ssa_111, ssa_113, ssa_111
vec1 32 ssa_108 = ffma ssa_110, ssa_112, ssa_110
vec1 32 ssa_107 = ffma ssa_109, ssa_111, ssa_109
vec1 32 ssa_106 = ffma ssa_108, ssa_110, ssa_108
vec1 32 ssa_105 = ffma ssa_107, ssa_109, ssa_107
vec1 32 ssa_104 = ffma ssa_106, ssa_108, ssa_106
vec1 32 ssa_103 = ffma ssa_105, ssa_107, ssa_105
vec1 32 ssa_102 = ffma ssa_104, ssa_106, ssa_104
vec1 32 ssa_101 = ffma ssa_103, ssa_105, ssa_103
vec1 32 ssa_100 = ffma ssa_102, ssa_104, ssa_102
vec1 32 ssa_99 = ffma ssa_101, ssa_103, ssa_101
vec1 32 ssa_98 = ffma ssa_100, ssa_102, ssa_100
vec1 32 ssa_86 = iadd ssa_20, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_87 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_88 = ishl ssa_11, ssa_87
vec1 32 ssa_89 = fadd ssa_16, ssa_17
vec1 64 ssa_90 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_91 = unpack_64_2x32_split_x ssa_90
vec1 32 ssa_92 = unpack_64_2x32_split_y ssa_90
vec1 32 ssa_93 = iadd ssa_91, ssa_88
vec1 32 ssa_94 = ult32 ssa_93, ssa_91
vec1 32 ssa_95 = b2i32 ssa_94
vec1 32 ssa_96 = iadd ssa_95, ssa_92
vec1 64 ssa_97 = pack_64_2x32_split ssa_93, ssa_96
intrinsic store_global (ssa_89, ssa_97) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
134 = MOV.i32 r62
133 = MOV.i32 r61
132 = MOV.i32 r60
7 = LSHIFT_OR.i32 133, #0x0, #0x8.b0
8 = IADD.s32 132, 7
131 = MKVEC.v2i16 #0x0.h00, 134.h00
11 = IADD.s32 8, 131
13 = FADD.f32 u1, #0x3f800000
14 = U32_TO_F32 11
15 = FMA.f32 14, #0x2edbe6ff, #0x0.neg
} -> block1
block1 {
16 = PHI 15, 99
17 = PHI 15, 98
18 = PHI u1, 101
19 = PHI 13, 100
20 = PHI #0x0, 86
21 = ICMP.s32.m1.ge 20, #0x40
BRANCHZ.i16.eq 21.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
129 = FMA.f32 16, 18, 16
128 = FMA.f32 17, 19, 17
127 = FMA.f32 129, 16, 129
126 = FMA.f32 128, 17, 128
125 = FMA.f32 127, 129, 127
124 = FMA.f32 126, 128, 126
123 = FMA.f32 125, 127, 125
122 = FMA.f32 124, 126, 124
121 = FMA.f32 123, 125, 123
120 = FMA.f32 122, 124, 122
119 = FMA.f32 121, 123, 121
118 = FMA.f32 120, 122, 120
117 = FMA.f32 119, 121, 119
116 = FMA.f32 118, 120, 118
115 = FMA.f32 117, 119, 117
114 = FMA.f32 116, 118, 116
113 = FMA.f32 115, 117, 115
112 = FMA.f32 114, 116, 114
111 = FMA.f32 113, 115, 113
110 = FMA.f32 112, 114, 112
109 = FMA.f32 111, 113, 111
108 = FMA.f32 110, 112, 110
107 = FMA.f32 109, 111, 109
106 = FMA.f32 108, 110, 108
105 = FMA.f32 107, 109, 107
104 = FMA.f32 106, 108, 106
103 = FMA.f32 105, 107, 105
102 = FMA.f32 104, 106, 104
101 = FMA.f32 103, 105, 103
100 = FMA.f32 102, 104, 102
99 = FMA.f32 101, 103, 101
98 = FMA.f32 100, 102, 100
86 = IADD.s32 20, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
88 = LSHIFT_OR.i32 11, #0x0, #0x2.b0
89 = FADD.f32 16, 17
93 = IADD.s32 u0, 88
95 = ICMP.u32.i1.lt 93, u0
96 = IADD.s32 95, u0[1]
STORE.i32 89, 93, 96, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x3f800000
r1 = FADD.f32 u1, r1
r2 = U32_TO_F32 r0
r2 = FMA.f32 r2, #0x2edbe6ff, #0x0.neg
r3 = MOV.i32 r2
r4 = MOV.i32 u1
r5 = MOV.i32 #0x0
} -> block1
block1 {
r6 = ICMP.s32.m1.ge r5, #0x40
BRANCHZ.i16.eq r6.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r4 = FMA.f32 r2, r4, r2
r1 = FMA.f32 r3, r1, r3
r2 = FMA.f32 r4, r2, r4
r3 = FMA.f32 r1, r3, r1
r5 = IADD.s32 r5, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = FADD.f32 r2, r3
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* NOP
+ _.h00 = U32_TO_F32 t1
* r2 = FMA.f32 t1, fau.x, t.neg
+ _.h00 = MOV.i32 fau.y
* NOP
+ r1 = FADD.f32 fau.x, t1
* r3 = MOV.i32 r2
+ r4 = MOV.i32 fau.x
* NOP
+ r5 = MOV.i32 fau.x
800000000 3f8000002edbe6ff
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r5, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000040
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
id(0) nbb
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
id(0) nbb
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
id(0) nbb r_uncond no_prefetch pcrel(1)
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ NOP
* r3 = FMA.f32 r1, r3, r1
+ NOP
* r4 = FMA.f32 r2, r4, r2
+ NOP
* r1 = FMA.f32 r3, r1, r3
+ NOP
* r2 = FMA.f32 r4, r2, r4
+ r5 = IADD.s32 r5, fau.x
* r3 = FMA.f32 r1, r3, r1
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ NOP
* NOP
+ r0 = IADD.s32 fau.x, t0
* r1 = FADD.f32 r2, r3
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb r_uncond ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*NOP t0
+U32_TO_F32 t1, t1
*FMA.f32 r2:t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+MOV.i32 t1, 0x3f800000 /* 1.000000 */
*NOP t0
+FADD.f32 r1:t1, u1.w0, t1
*MOV.i32 r3:t0, r2
+MOV.i32 r4:t1, u1.w0
*NOP t0
+MOV.i32 r5:t1, #0.x
}
clause_6:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r5, 0x00000040 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_11
}
clause_9:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_36
}
clause_11:
ds(0) nbb ncph
{
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
}
clause_17:
ds(0) nbb ncph
{
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
}
clause_23:
ds(0) nbb ncph
{
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
}
clause_29:
ds(0) nbb r_uncond
{
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+NOP t1
*FMA.f32 r3:t0, r1, r3, r1
+NOP t1
*FMA.f32 r4:t0, r2, r4, r2
+NOP t1
*FMA.f32 r1:t0, r3, r1, r3
+NOP t1
*FMA.f32 r2:t0, r4, r2, r4
+IADD.s32 r5:t1, r5, 0x00000001 /* 0.000000 */
*FMA.f32 r3:t0, r1, r3, r1
+JUMP t1, clause_6
}
clause_36:
ds(0) eos store
{
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+NOP t1
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*FADD.f32 r1:t0, r2, r3
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
36483168 compute_sp_v2_float 23.489 GFLOPs 11.428ms
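Note that the per-invocation work is identical across the variants: v2 runs 64 iterations over 2 lanes, v4 (below) 32 iterations over 4 lanes, and v8 16 iterations over 8 lanes, so each variant still performs 16 * 2 * lanes * iterations = 4096 FLOPs per invocation and the GFLOPs figures are directly comparable; what changes is only the number of independent FMA chains the compiler has available to interleave.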
compute shader ----------
#define KERNEL compute_sp_v4
#define LOCAL_SIZE_X 256
#define DATATYPE float
#line 64
// Avoid auto-vectorization by using vector-width-locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 145
void compute_sp_v4()
{
    uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
    vec4 x = vec4(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)));
    vec4 y = vec4((float(id) * SCALE), (float(id) * SCALE), (float(id) * SCALE), (float(id) * SCALE));
    for(int i=0; i<32; i++)
    {
        MAD_16(x, y);
    }
    ptr[id] = (y.x) + (y.y) + (y.z) + (y.w);
}
void main() { compute_sp_v4(); }
----------
shader: MESA_SHADER_COMPUTE
source_sha1: {0x48ee6346, 0xd3c6fdf4, 0x78082e2c, 0xa4f173ed, 0x74ccc824}
name: GLSL6
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp float[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp float _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_7 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_8 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_9 = ishl ssa_7.y, ssa_8
vec1 32 ssa_10 = iadd ssa_7.x, ssa_9
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_237 = insert_u16 ssa_7.z, ssa_1
vec1 32 ssa_13 = iadd ssa_10, ssa_237
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_14 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_2 = load_const (0x3f800000 = 1.000000)
vec1 32 ssa_15 = fadd ssa_14, ssa_2
vec1 32 ssa_3 = load_const (0x40000000 = 2.000000)
vec1 32 ssa_16 = fadd ssa_14, ssa_3
vec1 32 ssa_4 = load_const (0x40400000 = 3.000000)
vec1 32 ssa_17 = fadd ssa_14, ssa_4
vec1 32 ssa_18 = u2f32 ssa_13
vec1 32 ssa_5 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_19 = fmul ssa_18, ssa_5
vec1 32 ssa_6 = load_const (0x00000020 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_20 = phi block_0: ssa_19, block_4: ssa_175
vec1 32 ssa_21 = phi block_0: ssa_19, block_4: ssa_174
vec1 32 ssa_22 = phi block_0: ssa_19, block_4: ssa_173
vec1 32 ssa_23 = phi block_0: ssa_19, block_4: ssa_172
vec1 32 ssa_24 = phi block_0: ssa_14, block_4: ssa_179
vec1 32 ssa_25 = phi block_0: ssa_15, block_4: ssa_178
vec1 32 ssa_26 = phi block_0: ssa_16, block_4: ssa_177
vec1 32 ssa_27 = phi block_0: ssa_17, block_4: ssa_176
vec1 32 ssa_28 = phi block_0: ssa_0, block_4: ssa_158
vec1 32 ssa_29 = ige32 ssa_28, ssa_6
/* succs: block_2 block_3 */
if ssa_29 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_235 = ffma ssa_20, ssa_24, ssa_20
vec1 32 ssa_234 = ffma ssa_21, ssa_25, ssa_21
vec1 32 ssa_233 = ffma ssa_22, ssa_26, ssa_22
vec1 32 ssa_232 = ffma ssa_23, ssa_27, ssa_23
vec1 32 ssa_231 = ffma ssa_235, ssa_20, ssa_235
vec1 32 ssa_230 = ffma ssa_234, ssa_21, ssa_234
vec1 32 ssa_229 = ffma ssa_233, ssa_22, ssa_233
vec1 32 ssa_228 = ffma ssa_232, ssa_23, ssa_232
vec1 32 ssa_227 = ffma ssa_231, ssa_235, ssa_231
vec1 32 ssa_226 = ffma ssa_230, ssa_234, ssa_230
vec1 32 ssa_225 = ffma ssa_229, ssa_233, ssa_229
vec1 32 ssa_224 = ffma ssa_228, ssa_232, ssa_228
vec1 32 ssa_223 = ffma ssa_227, ssa_231, ssa_227
vec1 32 ssa_222 = ffma ssa_226, ssa_230, ssa_226
vec1 32 ssa_221 = ffma ssa_225, ssa_229, ssa_225
vec1 32 ssa_220 = ffma ssa_224, ssa_228, ssa_224
vec1 32 ssa_219 = ffma ssa_223, ssa_227, ssa_223
vec1 32 ssa_218 = ffma ssa_222, ssa_226, ssa_222
vec1 32 ssa_217 = ffma ssa_221, ssa_225, ssa_221
vec1 32 ssa_216 = ffma ssa_220, ssa_224, ssa_220
vec1 32 ssa_215 = ffma ssa_219, ssa_223, ssa_219
vec1 32 ssa_214 = ffma ssa_218, ssa_222, ssa_218
vec1 32 ssa_213 = ffma ssa_217, ssa_221, ssa_217
vec1 32 ssa_212 = ffma ssa_216, ssa_220, ssa_216
vec1 32 ssa_211 = ffma ssa_215, ssa_219, ssa_215
vec1 32 ssa_210 = ffma ssa_214, ssa_218, ssa_214
vec1 32 ssa_209 = ffma ssa_213, ssa_217, ssa_213
vec1 32 ssa_208 = ffma ssa_212, ssa_216, ssa_212
vec1 32 ssa_207 = ffma ssa_211, ssa_215, ssa_211
vec1 32 ssa_206 = ffma ssa_210, ssa_214, ssa_210
vec1 32 ssa_205 = ffma ssa_209, ssa_213, ssa_209
vec1 32 ssa_204 = ffma ssa_208, ssa_212, ssa_208
vec1 32 ssa_203 = ffma ssa_207, ssa_211, ssa_207
vec1 32 ssa_202 = ffma ssa_206, ssa_210, ssa_206
vec1 32 ssa_201 = ffma ssa_205, ssa_209, ssa_205
vec1 32 ssa_200 = ffma ssa_204, ssa_208, ssa_204
vec1 32 ssa_199 = ffma ssa_203, ssa_207, ssa_203
vec1 32 ssa_198 = ffma ssa_202, ssa_206, ssa_202
vec1 32 ssa_197 = ffma ssa_201, ssa_205, ssa_201
vec1 32 ssa_196 = ffma ssa_200, ssa_204, ssa_200
vec1 32 ssa_195 = ffma ssa_199, ssa_203, ssa_199
vec1 32 ssa_194 = ffma ssa_198, ssa_202, ssa_198
vec1 32 ssa_193 = ffma ssa_197, ssa_201, ssa_197
vec1 32 ssa_192 = ffma ssa_196, ssa_200, ssa_196
vec1 32 ssa_191 = ffma ssa_195, ssa_199, ssa_195
vec1 32 ssa_190 = ffma ssa_194, ssa_198, ssa_194
vec1 32 ssa_189 = ffma ssa_193, ssa_197, ssa_193
vec1 32 ssa_188 = ffma ssa_192, ssa_196, ssa_192
vec1 32 ssa_187 = ffma ssa_191, ssa_195, ssa_191
vec1 32 ssa_186 = ffma ssa_190, ssa_194, ssa_190
vec1 32 ssa_185 = ffma ssa_189, ssa_193, ssa_189
vec1 32 ssa_184 = ffma ssa_188, ssa_192, ssa_188
vec1 32 ssa_183 = ffma ssa_187, ssa_191, ssa_187
vec1 32 ssa_182 = ffma ssa_186, ssa_190, ssa_186
vec1 32 ssa_181 = ffma ssa_185, ssa_189, ssa_185
vec1 32 ssa_180 = ffma ssa_184, ssa_188, ssa_184
vec1 32 ssa_179 = ffma ssa_183, ssa_187, ssa_183
vec1 32 ssa_178 = ffma ssa_182, ssa_186, ssa_182
vec1 32 ssa_177 = ffma ssa_181, ssa_185, ssa_181
vec1 32 ssa_176 = ffma ssa_180, ssa_184, ssa_180
vec1 32 ssa_175 = ffma ssa_179, ssa_183, ssa_179
vec1 32 ssa_174 = ffma ssa_178, ssa_182, ssa_178
vec1 32 ssa_173 = ffma ssa_177, ssa_181, ssa_177
vec1 32 ssa_172 = ffma ssa_176, ssa_180, ssa_176
vec1 32 ssa_158 = iadd ssa_28, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_159 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_160 = ishl ssa_13, ssa_159
vec1 32 ssa_161 = fadd ssa_20, ssa_21
vec1 32 ssa_162 = fadd ssa_161, ssa_22
vec1 32 ssa_163 = fadd ssa_162, ssa_23
vec1 64 ssa_164 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_165 = unpack_64_2x32_split_x ssa_164
vec1 32 ssa_166 = unpack_64_2x32_split_y ssa_164
vec1 32 ssa_167 = iadd ssa_165, ssa_160
vec1 32 ssa_168 = ult32 ssa_167, ssa_165
vec1 32 ssa_169 = b2i32 ssa_168
vec1 32 ssa_170 = iadd ssa_169, ssa_166
vec1 64 ssa_171 = pack_64_2x32_split ssa_167, ssa_170
intrinsic store_global (ssa_163, ssa_171) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
240 = MOV.i32 r62
239 = MOV.i32 r61
238 = MOV.i32 r60
9 = LSHIFT_OR.i32 239, #0x0, #0x8.b0
10 = IADD.s32 238, 9
237 = MKVEC.v2i16 #0x0.h00, 240.h00
13 = IADD.s32 10, 237
15 = FADD.f32 u1, #0x3f800000
16 = FADD.f32 u1, #0x40000000
17 = FADD.f32 u1, #0x40400000
18 = U32_TO_F32 13
19 = FMA.f32 18, #0x2edbe6ff, #0x0.neg
} -> block1
block1 {
20 = PHI 19, 175
21 = PHI 19, 174
22 = PHI 19, 173
23 = PHI 19, 172
24 = PHI u1, 179
25 = PHI 15, 178
26 = PHI 16, 177
27 = PHI 17, 176
28 = PHI #0x0, 158
29 = ICMP.s32.m1.ge 28, #0x20
BRANCHZ.i16.eq 29.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
235 = FMA.f32 20, 24, 20
234 = FMA.f32 21, 25, 21
233 = FMA.f32 22, 26, 22
232 = FMA.f32 23, 27, 23
231 = FMA.f32 235, 20, 235
230 = FMA.f32 234, 21, 234
229 = FMA.f32 233, 22, 233
228 = FMA.f32 232, 23, 232
227 = FMA.f32 231, 235, 231
226 = FMA.f32 230, 234, 230
225 = FMA.f32 229, 233, 229
224 = FMA.f32 228, 232, 228
223 = FMA.f32 227, 231, 227
222 = FMA.f32 226, 230, 226
221 = FMA.f32 225, 229, 225
220 = FMA.f32 224, 228, 224
219 = FMA.f32 223, 227, 223
218 = FMA.f32 222, 226, 222
217 = FMA.f32 221, 225, 221
216 = FMA.f32 220, 224, 220
215 = FMA.f32 219, 223, 219
214 = FMA.f32 218, 222, 218
213 = FMA.f32 217, 221, 217
212 = FMA.f32 216, 220, 216
211 = FMA.f32 215, 219, 215
210 = FMA.f32 214, 218, 214
209 = FMA.f32 213, 217, 213
208 = FMA.f32 212, 216, 212
207 = FMA.f32 211, 215, 211
206 = FMA.f32 210, 214, 210
205 = FMA.f32 209, 213, 209
204 = FMA.f32 208, 212, 208
203 = FMA.f32 207, 211, 207
202 = FMA.f32 206, 210, 206
201 = FMA.f32 205, 209, 205
200 = FMA.f32 204, 208, 204
199 = FMA.f32 203, 207, 203
198 = FMA.f32 202, 206, 202
197 = FMA.f32 201, 205, 201
196 = FMA.f32 200, 204, 200
195 = FMA.f32 199, 203, 199
194 = FMA.f32 198, 202, 198
193 = FMA.f32 197, 201, 197
192 = FMA.f32 196, 200, 196
191 = FMA.f32 195, 199, 195
190 = FMA.f32 194, 198, 194
189 = FMA.f32 193, 197, 193
188 = FMA.f32 192, 196, 192
187 = FMA.f32 191, 195, 191
186 = FMA.f32 190, 194, 190
185 = FMA.f32 189, 193, 189
184 = FMA.f32 188, 192, 188
183 = FMA.f32 187, 191, 187
182 = FMA.f32 186, 190, 186
181 = FMA.f32 185, 189, 185
180 = FMA.f32 184, 188, 184
179 = FMA.f32 183, 187, 183
178 = FMA.f32 182, 186, 182
177 = FMA.f32 181, 185, 181
176 = FMA.f32 180, 184, 180
175 = FMA.f32 179, 183, 179
174 = FMA.f32 178, 182, 178
173 = FMA.f32 177, 181, 177
172 = FMA.f32 176, 180, 176
158 = IADD.s32 28, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
160 = LSHIFT_OR.i32 13, #0x0, #0x2.b0
161 = FADD.f32 20, 21
162 = FADD.f32 161, 22
163 = FADD.f32 162, 23
167 = IADD.s32 u0, 160
169 = ICMP.u32.i1.lt 167, u0
170 = IADD.s32 169, u0[1]
STORE.i32 163, 167, 170, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x3f800000
r1 = FADD.f32 u1, r1
r2 = MOV.i32 #0x40000000
r2 = FADD.f32 u1, r2
r3 = MOV.i32 #0x40400000
r3 = FADD.f32 u1, r3
r4 = U32_TO_F32 r0
r4 = FMA.f32 r4, #0x2edbe6ff, #0x0.neg
r5 = MOV.i32 r4
r6 = MOV.i32 r4
r7 = MOV.i32 r4
r8 = MOV.i32 u1
r9 = MOV.i32 #0x0
} -> block1
block1 {
r10 = ICMP.s32.m1.ge r9, #0x20
BRANCHZ.i16.eq r10.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r8 = FMA.f32 r4, r8, r4
r1 = FMA.f32 r5, r1, r5
r2 = FMA.f32 r6, r2, r6
r3 = FMA.f32 r7, r3, r7
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r1, r5, r1
r6 = FMA.f32 r2, r6, r2
r7 = FMA.f32 r3, r7, r3
r9 = IADD.s32 r9, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = FADD.f32 r4, r5
r1 = FADD.f32 r1, r6
r1 = FADD.f32 r1, r7
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
800000000
id(0) nbb r_uncond
* NOP
+ _.h00 = MOV.i32 fau.x
* NOP
+ r1 = FADD.f32 fau.x, t1
* r2 = MOV.i32 fau.y
+ _.h00 = U32_TO_F32 r0
* r4 = FMA.f32 t1, fau.x, t.neg
+ _.h00 = MOV.i32 fau.y
* r2 = FADD.f32 fau.x, r2
+ r3 = FADD.f32 fau.x, t1
* r5 = MOV.i32 r4
+ r6 = MOV.i32 r4
* r7 = MOV.i32 r4
+ r8 = MOV.i32 fau.x
* NOP
+ r9 = MOV.i32 fau.x
404000002edbe6ff 400000003f800000
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r9, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000020
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ NOP
* r7 = FMA.f32 r3, r7, r3
+ NOP
id(0) nbb r_uncond no_prefetch pcrel(1)
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r1 = FMA.f32 r5, r1, r5
+ NOP
* r2 = FMA.f32 r6, r2, r6
+ NOP
* r3 = FMA.f32 r7, r3, r7
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r1, r5, r1
+ NOP
* r6 = FMA.f32 r2, r6, r2
+ r9 = IADD.s32 r9, fau.x
* r7 = FMA.f32 r3, r7, r3
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ NOP
* _.h00 = FADD.f32 r4, r5
+ r0 = IADD.s32 fau.x, t0
* _.h00 = FADD.f32 t0, r6
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* r1 = FADD.f32 t0, r7
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
}
clause_3:
ds(0) nbb r_uncond ncph
{
*NOP t0
+MOV.i32 t1, 0x3f800000 /* 1.000000 */
*NOP t0
+FADD.f32 r1:t1, u1.w0, t1
*MOV.i32 r2:t0, 0x40000000 /* 2.000000 */
+U32_TO_F32 t1, r0
*FMA.f32 r4:t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+MOV.i32 t1, 0x40400000 /* 3.000000 */
*FADD.f32 r2:t0, u1.w0, r2
+FADD.f32 r3:t1, u1.w0, t1
*MOV.i32 r5:t0, r4
+MOV.i32 r6:t1, r4
*MOV.i32 r7:t0, r4
+MOV.i32 r8:t1, u1.w0
*NOP t0
+MOV.i32 r9:t1, #0.x
}
clause_10:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r9, 0x00000020 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_15
}
clause_13:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_64
}
clause_15:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_21:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_27:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_33:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_39:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_45:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_51:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+NOP t1
*FMA.f32 r7:t0, r3, r7, r3
+NOP t1
}
clause_57:
ds(0) nbb r_uncond
{
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r1:t0, r5, r1, r5
+NOP t1
*FMA.f32 r2:t0, r6, r2, r6
+NOP t1
*FMA.f32 r3:t0, r7, r3, r7
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r1, r5, r1
+NOP t1
*FMA.f32 r6:t0, r2, r6, r2
+IADD.s32 r9:t1, r9, 0x00000001 /* 0.000000 */
*FMA.f32 r7:t0, r3, r7, r3
+JUMP t1, clause_10
}
clause_64:
ds(0) eos store
{
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+NOP t1
*FADD.f32 t0, r4, r5
+IADD.s32 r0:t1, u0.w0, t0
*FADD.f32 t0, t0, r6
+ICMP.u32.gt t1, u0.w0, t1
*FADD.f32 r1:t0, t0, r7
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
d1ede77f compute_sp_v4_float 24.112 GFLOPs 11.133ms
compute shader ----------
#define KERNEL compute_sp_v8
#define LOCAL_SIZE_X 256
#define DATATYPE float
#line 64
// Avoid auto-vectorization by using vector-width-locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 162
void compute_sp_v8()
{
    uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
    vec8 x = VEC8(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)), (_A+DATATYPE(4)), (_A+DATATYPE(5)), (_A+DATATYPE(6)), (_A+DATATYPE(7)));
    vec8 y = VEC8_S(DATATYPE(float(id) * SCALE));
#undef mad
#define mad mad8
    for(int i=0; i<16; i++)
    {
        MAD_16(x, y);
    }
    vec4 s = y.d0 + y.d1;
    vec2 t = s.xy + s.zw;
    ptr[id] = t.x + t.y;
}
void main() { compute_sp_v8(); }
----------
shader: MESA_SHADER_COMPUTE
source_sha1: {0xe904ad44, 0x6f83e415, 0x6ab7fd0e, 0x33347baa, 0x854f8dc7}
name: GLSL8
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp float[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp float _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_11 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_12 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_13 = ishl ssa_11.y, ssa_12
vec1 32 ssa_14 = iadd ssa_11.x, ssa_13
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_448 = insert_u16 ssa_11.z, ssa_1
vec1 32 ssa_16 = iadd ssa_14, ssa_448
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_17 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_2 = load_const (0x3f800000 = 1.000000)
vec1 32 ssa_18 = fadd ssa_17, ssa_2
vec1 32 ssa_3 = load_const (0x40000000 = 2.000000)
vec1 32 ssa_19 = fadd ssa_17, ssa_3
vec1 32 ssa_4 = load_const (0x40400000 = 3.000000)
vec1 32 ssa_20 = fadd ssa_17, ssa_4
vec1 32 ssa_5 = load_const (0x40800000 = 4.000000)
vec1 32 ssa_21 = fadd ssa_17, ssa_5
vec1 32 ssa_6 = load_const (0x40a00000 = 5.000000)
vec1 32 ssa_22 = fadd ssa_17, ssa_6
vec1 32 ssa_7 = load_const (0x40c00000 = 6.000000)
vec1 32 ssa_23 = fadd ssa_17, ssa_7
vec1 32 ssa_8 = load_const (0x40e00000 = 7.000000)
vec1 32 ssa_24 = fadd ssa_17, ssa_8
vec1 32 ssa_25 = u2f32 ssa_16
vec1 32 ssa_9 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_26 = fmul ssa_25, ssa_9
vec1 32 ssa_10 = load_const (0x00000010 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_27 = phi block_0: ssa_26, block_4: ssa_322
vec1 32 ssa_28 = phi block_0: ssa_26, block_4: ssa_321
vec1 32 ssa_29 = phi block_0: ssa_26, block_4: ssa_320
vec1 32 ssa_30 = phi block_0: ssa_26, block_4: ssa_319
vec1 32 ssa_31 = phi block_0: ssa_26, block_4: ssa_326
vec1 32 ssa_32 = phi block_0: ssa_26, block_4: ssa_325
vec1 32 ssa_33 = phi block_0: ssa_26, block_4: ssa_324
vec1 32 ssa_34 = phi block_0: ssa_26, block_4: ssa_323
vec1 32 ssa_35 = phi block_0: ssa_21, block_4: ssa_330
vec1 32 ssa_36 = phi block_0: ssa_22, block_4: ssa_329
vec1 32 ssa_37 = phi block_0: ssa_23, block_4: ssa_328
vec1 32 ssa_38 = phi block_0: ssa_24, block_4: ssa_327
vec1 32 ssa_39 = phi block_0: ssa_17, block_4: ssa_334
vec1 32 ssa_40 = phi block_0: ssa_18, block_4: ssa_333
vec1 32 ssa_41 = phi block_0: ssa_19, block_4: ssa_332
vec1 32 ssa_42 = phi block_0: ssa_20, block_4: ssa_331
vec1 32 ssa_43 = phi block_0: ssa_0, block_4: ssa_301
vec1 32 ssa_44 = ige32 ssa_43, ssa_10
/* succs: block_2 block_3 */
if ssa_44 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_446 = ffma ssa_31, ssa_39, ssa_31
vec1 32 ssa_445 = ffma ssa_32, ssa_40, ssa_32
vec1 32 ssa_444 = ffma ssa_33, ssa_41, ssa_33
vec1 32 ssa_443 = ffma ssa_34, ssa_42, ssa_34
vec1 32 ssa_442 = ffma ssa_27, ssa_35, ssa_27
vec1 32 ssa_441 = ffma ssa_28, ssa_36, ssa_28
vec1 32 ssa_440 = ffma ssa_29, ssa_37, ssa_29
vec1 32 ssa_439 = ffma ssa_30, ssa_38, ssa_30
vec1 32 ssa_438 = ffma ssa_446, ssa_31, ssa_446
vec1 32 ssa_437 = ffma ssa_445, ssa_32, ssa_445
vec1 32 ssa_436 = ffma ssa_444, ssa_33, ssa_444
vec1 32 ssa_435 = ffma ssa_443, ssa_34, ssa_443
vec1 32 ssa_434 = ffma ssa_442, ssa_27, ssa_442
vec1 32 ssa_433 = ffma ssa_441, ssa_28, ssa_441
vec1 32 ssa_432 = ffma ssa_440, ssa_29, ssa_440
vec1 32 ssa_431 = ffma ssa_439, ssa_30, ssa_439
vec1 32 ssa_430 = ffma ssa_438, ssa_446, ssa_438
vec1 32 ssa_429 = ffma ssa_437, ssa_445, ssa_437
vec1 32 ssa_428 = ffma ssa_436, ssa_444, ssa_436
vec1 32 ssa_427 = ffma ssa_435, ssa_443, ssa_435
vec1 32 ssa_426 = ffma ssa_434, ssa_442, ssa_434
vec1 32 ssa_425 = ffma ssa_433, ssa_441, ssa_433
vec1 32 ssa_424 = ffma ssa_432, ssa_440, ssa_432
vec1 32 ssa_423 = ffma ssa_431, ssa_439, ssa_431
vec1 32 ssa_422 = ffma ssa_430, ssa_438, ssa_430
vec1 32 ssa_421 = ffma ssa_429, ssa_437, ssa_429
vec1 32 ssa_420 = ffma ssa_428, ssa_436, ssa_428
vec1 32 ssa_419 = ffma ssa_427, ssa_435, ssa_427
vec1 32 ssa_418 = ffma ssa_426, ssa_434, ssa_426
vec1 32 ssa_417 = ffma ssa_425, ssa_433, ssa_425
vec1 32 ssa_416 = ffma ssa_424, ssa_432, ssa_424
vec1 32 ssa_415 = ffma ssa_423, ssa_431, ssa_423
vec1 32 ssa_414 = ffma ssa_422, ssa_430, ssa_422
vec1 32 ssa_413 = ffma ssa_421, ssa_429, ssa_421
vec1 32 ssa_412 = ffma ssa_420, ssa_428, ssa_420
vec1 32 ssa_411 = ffma ssa_419, ssa_427, ssa_419
vec1 32 ssa_410 = ffma ssa_418, ssa_426, ssa_418
vec1 32 ssa_409 = ffma ssa_417, ssa_425, ssa_417
vec1 32 ssa_408 = ffma ssa_416, ssa_424, ssa_416
vec1 32 ssa_407 = ffma ssa_415, ssa_423, ssa_415
vec1 32 ssa_406 = ffma ssa_414, ssa_422, ssa_414
vec1 32 ssa_405 = ffma ssa_413, ssa_421, ssa_413
vec1 32 ssa_404 = ffma ssa_412, ssa_420, ssa_412
vec1 32 ssa_403 = ffma ssa_411, ssa_419, ssa_411
vec1 32 ssa_402 = ffma ssa_410, ssa_418, ssa_410
vec1 32 ssa_401 = ffma ssa_409, ssa_417, ssa_409
vec1 32 ssa_400 = ffma ssa_408, ssa_416, ssa_408
vec1 32 ssa_399 = ffma ssa_407, ssa_415, ssa_407
vec1 32 ssa_398 = ffma ssa_406, ssa_414, ssa_406
vec1 32 ssa_397 = ffma ssa_405, ssa_413, ssa_405
vec1 32 ssa_396 = ffma ssa_404, ssa_412, ssa_404
vec1 32 ssa_395 = ffma ssa_403, ssa_411, ssa_403
vec1 32 ssa_394 = ffma ssa_402, ssa_410, ssa_402
vec1 32 ssa_393 = ffma ssa_401, ssa_409, ssa_401
vec1 32 ssa_392 = ffma ssa_400, ssa_408, ssa_400
vec1 32 ssa_391 = ffma ssa_399, ssa_407, ssa_399
vec1 32 ssa_390 = ffma ssa_398, ssa_406, ssa_398
vec1 32 ssa_389 = ffma ssa_397, ssa_405, ssa_397
vec1 32 ssa_388 = ffma ssa_396, ssa_404, ssa_396
vec1 32 ssa_387 = ffma ssa_395, ssa_403, ssa_395
vec1 32 ssa_386 = ffma ssa_394, ssa_402, ssa_394
vec1 32 ssa_385 = ffma ssa_393, ssa_401, ssa_393
vec1 32 ssa_384 = ffma ssa_392, ssa_400, ssa_392
vec1 32 ssa_383 = ffma ssa_391, ssa_399, ssa_391
vec1 32 ssa_382 = ffma ssa_390, ssa_398, ssa_390
vec1 32 ssa_381 = ffma ssa_389, ssa_397, ssa_389
vec1 32 ssa_380 = ffma ssa_388, ssa_396, ssa_388
vec1 32 ssa_379 = ffma ssa_387, ssa_395, ssa_387
vec1 32 ssa_378 = ffma ssa_386, ssa_394, ssa_386
vec1 32 ssa_377 = ffma ssa_385, ssa_393, ssa_385
vec1 32 ssa_376 = ffma ssa_384, ssa_392, ssa_384
vec1 32 ssa_375 = ffma ssa_383, ssa_391, ssa_383
vec1 32 ssa_374 = ffma ssa_382, ssa_390, ssa_382
vec1 32 ssa_373 = ffma ssa_381, ssa_389, ssa_381
vec1 32 ssa_372 = ffma ssa_380, ssa_388, ssa_380
vec1 32 ssa_371 = ffma ssa_379, ssa_387, ssa_379
vec1 32 ssa_370 = ffma ssa_378, ssa_386, ssa_378
vec1 32 ssa_369 = ffma ssa_377, ssa_385, ssa_377
vec1 32 ssa_368 = ffma ssa_376, ssa_384, ssa_376
vec1 32 ssa_367 = ffma ssa_375, ssa_383, ssa_375
vec1 32 ssa_366 = ffma ssa_374, ssa_382, ssa_374
vec1 32 ssa_365 = ffma ssa_373, ssa_381, ssa_373
vec1 32 ssa_364 = ffma ssa_372, ssa_380, ssa_372
vec1 32 ssa_363 = ffma ssa_371, ssa_379, ssa_371
vec1 32 ssa_362 = ffma ssa_370, ssa_378, ssa_370
vec1 32 ssa_361 = ffma ssa_369, ssa_377, ssa_369
vec1 32 ssa_360 = ffma ssa_368, ssa_376, ssa_368
vec1 32 ssa_359 = ffma ssa_367, ssa_375, ssa_367
vec1 32 ssa_358 = ffma ssa_366, ssa_374, ssa_366
vec1 32 ssa_357 = ffma ssa_365, ssa_373, ssa_365
vec1 32 ssa_356 = ffma ssa_364, ssa_372, ssa_364
vec1 32 ssa_355 = ffma ssa_363, ssa_371, ssa_363
vec1 32 ssa_354 = ffma ssa_362, ssa_370, ssa_362
vec1 32 ssa_353 = ffma ssa_361, ssa_369, ssa_361
vec1 32 ssa_352 = ffma ssa_360, ssa_368, ssa_360
vec1 32 ssa_351 = ffma ssa_359, ssa_367, ssa_359
vec1 32 ssa_350 = ffma ssa_358, ssa_366, ssa_358
vec1 32 ssa_349 = ffma ssa_357, ssa_365, ssa_357
vec1 32 ssa_348 = ffma ssa_356, ssa_364, ssa_356
vec1 32 ssa_347 = ffma ssa_355, ssa_363, ssa_355
vec1 32 ssa_346 = ffma ssa_354, ssa_362, ssa_354
vec1 32 ssa_345 = ffma ssa_353, ssa_361, ssa_353
vec1 32 ssa_344 = ffma ssa_352, ssa_360, ssa_352
vec1 32 ssa_343 = ffma ssa_351, ssa_359, ssa_351
vec1 32 ssa_342 = ffma ssa_350, ssa_358, ssa_350
vec1 32 ssa_341 = ffma ssa_349, ssa_357, ssa_349
vec1 32 ssa_340 = ffma ssa_348, ssa_356, ssa_348
vec1 32 ssa_339 = ffma ssa_347, ssa_355, ssa_347
vec1 32 ssa_338 = ffma ssa_346, ssa_354, ssa_346
vec1 32 ssa_337 = ffma ssa_345, ssa_353, ssa_345
vec1 32 ssa_336 = ffma ssa_344, ssa_352, ssa_344
vec1 32 ssa_335 = ffma ssa_343, ssa_351, ssa_343
vec1 32 ssa_334 = ffma ssa_342, ssa_350, ssa_342
vec1 32 ssa_333 = ffma ssa_341, ssa_349, ssa_341
vec1 32 ssa_332 = ffma ssa_340, ssa_348, ssa_340
vec1 32 ssa_331 = ffma ssa_339, ssa_347, ssa_339
vec1 32 ssa_330 = ffma ssa_338, ssa_346, ssa_338
vec1 32 ssa_329 = ffma ssa_337, ssa_345, ssa_337
vec1 32 ssa_328 = ffma ssa_336, ssa_344, ssa_336
vec1 32 ssa_327 = ffma ssa_335, ssa_343, ssa_335
vec1 32 ssa_326 = ffma ssa_334, ssa_342, ssa_334
vec1 32 ssa_325 = ffma ssa_333, ssa_341, ssa_333
vec1 32 ssa_324 = ffma ssa_332, ssa_340, ssa_332
vec1 32 ssa_323 = ffma ssa_331, ssa_339, ssa_331
vec1 32 ssa_322 = ffma ssa_330, ssa_338, ssa_330
vec1 32 ssa_321 = ffma ssa_329, ssa_337, ssa_329
vec1 32 ssa_320 = ffma ssa_328, ssa_336, ssa_328
vec1 32 ssa_319 = ffma ssa_327, ssa_335, ssa_327
vec1 32 ssa_301 = iadd ssa_43, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_302 = fadd ssa_31, ssa_27
vec1 32 ssa_303 = fadd ssa_32, ssa_28
vec1 32 ssa_304 = fadd ssa_33, ssa_29
vec1 32 ssa_305 = fadd ssa_34, ssa_30
vec1 32 ssa_306 = fadd ssa_302, ssa_304
vec1 32 ssa_307 = fadd ssa_303, ssa_305
vec1 32 ssa_308 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_309 = ishl ssa_16, ssa_308
vec1 32 ssa_310 = fadd ssa_306, ssa_307
vec1 64 ssa_311 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_312 = unpack_64_2x32_split_x ssa_311
vec1 32 ssa_313 = unpack_64_2x32_split_y ssa_311
vec1 32 ssa_314 = iadd ssa_312, ssa_309
vec1 32 ssa_315 = ult32 ssa_314, ssa_312
vec1 32 ssa_316 = b2i32 ssa_315
vec1 32 ssa_317 = iadd ssa_316, ssa_313
vec1 64 ssa_318 = pack_64_2x32_split ssa_314, ssa_317
intrinsic store_global (ssa_310, ssa_318) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
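(Note, not part of the dump: the tail of the NIR above, from ssa_308 through the store_global, is the 64-bit SSBO address computation for the final ptr[id] write. A hedged GLSL-style sketch of the equivalent arithmetic follows; base_lo, base_hi, off, lo, hi and carry are hypothetical names used only for illustration, while in the packed ISA further down the buffer base lives in u0.w0/u0.w1.)

    uint off   = id << 2u;                  // ishl: byte offset = id * 4 (sizeof(float))
    uint lo    = base_lo + off;             // iadd on the low 32 bits of the buffer address
    uint carry = (lo < base_lo) ? 1u : 0u;  // ult32 + b2i32: carry out of the low-word add
    uint hi    = base_hi + carry;           // iadd: propagate the carry into the high word
    // pack_64_2x32_split + store_global then write the result to the 64-bit address {hi, lo}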
block0 {
451 = MOV.i32 r62
450 = MOV.i32 r61
449 = MOV.i32 r60
13 = LSHIFT_OR.i32 450, #0x0, #0x8.b0
14 = IADD.s32 449, 13
448 = MKVEC.v2i16 #0x0.h00, 451.h00
16 = IADD.s32 14, 448
18 = FADD.f32 u1, #0x3f800000
19 = FADD.f32 u1, #0x40000000
20 = FADD.f32 u1, #0x40400000
21 = FADD.f32 u1, #0x40800000
22 = FADD.f32 u1, #0x40a00000
23 = FADD.f32 u1, #0x40c00000
24 = FADD.f32 u1, #0x40e00000
25 = U32_TO_F32 16
26 = FMA.f32 25, #0x2edbe6ff, #0x0.neg
} -> block1
block1 {
27 = PHI 26, 322
28 = PHI 26, 321
29 = PHI 26, 320
30 = PHI 26, 319
31 = PHI 26, 326
32 = PHI 26, 325
33 = PHI 26, 324
34 = PHI 26, 323
35 = PHI 21, 330
36 = PHI 22, 329
37 = PHI 23, 328
38 = PHI 24, 327
39 = PHI u1, 334
40 = PHI 18, 333
41 = PHI 19, 332
42 = PHI 20, 331
43 = PHI #0x0, 301
44 = ICMP.s32.m1.ge 43, #0x10
BRANCHZ.i16.eq 44.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
446 = FMA.f32 31, 39, 31
445 = FMA.f32 32, 40, 32
444 = FMA.f32 33, 41, 33
443 = FMA.f32 34, 42, 34
442 = FMA.f32 27, 35, 27
441 = FMA.f32 28, 36, 28
440 = FMA.f32 29, 37, 29
439 = FMA.f32 30, 38, 30
438 = FMA.f32 446, 31, 446
437 = FMA.f32 445, 32, 445
436 = FMA.f32 444, 33, 444
435 = FMA.f32 443, 34, 443
434 = FMA.f32 442, 27, 442
433 = FMA.f32 441, 28, 441
432 = FMA.f32 440, 29, 440
431 = FMA.f32 439, 30, 439
430 = FMA.f32 438, 446, 438
429 = FMA.f32 437, 445, 437
428 = FMA.f32 436, 444, 436
427 = FMA.f32 435, 443, 435
426 = FMA.f32 434, 442, 434
425 = FMA.f32 433, 441, 433
424 = FMA.f32 432, 440, 432
423 = FMA.f32 431, 439, 431
422 = FMA.f32 430, 438, 430
421 = FMA.f32 429, 437, 429
420 = FMA.f32 428, 436, 428
419 = FMA.f32 427, 435, 427
418 = FMA.f32 426, 434, 426
417 = FMA.f32 425, 433, 425
416 = FMA.f32 424, 432, 424
415 = FMA.f32 423, 431, 423
414 = FMA.f32 422, 430, 422
413 = FMA.f32 421, 429, 421
412 = FMA.f32 420, 428, 420
411 = FMA.f32 419, 427, 419
410 = FMA.f32 418, 426, 418
409 = FMA.f32 417, 425, 417
408 = FMA.f32 416, 424, 416
407 = FMA.f32 415, 423, 415
406 = FMA.f32 414, 422, 414
405 = FMA.f32 413, 421, 413
404 = FMA.f32 412, 420, 412
403 = FMA.f32 411, 419, 411
402 = FMA.f32 410, 418, 410
401 = FMA.f32 409, 417, 409
400 = FMA.f32 408, 416, 408
399 = FMA.f32 407, 415, 407
398 = FMA.f32 406, 414, 406
397 = FMA.f32 405, 413, 405
396 = FMA.f32 404, 412, 404
395 = FMA.f32 403, 411, 403
394 = FMA.f32 402, 410, 402
393 = FMA.f32 401, 409, 401
392 = FMA.f32 400, 408, 400
391 = FMA.f32 399, 407, 399
390 = FMA.f32 398, 406, 398
389 = FMA.f32 397, 405, 397
388 = FMA.f32 396, 404, 396
387 = FMA.f32 395, 403, 395
386 = FMA.f32 394, 402, 394
385 = FMA.f32 393, 401, 393
384 = FMA.f32 392, 400, 392
383 = FMA.f32 391, 399, 391
382 = FMA.f32 390, 398, 390
381 = FMA.f32 389, 397, 389
380 = FMA.f32 388, 396, 388
379 = FMA.f32 387, 395, 387
378 = FMA.f32 386, 394, 386
377 = FMA.f32 385, 393, 385
376 = FMA.f32 384, 392, 384
375 = FMA.f32 383, 391, 383
374 = FMA.f32 382, 390, 382
373 = FMA.f32 381, 389, 381
372 = FMA.f32 380, 388, 380
371 = FMA.f32 379, 387, 379
370 = FMA.f32 378, 386, 378
369 = FMA.f32 377, 385, 377
368 = FMA.f32 376, 384, 376
367 = FMA.f32 375, 383, 375
366 = FMA.f32 374, 382, 374
365 = FMA.f32 373, 381, 373
364 = FMA.f32 372, 380, 372
363 = FMA.f32 371, 379, 371
362 = FMA.f32 370, 378, 370
361 = FMA.f32 369, 377, 369
360 = FMA.f32 368, 376, 368
359 = FMA.f32 367, 375, 367
358 = FMA.f32 366, 374, 366
357 = FMA.f32 365, 373, 365
356 = FMA.f32 364, 372, 364
355 = FMA.f32 363, 371, 363
354 = FMA.f32 362, 370, 362
353 = FMA.f32 361, 369, 361
352 = FMA.f32 360, 368, 360
351 = FMA.f32 359, 367, 359
350 = FMA.f32 358, 366, 358
349 = FMA.f32 357, 365, 357
348 = FMA.f32 356, 364, 356
347 = FMA.f32 355, 363, 355
346 = FMA.f32 354, 362, 354
345 = FMA.f32 353, 361, 353
344 = FMA.f32 352, 360, 352
343 = FMA.f32 351, 359, 351
342 = FMA.f32 350, 358, 350
341 = FMA.f32 349, 357, 349
340 = FMA.f32 348, 356, 348
339 = FMA.f32 347, 355, 347
338 = FMA.f32 346, 354, 346
337 = FMA.f32 345, 353, 345
336 = FMA.f32 344, 352, 344
335 = FMA.f32 343, 351, 343
334 = FMA.f32 342, 350, 342
333 = FMA.f32 341, 349, 341
332 = FMA.f32 340, 348, 340
331 = FMA.f32 339, 347, 339
330 = FMA.f32 338, 346, 338
329 = FMA.f32 337, 345, 337
328 = FMA.f32 336, 344, 336
327 = FMA.f32 335, 343, 335
326 = FMA.f32 334, 342, 334
325 = FMA.f32 333, 341, 333
324 = FMA.f32 332, 340, 332
323 = FMA.f32 331, 339, 331
322 = FMA.f32 330, 338, 330
321 = FMA.f32 329, 337, 329
320 = FMA.f32 328, 336, 328
319 = FMA.f32 327, 335, 327
301 = IADD.s32 43, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
302 = FADD.f32 31, 27
303 = FADD.f32 32, 28
304 = FADD.f32 33, 29
305 = FADD.f32 34, 30
306 = FADD.f32 302, 304
307 = FADD.f32 303, 305
309 = LSHIFT_OR.i32 16, #0x0, #0x2.b0
310 = FADD.f32 306, 307
314 = IADD.s32 u0, 309
316 = ICMP.u32.i1.lt 314, u0
317 = IADD.s32 316, u0[1]
STORE.i32 310, 314, 317, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x3f800000
r1 = FADD.f32 u1, r1
r2 = MOV.i32 #0x40000000
r2 = FADD.f32 u1, r2
r3 = MOV.i32 #0x40400000
r3 = FADD.f32 u1, r3
r4 = MOV.i32 #0x40800000
r4 = FADD.f32 u1, r4
r5 = MOV.i32 #0x40a00000
r5 = FADD.f32 u1, r5
r6 = MOV.i32 #0x40c00000
r6 = FADD.f32 u1, r6
r7 = MOV.i32 #0x40e00000
r7 = FADD.f32 u1, r7
r8 = U32_TO_F32 r0
r8 = FMA.f32 r8, #0x2edbe6ff, #0x0.neg
r9 = MOV.i32 r8
r10 = MOV.i32 r8
r11 = MOV.i32 r8
r12 = MOV.i32 r8
r13 = MOV.i32 r8
r14 = MOV.i32 r8
r15 = MOV.i32 r8
r48 = MOV.i32 u1
r49 = MOV.i32 #0x0
} -> block1
block1 {
r50 = ICMP.s32.m1.ge r49, #0x10
BRANCHZ.i16.eq r50.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r48 = FMA.f32 r12, r48, r12
r1 = FMA.f32 r13, r1, r13
r2 = FMA.f32 r14, r2, r14
r3 = FMA.f32 r15, r3, r15
r4 = FMA.f32 r8, r4, r8
r5 = FMA.f32 r9, r5, r9
r6 = FMA.f32 r10, r6, r10
r7 = FMA.f32 r11, r7, r11
r12 = FMA.f32 r48, r12, r48
r13 = FMA.f32 r1, r13, r1
r14 = FMA.f32 r2, r14, r2
r15 = FMA.f32 r3, r15, r3
r8 = FMA.f32 r4, r8, r4
r9 = FMA.f32 r5, r9, r5
r10 = FMA.f32 r6, r10, r6
r11 = FMA.f32 r7, r11, r7
r49 = IADD.s32 r49, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r1 = FADD.f32 r12, r8
r2 = FADD.f32 r13, r9
r3 = FADD.f32 r14, r10
r4 = FADD.f32 r15, r11
r1 = FADD.f32 r1, r3
r2 = FADD.f32 r2, r4
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = FADD.f32 r1, r2
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.x.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r1 = FADD.f32 fau.x, t0
+ r2 = FADD.f32 fau.x, t1
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r3 = FADD.f32 fau.x, t0
+ r4 = FADD.f32 fau.x, t1
* NOP
+ _.h00 = MOV.i32 fau.y
* NOP
+ r5 = FADD.f32 fau.x, t1
400000003f800000 4080000040400000 40a0000000000008
id(0) nbb r_uncond
* r6 = MOV.i32 fau.y
+ _.h00 = U32_TO_F32 r0
* r8 = FMA.f32 t1, fau.x, t.neg
+ _.h00 = MOV.i32 fau.y
* r6 = FADD.f32 fau.x, r6
+ r7 = FADD.f32 fau.x, t1
* r9 = MOV.i32 r8
+ r10 = MOV.i32 r8
* r11 = MOV.i32 r8
+ r12 = MOV.i32 r8
* r13 = MOV.i32 r8
+ r14 = MOV.i32 r8
* r15 = MOV.i32 r8
+ r48 = MOV.i32 fau.x
* NOP
+ r49 = MOV.i32 fau.x
40e000002edbe6ff 40c0000000000000
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r49, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000010
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ NOP
* r11 = FMA.f32 r7, r11, r7
+ NOP
id(0) nbb
* r48 = FMA.f32 r12, r48, r12
+ NOP
* r1 = FMA.f32 r13, r1, r13
+ NOP
* r2 = FMA.f32 r14, r2, r14
+ NOP
* r3 = FMA.f32 r15, r3, r15
+ NOP
* r4 = FMA.f32 r8, r4, r8
+ NOP
* r5 = FMA.f32 r9, r5, r9
+ NOP
* r6 = FMA.f32 r10, r6, r10
+ NOP
* r7 = FMA.f32 r11, r7, r11
+ NOP
id(0) nbb r_uncond no_prefetch pcrel(1)
* r12 = FMA.f32 r48, r12, r48
+ NOP
* r13 = FMA.f32 r1, r13, r1
+ NOP
* r14 = FMA.f32 r2, r14, r2
+ NOP
* r15 = FMA.f32 r3, r15, r3
+ NOP
* r8 = FMA.f32 r4, r8, r4
+ NOP
* r9 = FMA.f32 r5, r9, r5
+ NOP
* r10 = FMA.f32 r6, r10, r6
+ r49 = IADD.s32 r49, fau.x
* r11 = FMA.f32 r7, r11, r7
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) nbb
* NOP
+ r1 = FADD.f32 r12, r8
id(0) wait(0 ) nbb r_uncond
* NOP
+ r2 = FADD.f32 r13, r9
* NOP
+ r3 = FADD.f32 r14, r10
* NOP
+ r4 = FADD.f32 r15, r11
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = FADD.f32 r1, r3
* _.h00 = FADD.f32 r2, r4
+ r0 = IADD.s32 fau.x, t0
* r1 = FADD.f32 r1, t0
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*MOV.i32 t0, 0x3f800000 /* 1.000000 */
+MOV.i32 t1, 0x40000000 /* 2.000000 */
*FADD.f32 r1:t0, u1.w0, t0
+FADD.f32 r2:t1, u1.w0, t1
*MOV.i32 t0, 0x40400000 /* 3.000000 */
+MOV.i32 t1, 0x40800000 /* 4.000000 */
*FADD.f32 r3:t0, u1.w0, t0
+FADD.f32 r4:t1, u1.w0, t1
*NOP t0
+MOV.i32 t1, 0x40a00000 /* 5.000000 */
*NOP t0
+FADD.f32 r5:t1, u1.w0, t1
}
clause_7:
ds(0) nbb r_uncond ncph
{
*MOV.i32 r6:t0, 0x40c00000 /* 6.000000 */
+U32_TO_F32 t1, r0
*FMA.f32 r8:t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+MOV.i32 t1, 0x40e00000 /* 7.000000 */
*FADD.f32 r6:t0, u1.w0, r6
+FADD.f32 r7:t1, u1.w0, t1
*MOV.i32 r9:t0, r8
+MOV.i32 r10:t1, r8
*MOV.i32 r11:t0, r8
+MOV.i32 r12:t1, r8
*MOV.i32 r13:t0, r8
+MOV.i32 r14:t1, r8
*MOV.i32 r15:t0, r8
+MOV.i32 r48:t1, u1.w0
*NOP t0
+MOV.i32 r49:t1, #0.x
}
clause_14:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r49, 0x00000010 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_19
}
clause_17:
ds(0) nbb
{
*NOP t0
+JUMP t1, clause_116
}
clause_19:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_25:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_31:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_37:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_43:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_49:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_55:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_61:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_67:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_73:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_79:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_85:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_91:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_97:
ds(0) nbb ncph
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+NOP t1
*FMA.f32 r11:t0, r7, r11, r7
+NOP t1
}
clause_103:
ds(0) nbb ncph
{
*FMA.f32 r48:t0, r12, r48, r12
+NOP t1
*FMA.f32 r1:t0, r13, r1, r13
+NOP t1
*FMA.f32 r2:t0, r14, r2, r14
+NOP t1
*FMA.f32 r3:t0, r15, r3, r15
+NOP t1
*FMA.f32 r4:t0, r8, r4, r8
+NOP t1
*FMA.f32 r5:t0, r9, r5, r9
+NOP t1
*FMA.f32 r6:t0, r10, r6, r10
+NOP t1
*FMA.f32 r7:t0, r11, r7, r11
+NOP t1
}
clause_109:
ds(0) nbb r_uncond
{
*FMA.f32 r12:t0, r48, r12, r48
+NOP t1
*FMA.f32 r13:t0, r1, r13, r1
+NOP t1
*FMA.f32 r14:t0, r2, r14, r2
+NOP t1
*FMA.f32 r15:t0, r3, r15, r3
+NOP t1
*FMA.f32 r8:t0, r4, r8, r4
+NOP t1
*FMA.f32 r9:t0, r5, r9, r5
+NOP t1
*FMA.f32 r10:t0, r6, r10, r6
+IADD.s32 r49:t1, r49, 0x00000001 /* 0.000000 */
*FMA.f32 r11:t0, r7, r11, r7
+JUMP t1, clause_14
}
clause_116:
ds(0) nbb ncph next_store dwb(0)
{
*NOP t0
+FADD.f32 r1:t1, r12, r8
}
clause_117:
ds(0) eos store
{
*NOP t0
+FADD.f32 r2:t1, r13, r9
*NOP t0
+FADD.f32 r3:t1, r14, r10
*NOP t0
+FADD.f32 r4:t1, r15, r11
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+FADD.f32 r1:t1, r1, r3
*FADD.f32 t0, r2, r4
+IADD.s32 r0:t1, u0.w0, t0
*FADD.f32 r1:t0, r1, t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
8a550d7d compute_sp_v8_float 24.268 GFLOPs 11.061ms
compute shader ----------
#define KERNEL compute_sp_v16
#define LOCAL_SIZE_X 256
#define DATATYPE float
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 184
void compute_sp_v16()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
vec16 x = VEC16(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)), (_A+DATATYPE(4)), (_A+DATATYPE(5)), (_A+DATATYPE(6)), (_A+DATATYPE(7)),
(_A+DATATYPE(8)), (_A+DATATYPE(9)), (_A+DATATYPE(10)), (_A+DATATYPE(11)), (_A+DATATYPE(12)), (_A+DATATYPE(13)), (_A+DATATYPE(14)), (_A+DATATYPE(15)));
vec16 y = VEC16_S(DATATYPE((float(id) * SCALE)));
#undef mad
#define mad mad16
for(int i=0; i<8; i++)
{
MAD_16(x, y);
}
vec8 u = VEC8_ADD(y.d0, y.d1);
vec4 s = u.d0 + u.d1;
vec2 t = s.xy + s.zw;
ptr[id] = t.x + t.y;
}
void main() {compute_sp_v16();}
----------
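(Note, not part of the dump: a quick work count. The v8 kernel executes 16 loop iterations of MAD_16 on vec8 operands, i.e. 16 * 16 * 8 = 2048 fused multiply-adds per invocation, and the v16 kernel above executes 8 * 16 * 16 = 2048 as well, so both issue the same 4096 FLOPs of loop work per invocation; the variants differ only in how wide each dependent step is and in the length of the final reduction.)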
shader: MESA_SHADER_COMPUTE
source_sha1: {0x31891d4a, 0x859f0d4a, 0x14e16879, 0xa0b69f0b, 0x0d3a19ec}
name: GLSL10
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp float[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp float _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_19 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_18 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_20 = ishl ssa_19.y, ssa_18
vec1 32 ssa_21 = iadd ssa_19.x, ssa_20
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_872 = insert_u16 ssa_19.z, ssa_1
vec1 32 ssa_24 = iadd ssa_21, ssa_872
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_25 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_2 = load_const (0x3f800000 = 1.000000)
vec1 32 ssa_26 = fadd ssa_25, ssa_2
vec1 32 ssa_3 = load_const (0x40000000 = 2.000000)
vec1 32 ssa_27 = fadd ssa_25, ssa_3
vec1 32 ssa_4 = load_const (0x40400000 = 3.000000)
vec1 32 ssa_28 = fadd ssa_25, ssa_4
vec1 32 ssa_5 = load_const (0x40800000 = 4.000000)
vec1 32 ssa_29 = fadd ssa_25, ssa_5
vec1 32 ssa_6 = load_const (0x40a00000 = 5.000000)
vec1 32 ssa_30 = fadd ssa_25, ssa_6
vec1 32 ssa_7 = load_const (0x40c00000 = 6.000000)
vec1 32 ssa_31 = fadd ssa_25, ssa_7
vec1 32 ssa_8 = load_const (0x40e00000 = 7.000000)
vec1 32 ssa_32 = fadd ssa_25, ssa_8
vec1 32 ssa_9 = load_const (0x41000000 = 8.000000)
vec1 32 ssa_33 = fadd ssa_25, ssa_9
vec1 32 ssa_10 = load_const (0x41100000 = 9.000000)
vec1 32 ssa_34 = fadd ssa_25, ssa_10
vec1 32 ssa_11 = load_const (0x41200000 = 10.000000)
vec1 32 ssa_35 = fadd ssa_25, ssa_11
vec1 32 ssa_12 = load_const (0x41300000 = 11.000000)
vec1 32 ssa_36 = fadd ssa_25, ssa_12
vec1 32 ssa_13 = load_const (0x41400000 = 12.000000)
vec1 32 ssa_37 = fadd ssa_25, ssa_13
vec1 32 ssa_14 = load_const (0x41500000 = 13.000000)
vec1 32 ssa_38 = fadd ssa_25, ssa_14
vec1 32 ssa_15 = load_const (0x41600000 = 14.000000)
vec1 32 ssa_39 = fadd ssa_25, ssa_15
vec1 32 ssa_16 = load_const (0x41700000 = 15.000000)
vec1 32 ssa_40 = fadd ssa_25, ssa_16
vec1 32 ssa_41 = u2f32 ssa_24
vec1 32 ssa_17 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_42 = fmul ssa_41, ssa_17
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_43 = phi block_0: ssa_42, block_4: ssa_618
vec1 32 ssa_44 = phi block_0: ssa_42, block_4: ssa_617
vec1 32 ssa_45 = phi block_0: ssa_42, block_4: ssa_616
vec1 32 ssa_46 = phi block_0: ssa_42, block_4: ssa_615
vec1 32 ssa_47 = phi block_0: ssa_42, block_4: ssa_622
vec1 32 ssa_48 = phi block_0: ssa_42, block_4: ssa_621
vec1 32 ssa_49 = phi block_0: ssa_42, block_4: ssa_620
vec1 32 ssa_50 = phi block_0: ssa_42, block_4: ssa_619
vec1 32 ssa_51 = phi block_0: ssa_42, block_4: ssa_626
vec1 32 ssa_52 = phi block_0: ssa_42, block_4: ssa_625
vec1 32 ssa_53 = phi block_0: ssa_42, block_4: ssa_624
vec1 32 ssa_54 = phi block_0: ssa_42, block_4: ssa_623
vec1 32 ssa_55 = phi block_0: ssa_42, block_4: ssa_630
vec1 32 ssa_56 = phi block_0: ssa_42, block_4: ssa_629
vec1 32 ssa_57 = phi block_0: ssa_42, block_4: ssa_628
vec1 32 ssa_58 = phi block_0: ssa_42, block_4: ssa_627
vec1 32 ssa_59 = phi block_0: ssa_37, block_4: ssa_634
vec1 32 ssa_60 = phi block_0: ssa_38, block_4: ssa_633
vec1 32 ssa_61 = phi block_0: ssa_39, block_4: ssa_632
vec1 32 ssa_62 = phi block_0: ssa_40, block_4: ssa_631
vec1 32 ssa_63 = phi block_0: ssa_33, block_4: ssa_638
vec1 32 ssa_64 = phi block_0: ssa_34, block_4: ssa_637
vec1 32 ssa_65 = phi block_0: ssa_35, block_4: ssa_636
vec1 32 ssa_66 = phi block_0: ssa_36, block_4: ssa_635
vec1 32 ssa_67 = phi block_0: ssa_29, block_4: ssa_642
vec1 32 ssa_68 = phi block_0: ssa_30, block_4: ssa_641
vec1 32 ssa_69 = phi block_0: ssa_31, block_4: ssa_640
vec1 32 ssa_70 = phi block_0: ssa_32, block_4: ssa_639
vec1 32 ssa_71 = phi block_0: ssa_25, block_4: ssa_646
vec1 32 ssa_72 = phi block_0: ssa_26, block_4: ssa_645
vec1 32 ssa_73 = phi block_0: ssa_27, block_4: ssa_644
vec1 32 ssa_74 = phi block_0: ssa_28, block_4: ssa_643
vec1 32 ssa_75 = phi block_0: ssa_0, block_4: ssa_589
vec1 32 ssa_76 = ige32 ssa_75, ssa_18
/* succs: block_2 block_3 */
if ssa_76 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_870 = ffma ssa_55, ssa_71, ssa_55
vec1 32 ssa_869 = ffma ssa_56, ssa_72, ssa_56
vec1 32 ssa_868 = ffma ssa_57, ssa_73, ssa_57
vec1 32 ssa_867 = ffma ssa_58, ssa_74, ssa_58
vec1 32 ssa_866 = ffma ssa_51, ssa_67, ssa_51
vec1 32 ssa_865 = ffma ssa_52, ssa_68, ssa_52
vec1 32 ssa_864 = ffma ssa_53, ssa_69, ssa_53
vec1 32 ssa_863 = ffma ssa_54, ssa_70, ssa_54
vec1 32 ssa_862 = ffma ssa_47, ssa_63, ssa_47
vec1 32 ssa_861 = ffma ssa_48, ssa_64, ssa_48
vec1 32 ssa_860 = ffma ssa_49, ssa_65, ssa_49
vec1 32 ssa_859 = ffma ssa_50, ssa_66, ssa_50
vec1 32 ssa_858 = ffma ssa_43, ssa_59, ssa_43
vec1 32 ssa_857 = ffma ssa_44, ssa_60, ssa_44
vec1 32 ssa_856 = ffma ssa_45, ssa_61, ssa_45
vec1 32 ssa_855 = ffma ssa_46, ssa_62, ssa_46
vec1 32 ssa_854 = ffma ssa_870, ssa_55, ssa_870
vec1 32 ssa_853 = ffma ssa_869, ssa_56, ssa_869
vec1 32 ssa_852 = ffma ssa_868, ssa_57, ssa_868
vec1 32 ssa_851 = ffma ssa_867, ssa_58, ssa_867
vec1 32 ssa_850 = ffma ssa_866, ssa_51, ssa_866
vec1 32 ssa_849 = ffma ssa_865, ssa_52, ssa_865
vec1 32 ssa_848 = ffma ssa_864, ssa_53, ssa_864
vec1 32 ssa_847 = ffma ssa_863, ssa_54, ssa_863
vec1 32 ssa_846 = ffma ssa_862, ssa_47, ssa_862
vec1 32 ssa_845 = ffma ssa_861, ssa_48, ssa_861
vec1 32 ssa_844 = ffma ssa_860, ssa_49, ssa_860
vec1 32 ssa_843 = ffma ssa_859, ssa_50, ssa_859
vec1 32 ssa_842 = ffma ssa_858, ssa_43, ssa_858
vec1 32 ssa_841 = ffma ssa_857, ssa_44, ssa_857
vec1 32 ssa_840 = ffma ssa_856, ssa_45, ssa_856
vec1 32 ssa_839 = ffma ssa_855, ssa_46, ssa_855
vec1 32 ssa_838 = ffma ssa_854, ssa_870, ssa_854
vec1 32 ssa_837 = ffma ssa_853, ssa_869, ssa_853
vec1 32 ssa_836 = ffma ssa_852, ssa_868, ssa_852
vec1 32 ssa_835 = ffma ssa_851, ssa_867, ssa_851
vec1 32 ssa_834 = ffma ssa_850, ssa_866, ssa_850
vec1 32 ssa_833 = ffma ssa_849, ssa_865, ssa_849
vec1 32 ssa_832 = ffma ssa_848, ssa_864, ssa_848
vec1 32 ssa_831 = ffma ssa_847, ssa_863, ssa_847
vec1 32 ssa_830 = ffma ssa_846, ssa_862, ssa_846
vec1 32 ssa_829 = ffma ssa_845, ssa_861, ssa_845
vec1 32 ssa_828 = ffma ssa_844, ssa_860, ssa_844
vec1 32 ssa_827 = ffma ssa_843, ssa_859, ssa_843
vec1 32 ssa_826 = ffma ssa_842, ssa_858, ssa_842
vec1 32 ssa_825 = ffma ssa_841, ssa_857, ssa_841
vec1 32 ssa_824 = ffma ssa_840, ssa_856, ssa_840
vec1 32 ssa_823 = ffma ssa_839, ssa_855, ssa_839
vec1 32 ssa_822 = ffma ssa_838, ssa_854, ssa_838
vec1 32 ssa_821 = ffma ssa_837, ssa_853, ssa_837
vec1 32 ssa_820 = ffma ssa_836, ssa_852, ssa_836
vec1 32 ssa_819 = ffma ssa_835, ssa_851, ssa_835
vec1 32 ssa_818 = ffma ssa_834, ssa_850, ssa_834
vec1 32 ssa_817 = ffma ssa_833, ssa_849, ssa_833
vec1 32 ssa_816 = ffma ssa_832, ssa_848, ssa_832
vec1 32 ssa_815 = ffma ssa_831, ssa_847, ssa_831
vec1 32 ssa_814 = ffma ssa_830, ssa_846, ssa_830
vec1 32 ssa_813 = ffma ssa_829, ssa_845, ssa_829
vec1 32 ssa_812 = ffma ssa_828, ssa_844, ssa_828
vec1 32 ssa_811 = ffma ssa_827, ssa_843, ssa_827
vec1 32 ssa_810 = ffma ssa_826, ssa_842, ssa_826
vec1 32 ssa_809 = ffma ssa_825, ssa_841, ssa_825
vec1 32 ssa_808 = ffma ssa_824, ssa_840, ssa_824
vec1 32 ssa_807 = ffma ssa_823, ssa_839, ssa_823
vec1 32 ssa_806 = ffma ssa_822, ssa_838, ssa_822
vec1 32 ssa_805 = ffma ssa_821, ssa_837, ssa_821
vec1 32 ssa_804 = ffma ssa_820, ssa_836, ssa_820
vec1 32 ssa_803 = ffma ssa_819, ssa_835, ssa_819
vec1 32 ssa_802 = ffma ssa_818, ssa_834, ssa_818
vec1 32 ssa_801 = ffma ssa_817, ssa_833, ssa_817
vec1 32 ssa_800 = ffma ssa_816, ssa_832, ssa_816
vec1 32 ssa_799 = ffma ssa_815, ssa_831, ssa_815
vec1 32 ssa_798 = ffma ssa_814, ssa_830, ssa_814
vec1 32 ssa_797 = ffma ssa_813, ssa_829, ssa_813
vec1 32 ssa_796 = ffma ssa_812, ssa_828, ssa_812
vec1 32 ssa_795 = ffma ssa_811, ssa_827, ssa_811
vec1 32 ssa_794 = ffma ssa_810, ssa_826, ssa_810
vec1 32 ssa_793 = ffma ssa_809, ssa_825, ssa_809
vec1 32 ssa_792 = ffma ssa_808, ssa_824, ssa_808
vec1 32 ssa_791 = ffma ssa_807, ssa_823, ssa_807
vec1 32 ssa_790 = ffma ssa_806, ssa_822, ssa_806
vec1 32 ssa_789 = ffma ssa_805, ssa_821, ssa_805
vec1 32 ssa_788 = ffma ssa_804, ssa_820, ssa_804
vec1 32 ssa_787 = ffma ssa_803, ssa_819, ssa_803
vec1 32 ssa_786 = ffma ssa_802, ssa_818, ssa_802
vec1 32 ssa_785 = ffma ssa_801, ssa_817, ssa_801
vec1 32 ssa_784 = ffma ssa_800, ssa_816, ssa_800
vec1 32 ssa_783 = ffma ssa_799, ssa_815, ssa_799
vec1 32 ssa_782 = ffma ssa_798, ssa_814, ssa_798
vec1 32 ssa_781 = ffma ssa_797, ssa_813, ssa_797
vec1 32 ssa_780 = ffma ssa_796, ssa_812, ssa_796
vec1 32 ssa_779 = ffma ssa_795, ssa_811, ssa_795
vec1 32 ssa_778 = ffma ssa_794, ssa_810, ssa_794
vec1 32 ssa_777 = ffma ssa_793, ssa_809, ssa_793
vec1 32 ssa_776 = ffma ssa_792, ssa_808, ssa_792
vec1 32 ssa_775 = ffma ssa_791, ssa_807, ssa_791
vec1 32 ssa_774 = ffma ssa_790, ssa_806, ssa_790
vec1 32 ssa_773 = ffma ssa_789, ssa_805, ssa_789
vec1 32 ssa_772 = ffma ssa_788, ssa_804, ssa_788
vec1 32 ssa_771 = ffma ssa_787, ssa_803, ssa_787
vec1 32 ssa_770 = ffma ssa_786, ssa_802, ssa_786
vec1 32 ssa_769 = ffma ssa_785, ssa_801, ssa_785
vec1 32 ssa_768 = ffma ssa_784, ssa_800, ssa_784
vec1 32 ssa_767 = ffma ssa_783, ssa_799, ssa_783
vec1 32 ssa_766 = ffma ssa_782, ssa_798, ssa_782
vec1 32 ssa_765 = ffma ssa_781, ssa_797, ssa_781
vec1 32 ssa_764 = ffma ssa_780, ssa_796, ssa_780
vec1 32 ssa_763 = ffma ssa_779, ssa_795, ssa_779
vec1 32 ssa_762 = ffma ssa_778, ssa_794, ssa_778
vec1 32 ssa_761 = ffma ssa_777, ssa_793, ssa_777
vec1 32 ssa_760 = ffma ssa_776, ssa_792, ssa_776
vec1 32 ssa_759 = ffma ssa_775, ssa_791, ssa_775
vec1 32 ssa_758 = ffma ssa_774, ssa_790, ssa_774
vec1 32 ssa_757 = ffma ssa_773, ssa_789, ssa_773
vec1 32 ssa_756 = ffma ssa_772, ssa_788, ssa_772
vec1 32 ssa_755 = ffma ssa_771, ssa_787, ssa_771
vec1 32 ssa_754 = ffma ssa_770, ssa_786, ssa_770
vec1 32 ssa_753 = ffma ssa_769, ssa_785, ssa_769
vec1 32 ssa_752 = ffma ssa_768, ssa_784, ssa_768
vec1 32 ssa_751 = ffma ssa_767, ssa_783, ssa_767
vec1 32 ssa_750 = ffma ssa_766, ssa_782, ssa_766
vec1 32 ssa_749 = ffma ssa_765, ssa_781, ssa_765
vec1 32 ssa_748 = ffma ssa_764, ssa_780, ssa_764
vec1 32 ssa_747 = ffma ssa_763, ssa_779, ssa_763
vec1 32 ssa_746 = ffma ssa_762, ssa_778, ssa_762
vec1 32 ssa_745 = ffma ssa_761, ssa_777, ssa_761
vec1 32 ssa_744 = ffma ssa_760, ssa_776, ssa_760
vec1 32 ssa_743 = ffma ssa_759, ssa_775, ssa_759
vec1 32 ssa_742 = ffma ssa_758, ssa_774, ssa_758
vec1 32 ssa_741 = ffma ssa_757, ssa_773, ssa_757
vec1 32 ssa_740 = ffma ssa_756, ssa_772, ssa_756
vec1 32 ssa_739 = ffma ssa_755, ssa_771, ssa_755
vec1 32 ssa_738 = ffma ssa_754, ssa_770, ssa_754
vec1 32 ssa_737 = ffma ssa_753, ssa_769, ssa_753
vec1 32 ssa_736 = ffma ssa_752, ssa_768, ssa_752
vec1 32 ssa_735 = ffma ssa_751, ssa_767, ssa_751
vec1 32 ssa_734 = ffma ssa_750, ssa_766, ssa_750
vec1 32 ssa_733 = ffma ssa_749, ssa_765, ssa_749
vec1 32 ssa_732 = ffma ssa_748, ssa_764, ssa_748
vec1 32 ssa_731 = ffma ssa_747, ssa_763, ssa_747
vec1 32 ssa_730 = ffma ssa_746, ssa_762, ssa_746
vec1 32 ssa_729 = ffma ssa_745, ssa_761, ssa_745
vec1 32 ssa_728 = ffma ssa_744, ssa_760, ssa_744
vec1 32 ssa_727 = ffma ssa_743, ssa_759, ssa_743
vec1 32 ssa_726 = ffma ssa_742, ssa_758, ssa_742
vec1 32 ssa_725 = ffma ssa_741, ssa_757, ssa_741
vec1 32 ssa_724 = ffma ssa_740, ssa_756, ssa_740
vec1 32 ssa_723 = ffma ssa_739, ssa_755, ssa_739
vec1 32 ssa_722 = ffma ssa_738, ssa_754, ssa_738
vec1 32 ssa_721 = ffma ssa_737, ssa_753, ssa_737
vec1 32 ssa_720 = ffma ssa_736, ssa_752, ssa_736
vec1 32 ssa_719 = ffma ssa_735, ssa_751, ssa_735
vec1 32 ssa_718 = ffma ssa_734, ssa_750, ssa_734
vec1 32 ssa_717 = ffma ssa_733, ssa_749, ssa_733
vec1 32 ssa_716 = ffma ssa_732, ssa_748, ssa_732
vec1 32 ssa_715 = ffma ssa_731, ssa_747, ssa_731
vec1 32 ssa_714 = ffma ssa_730, ssa_746, ssa_730
vec1 32 ssa_713 = ffma ssa_729, ssa_745, ssa_729
vec1 32 ssa_712 = ffma ssa_728, ssa_744, ssa_728
vec1 32 ssa_711 = ffma ssa_727, ssa_743, ssa_727
vec1 32 ssa_710 = ffma ssa_726, ssa_742, ssa_726
vec1 32 ssa_709 = ffma ssa_725, ssa_741, ssa_725
vec1 32 ssa_708 = ffma ssa_724, ssa_740, ssa_724
vec1 32 ssa_707 = ffma ssa_723, ssa_739, ssa_723
vec1 32 ssa_706 = ffma ssa_722, ssa_738, ssa_722
vec1 32 ssa_705 = ffma ssa_721, ssa_737, ssa_721
vec1 32 ssa_704 = ffma ssa_720, ssa_736, ssa_720
vec1 32 ssa_703 = ffma ssa_719, ssa_735, ssa_719
vec1 32 ssa_702 = ffma ssa_718, ssa_734, ssa_718
vec1 32 ssa_701 = ffma ssa_717, ssa_733, ssa_717
vec1 32 ssa_700 = ffma ssa_716, ssa_732, ssa_716
vec1 32 ssa_699 = ffma ssa_715, ssa_731, ssa_715
vec1 32 ssa_698 = ffma ssa_714, ssa_730, ssa_714
vec1 32 ssa_697 = ffma ssa_713, ssa_729, ssa_713
vec1 32 ssa_696 = ffma ssa_712, ssa_728, ssa_712
vec1 32 ssa_695 = ffma ssa_711, ssa_727, ssa_711
vec1 32 ssa_694 = ffma ssa_710, ssa_726, ssa_710
vec1 32 ssa_693 = ffma ssa_709, ssa_725, ssa_709
vec1 32 ssa_692 = ffma ssa_708, ssa_724, ssa_708
vec1 32 ssa_691 = ffma ssa_707, ssa_723, ssa_707
vec1 32 ssa_690 = ffma ssa_706, ssa_722, ssa_706
vec1 32 ssa_689 = ffma ssa_705, ssa_721, ssa_705
vec1 32 ssa_688 = ffma ssa_704, ssa_720, ssa_704
vec1 32 ssa_687 = ffma ssa_703, ssa_719, ssa_703
vec1 32 ssa_686 = ffma ssa_702, ssa_718, ssa_702
vec1 32 ssa_685 = ffma ssa_701, ssa_717, ssa_701
vec1 32 ssa_684 = ffma ssa_700, ssa_716, ssa_700
vec1 32 ssa_683 = ffma ssa_699, ssa_715, ssa_699
vec1 32 ssa_682 = ffma ssa_698, ssa_714, ssa_698
vec1 32 ssa_681 = ffma ssa_697, ssa_713, ssa_697
vec1 32 ssa_680 = ffma ssa_696, ssa_712, ssa_696
vec1 32 ssa_679 = ffma ssa_695, ssa_711, ssa_695
vec1 32 ssa_678 = ffma ssa_694, ssa_710, ssa_694
vec1 32 ssa_677 = ffma ssa_693, ssa_709, ssa_693
vec1 32 ssa_676 = ffma ssa_692, ssa_708, ssa_692
vec1 32 ssa_675 = ffma ssa_691, ssa_707, ssa_691
vec1 32 ssa_674 = ffma ssa_690, ssa_706, ssa_690
vec1 32 ssa_673 = ffma ssa_689, ssa_705, ssa_689
vec1 32 ssa_672 = ffma ssa_688, ssa_704, ssa_688
vec1 32 ssa_671 = ffma ssa_687, ssa_703, ssa_687
vec1 32 ssa_670 = ffma ssa_686, ssa_702, ssa_686
vec1 32 ssa_669 = ffma ssa_685, ssa_701, ssa_685
vec1 32 ssa_668 = ffma ssa_684, ssa_700, ssa_684
vec1 32 ssa_667 = ffma ssa_683, ssa_699, ssa_683
vec1 32 ssa_666 = ffma ssa_682, ssa_698, ssa_682
vec1 32 ssa_665 = ffma ssa_681, ssa_697, ssa_681
vec1 32 ssa_664 = ffma ssa_680, ssa_696, ssa_680
vec1 32 ssa_663 = ffma ssa_679, ssa_695, ssa_679
vec1 32 ssa_662 = ffma ssa_678, ssa_694, ssa_678
vec1 32 ssa_661 = ffma ssa_677, ssa_693, ssa_677
vec1 32 ssa_660 = ffma ssa_676, ssa_692, ssa_676
vec1 32 ssa_659 = ffma ssa_675, ssa_691, ssa_675
vec1 32 ssa_658 = ffma ssa_674, ssa_690, ssa_674
vec1 32 ssa_657 = ffma ssa_673, ssa_689, ssa_673
vec1 32 ssa_656 = ffma ssa_672, ssa_688, ssa_672
vec1 32 ssa_655 = ffma ssa_671, ssa_687, ssa_671
vec1 32 ssa_654 = ffma ssa_670, ssa_686, ssa_670
vec1 32 ssa_653 = ffma ssa_669, ssa_685, ssa_669
vec1 32 ssa_652 = ffma ssa_668, ssa_684, ssa_668
vec1 32 ssa_651 = ffma ssa_667, ssa_683, ssa_667
vec1 32 ssa_650 = ffma ssa_666, ssa_682, ssa_666
vec1 32 ssa_649 = ffma ssa_665, ssa_681, ssa_665
vec1 32 ssa_648 = ffma ssa_664, ssa_680, ssa_664
vec1 32 ssa_647 = ffma ssa_663, ssa_679, ssa_663
vec1 32 ssa_646 = ffma ssa_662, ssa_678, ssa_662
vec1 32 ssa_645 = ffma ssa_661, ssa_677, ssa_661
vec1 32 ssa_644 = ffma ssa_660, ssa_676, ssa_660
vec1 32 ssa_643 = ffma ssa_659, ssa_675, ssa_659
vec1 32 ssa_642 = ffma ssa_658, ssa_674, ssa_658
vec1 32 ssa_641 = ffma ssa_657, ssa_673, ssa_657
vec1 32 ssa_640 = ffma ssa_656, ssa_672, ssa_656
vec1 32 ssa_639 = ffma ssa_655, ssa_671, ssa_655
vec1 32 ssa_638 = ffma ssa_654, ssa_670, ssa_654
vec1 32 ssa_637 = ffma ssa_653, ssa_669, ssa_653
vec1 32 ssa_636 = ffma ssa_652, ssa_668, ssa_652
vec1 32 ssa_635 = ffma ssa_651, ssa_667, ssa_651
vec1 32 ssa_634 = ffma ssa_650, ssa_666, ssa_650
vec1 32 ssa_633 = ffma ssa_649, ssa_665, ssa_649
vec1 32 ssa_632 = ffma ssa_648, ssa_664, ssa_648
vec1 32 ssa_631 = ffma ssa_647, ssa_663, ssa_647
vec1 32 ssa_630 = ffma ssa_646, ssa_662, ssa_646
vec1 32 ssa_629 = ffma ssa_645, ssa_661, ssa_645
vec1 32 ssa_628 = ffma ssa_644, ssa_660, ssa_644
vec1 32 ssa_627 = ffma ssa_643, ssa_659, ssa_643
vec1 32 ssa_626 = ffma ssa_642, ssa_658, ssa_642
vec1 32 ssa_625 = ffma ssa_641, ssa_657, ssa_641
vec1 32 ssa_624 = ffma ssa_640, ssa_656, ssa_640
vec1 32 ssa_623 = ffma ssa_639, ssa_655, ssa_639
vec1 32 ssa_622 = ffma ssa_638, ssa_654, ssa_638
vec1 32 ssa_621 = ffma ssa_637, ssa_653, ssa_637
vec1 32 ssa_620 = ffma ssa_636, ssa_652, ssa_636
vec1 32 ssa_619 = ffma ssa_635, ssa_651, ssa_635
vec1 32 ssa_618 = ffma ssa_634, ssa_650, ssa_634
vec1 32 ssa_617 = ffma ssa_633, ssa_649, ssa_633
vec1 32 ssa_616 = ffma ssa_632, ssa_648, ssa_632
vec1 32 ssa_615 = ffma ssa_631, ssa_647, ssa_631
vec1 32 ssa_589 = iadd ssa_75, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_590 = fadd ssa_55, ssa_47
vec1 32 ssa_591 = fadd ssa_56, ssa_48
vec1 32 ssa_592 = fadd ssa_57, ssa_49
vec1 32 ssa_593 = fadd ssa_58, ssa_50
vec1 32 ssa_594 = fadd ssa_51, ssa_43
vec1 32 ssa_595 = fadd ssa_52, ssa_44
vec1 32 ssa_596 = fadd ssa_53, ssa_45
vec1 32 ssa_597 = fadd ssa_54, ssa_46
vec1 32 ssa_598 = fadd ssa_590, ssa_594
vec1 32 ssa_599 = fadd ssa_591, ssa_595
vec1 32 ssa_600 = fadd ssa_592, ssa_596
vec1 32 ssa_601 = fadd ssa_593, ssa_597
vec1 32 ssa_602 = fadd ssa_598, ssa_600
vec1 32 ssa_603 = fadd ssa_599, ssa_601
vec1 32 ssa_604 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_605 = ishl ssa_24, ssa_604
vec1 32 ssa_606 = fadd ssa_602, ssa_603
vec1 64 ssa_607 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_608 = unpack_64_2x32_split_x ssa_607
vec1 32 ssa_609 = unpack_64_2x32_split_y ssa_607
vec1 32 ssa_610 = iadd ssa_608, ssa_605
vec1 32 ssa_611 = ult32 ssa_610, ssa_608
vec1 32 ssa_612 = b2i32 ssa_611
vec1 32 ssa_613 = iadd ssa_612, ssa_609
vec1 64 ssa_614 = pack_64_2x32_split ssa_610, ssa_613
intrinsic store_global (ssa_606, ssa_614) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
875 = MOV.i32 r62
874 = MOV.i32 r61
873 = MOV.i32 r60
20 = LSHIFT_OR.i32 874, #0x0, #0x8.b0
21 = IADD.s32 873, 20
872 = MKVEC.v2i16 #0x0.h00, 875.h00
24 = IADD.s32 21, 872
26 = FADD.f32 u1, #0x3f800000
27 = FADD.f32 u1, #0x40000000
28 = FADD.f32 u1, #0x40400000
29 = FADD.f32 u1, #0x40800000
30 = FADD.f32 u1, #0x40a00000
31 = FADD.f32 u1, #0x40c00000
32 = FADD.f32 u1, #0x40e00000
33 = FADD.f32 u1, #0x41000000
34 = FADD.f32 u1, #0x41100000
35 = FADD.f32 u1, #0x41200000
36 = FADD.f32 u1, #0x41300000
37 = FADD.f32 u1, #0x41400000
38 = FADD.f32 u1, #0x41500000
39 = FADD.f32 u1, #0x41600000
40 = FADD.f32 u1, #0x41700000
41 = U32_TO_F32 24
42 = FMA.f32 41, #0x2edbe6ff, #0x0.neg
} -> block1
block1 {
43 = PHI 42, 618
44 = PHI 42, 617
45 = PHI 42, 616
46 = PHI 42, 615
47 = PHI 42, 622
48 = PHI 42, 621
49 = PHI 42, 620
50 = PHI 42, 619
51 = PHI 42, 626
52 = PHI 42, 625
53 = PHI 42, 624
54 = PHI 42, 623
55 = PHI 42, 630
56 = PHI 42, 629
57 = PHI 42, 628
58 = PHI 42, 627
59 = PHI 37, 634
60 = PHI 38, 633
61 = PHI 39, 632
62 = PHI 40, 631
63 = PHI 33, 638
64 = PHI 34, 637
65 = PHI 35, 636
66 = PHI 36, 635
67 = PHI 29, 642
68 = PHI 30, 641
69 = PHI 31, 640
70 = PHI 32, 639
71 = PHI u1, 646
72 = PHI 26, 645
73 = PHI 27, 644
74 = PHI 28, 643
75 = PHI #0x0, 589
76 = ICMP.s32.m1.ge 75, #0x8
BRANCHZ.i16.eq 76.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
870 = FMA.f32 55, 71, 55
869 = FMA.f32 56, 72, 56
868 = FMA.f32 57, 73, 57
867 = FMA.f32 58, 74, 58
866 = FMA.f32 51, 67, 51
865 = FMA.f32 52, 68, 52
864 = FMA.f32 53, 69, 53
863 = FMA.f32 54, 70, 54
862 = FMA.f32 47, 63, 47
861 = FMA.f32 48, 64, 48
860 = FMA.f32 49, 65, 49
859 = FMA.f32 50, 66, 50
858 = FMA.f32 43, 59, 43
857 = FMA.f32 44, 60, 44
856 = FMA.f32 45, 61, 45
855 = FMA.f32 46, 62, 46
854 = FMA.f32 870, 55, 870
853 = FMA.f32 869, 56, 869
852 = FMA.f32 868, 57, 868
851 = FMA.f32 867, 58, 867
850 = FMA.f32 866, 51, 866
849 = FMA.f32 865, 52, 865
848 = FMA.f32 864, 53, 864
847 = FMA.f32 863, 54, 863
846 = FMA.f32 862, 47, 862
845 = FMA.f32 861, 48, 861
844 = FMA.f32 860, 49, 860
843 = FMA.f32 859, 50, 859
842 = FMA.f32 858, 43, 858
841 = FMA.f32 857, 44, 857
840 = FMA.f32 856, 45, 856
839 = FMA.f32 855, 46, 855
838 = FMA.f32 854, 870, 854
837 = FMA.f32 853, 869, 853
836 = FMA.f32 852, 868, 852
835 = FMA.f32 851, 867, 851
834 = FMA.f32 850, 866, 850
833 = FMA.f32 849, 865, 849
832 = FMA.f32 848, 864, 848
831 = FMA.f32 847, 863, 847
830 = FMA.f32 846, 862, 846
829 = FMA.f32 845, 861, 845
828 = FMA.f32 844, 860, 844
827 = FMA.f32 843, 859, 843
826 = FMA.f32 842, 858, 842
825 = FMA.f32 841, 857, 841
824 = FMA.f32 840, 856, 840
823 = FMA.f32 839, 855, 839
822 = FMA.f32 838, 854, 838
821 = FMA.f32 837, 853, 837
820 = FMA.f32 836, 852, 836
819 = FMA.f32 835, 851, 835
818 = FMA.f32 834, 850, 834
817 = FMA.f32 833, 849, 833
816 = FMA.f32 832, 848, 832
815 = FMA.f32 831, 847, 831
814 = FMA.f32 830, 846, 830
813 = FMA.f32 829, 845, 829
812 = FMA.f32 828, 844, 828
811 = FMA.f32 827, 843, 827
810 = FMA.f32 826, 842, 826
809 = FMA.f32 825, 841, 825
808 = FMA.f32 824, 840, 824
807 = FMA.f32 823, 839, 823
806 = FMA.f32 822, 838, 822
805 = FMA.f32 821, 837, 821
804 = FMA.f32 820, 836, 820
803 = FMA.f32 819, 835, 819
802 = FMA.f32 818, 834, 818
801 = FMA.f32 817, 833, 817
800 = FMA.f32 816, 832, 816
799 = FMA.f32 815, 831, 815
798 = FMA.f32 814, 830, 814
797 = FMA.f32 813, 829, 813
796 = FMA.f32 812, 828, 812
795 = FMA.f32 811, 827, 811
794 = FMA.f32 810, 826, 810
793 = FMA.f32 809, 825, 809
792 = FMA.f32 808, 824, 808
791 = FMA.f32 807, 823, 807
790 = FMA.f32 806, 822, 806
789 = FMA.f32 805, 821, 805
788 = FMA.f32 804, 820, 804
787 = FMA.f32 803, 819, 803
786 = FMA.f32 802, 818, 802
785 = FMA.f32 801, 817, 801
784 = FMA.f32 800, 816, 800
783 = FMA.f32 799, 815, 799
782 = FMA.f32 798, 814, 798
781 = FMA.f32 797, 813, 797
780 = FMA.f32 796, 812, 796
779 = FMA.f32 795, 811, 795
778 = FMA.f32 794, 810, 794
777 = FMA.f32 793, 809, 793
776 = FMA.f32 792, 808, 792
775 = FMA.f32 791, 807, 791
774 = FMA.f32 790, 806, 790
773 = FMA.f32 789, 805, 789
772 = FMA.f32 788, 804, 788
771 = FMA.f32 787, 803, 787
770 = FMA.f32 786, 802, 786
769 = FMA.f32 785, 801, 785
768 = FMA.f32 784, 800, 784
767 = FMA.f32 783, 799, 783
766 = FMA.f32 782, 798, 782
765 = FMA.f32 781, 797, 781
764 = FMA.f32 780, 796, 780
763 = FMA.f32 779, 795, 779
762 = FMA.f32 778, 794, 778
761 = FMA.f32 777, 793, 777
760 = FMA.f32 776, 792, 776
759 = FMA.f32 775, 791, 775
758 = FMA.f32 774, 790, 774
757 = FMA.f32 773, 789, 773
756 = FMA.f32 772, 788, 772
755 = FMA.f32 771, 787, 771
754 = FMA.f32 770, 786, 770
753 = FMA.f32 769, 785, 769
752 = FMA.f32 768, 784, 768
751 = FMA.f32 767, 783, 767
750 = FMA.f32 766, 782, 766
749 = FMA.f32 765, 781, 765
748 = FMA.f32 764, 780, 764
747 = FMA.f32 763, 779, 763
746 = FMA.f32 762, 778, 762
745 = FMA.f32 761, 777, 761
744 = FMA.f32 760, 776, 760
743 = FMA.f32 759, 775, 759
742 = FMA.f32 758, 774, 758
741 = FMA.f32 757, 773, 757
740 = FMA.f32 756, 772, 756
739 = FMA.f32 755, 771, 755
738 = FMA.f32 754, 770, 754
737 = FMA.f32 753, 769, 753
736 = FMA.f32 752, 768, 752
735 = FMA.f32 751, 767, 751
734 = FMA.f32 750, 766, 750
733 = FMA.f32 749, 765, 749
732 = FMA.f32 748, 764, 748
731 = FMA.f32 747, 763, 747
730 = FMA.f32 746, 762, 746
729 = FMA.f32 745, 761, 745
728 = FMA.f32 744, 760, 744
727 = FMA.f32 743, 759, 743
726 = FMA.f32 742, 758, 742
725 = FMA.f32 741, 757, 741
724 = FMA.f32 740, 756, 740
723 = FMA.f32 739, 755, 739
722 = FMA.f32 738, 754, 738
721 = FMA.f32 737, 753, 737
720 = FMA.f32 736, 752, 736
719 = FMA.f32 735, 751, 735
718 = FMA.f32 734, 750, 734
717 = FMA.f32 733, 749, 733
716 = FMA.f32 732, 748, 732
715 = FMA.f32 731, 747, 731
714 = FMA.f32 730, 746, 730
713 = FMA.f32 729, 745, 729
712 = FMA.f32 728, 744, 728
711 = FMA.f32 727, 743, 727
710 = FMA.f32 726, 742, 726
709 = FMA.f32 725, 741, 725
708 = FMA.f32 724, 740, 724
707 = FMA.f32 723, 739, 723
706 = FMA.f32 722, 738, 722
705 = FMA.f32 721, 737, 721
704 = FMA.f32 720, 736, 720
703 = FMA.f32 719, 735, 719
702 = FMA.f32 718, 734, 718
701 = FMA.f32 717, 733, 717
700 = FMA.f32 716, 732, 716
699 = FMA.f32 715, 731, 715
698 = FMA.f32 714, 730, 714
697 = FMA.f32 713, 729, 713
696 = FMA.f32 712, 728, 712
695 = FMA.f32 711, 727, 711
694 = FMA.f32 710, 726, 710
693 = FMA.f32 709, 725, 709
692 = FMA.f32 708, 724, 708
691 = FMA.f32 707, 723, 707
690 = FMA.f32 706, 722, 706
689 = FMA.f32 705, 721, 705
688 = FMA.f32 704, 720, 704
687 = FMA.f32 703, 719, 703
686 = FMA.f32 702, 718, 702
685 = FMA.f32 701, 717, 701
684 = FMA.f32 700, 716, 700
683 = FMA.f32 699, 715, 699
682 = FMA.f32 698, 714, 698
681 = FMA.f32 697, 713, 697
680 = FMA.f32 696, 712, 696
679 = FMA.f32 695, 711, 695
678 = FMA.f32 694, 710, 694
677 = FMA.f32 693, 709, 693
676 = FMA.f32 692, 708, 692
675 = FMA.f32 691, 707, 691
674 = FMA.f32 690, 706, 690
673 = FMA.f32 689, 705, 689
672 = FMA.f32 688, 704, 688
671 = FMA.f32 687, 703, 687
670 = FMA.f32 686, 702, 686
669 = FMA.f32 685, 701, 685
668 = FMA.f32 684, 700, 684
667 = FMA.f32 683, 699, 683
666 = FMA.f32 682, 698, 682
665 = FMA.f32 681, 697, 681
664 = FMA.f32 680, 696, 680
663 = FMA.f32 679, 695, 679
662 = FMA.f32 678, 694, 678
661 = FMA.f32 677, 693, 677
660 = FMA.f32 676, 692, 676
659 = FMA.f32 675, 691, 675
658 = FMA.f32 674, 690, 674
657 = FMA.f32 673, 689, 673
656 = FMA.f32 672, 688, 672
655 = FMA.f32 671, 687, 671
654 = FMA.f32 670, 686, 670
653 = FMA.f32 669, 685, 669
652 = FMA.f32 668, 684, 668
651 = FMA.f32 667, 683, 667
650 = FMA.f32 666, 682, 666
649 = FMA.f32 665, 681, 665
648 = FMA.f32 664, 680, 664
647 = FMA.f32 663, 679, 663
646 = FMA.f32 662, 678, 662
645 = FMA.f32 661, 677, 661
644 = FMA.f32 660, 676, 660
643 = FMA.f32 659, 675, 659
642 = FMA.f32 658, 674, 658
641 = FMA.f32 657, 673, 657
640 = FMA.f32 656, 672, 656
639 = FMA.f32 655, 671, 655
638 = FMA.f32 654, 670, 654
637 = FMA.f32 653, 669, 653
636 = FMA.f32 652, 668, 652
635 = FMA.f32 651, 667, 651
634 = FMA.f32 650, 666, 650
633 = FMA.f32 649, 665, 649
632 = FMA.f32 648, 664, 648
631 = FMA.f32 647, 663, 647
630 = FMA.f32 646, 662, 646
629 = FMA.f32 645, 661, 645
628 = FMA.f32 644, 660, 644
627 = FMA.f32 643, 659, 643
626 = FMA.f32 642, 658, 642
625 = FMA.f32 641, 657, 641
624 = FMA.f32 640, 656, 640
623 = FMA.f32 639, 655, 639
622 = FMA.f32 638, 654, 638
621 = FMA.f32 637, 653, 637
620 = FMA.f32 636, 652, 636
619 = FMA.f32 635, 651, 635
618 = FMA.f32 634, 650, 634
617 = FMA.f32 633, 649, 633
616 = FMA.f32 632, 648, 632
615 = FMA.f32 631, 647, 631
589 = IADD.s32 75, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
590 = FADD.f32 55, 47
591 = FADD.f32 56, 48
592 = FADD.f32 57, 49
593 = FADD.f32 58, 50
594 = FADD.f32 51, 43
595 = FADD.f32 52, 44
596 = FADD.f32 53, 45
597 = FADD.f32 54, 46
598 = FADD.f32 590, 594
599 = FADD.f32 591, 595
600 = FADD.f32 592, 596
601 = FADD.f32 593, 597
602 = FADD.f32 598, 600
603 = FADD.f32 599, 601
605 = LSHIFT_OR.i32 24, #0x0, #0x2.b0
606 = FADD.f32 602, 603
610 = IADD.s32 u0, 605
612 = ICMP.u32.i1.lt 610, u0
613 = IADD.s32 612, u0[1]
STORE.i32 606, 610, 613, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x3f800000
r1 = FADD.f32 u1, r1
r2 = MOV.i32 #0x40000000
r2 = FADD.f32 u1, r2
r3 = MOV.i32 #0x40400000
r3 = FADD.f32 u1, r3
r4 = MOV.i32 #0x40800000
r4 = FADD.f32 u1, r4
r5 = MOV.i32 #0x40a00000
r5 = FADD.f32 u1, r5
r6 = MOV.i32 #0x40c00000
r6 = FADD.f32 u1, r6
r7 = MOV.i32 #0x40e00000
r7 = FADD.f32 u1, r7
r8 = MOV.i32 #0x41000000
r8 = FADD.f32 u1, r8
r9 = MOV.i32 #0x41100000
r9 = FADD.f32 u1, r9
r10 = MOV.i32 #0x41200000
r10 = FADD.f32 u1, r10
r11 = MOV.i32 #0x41300000
r11 = FADD.f32 u1, r11
r12 = MOV.i32 #0x41400000
r12 = FADD.f32 u1, r12
r13 = MOV.i32 #0x41500000
r13 = FADD.f32 u1, r13
r14 = MOV.i32 #0x41600000
r14 = FADD.f32 u1, r14
r15 = MOV.i32 #0x41700000
r15 = FADD.f32 u1, r15
r16 = U32_TO_F32 r0
r16 = FMA.f32 r16, #0x2edbe6ff, #0x0.neg
r17 = MOV.i32 r16
r18 = MOV.i32 r16
r19 = MOV.i32 r16
r20 = MOV.i32 r16
r21 = MOV.i32 r16
r22 = MOV.i32 r16
r23 = MOV.i32 r16
r24 = MOV.i32 r16
r25 = MOV.i32 r16
r26 = MOV.i32 r16
r27 = MOV.i32 r16
r28 = MOV.i32 r16
r29 = MOV.i32 r16
r30 = MOV.i32 r16
r31 = MOV.i32 r16
r32 = MOV.i32 u1
r33 = MOV.i32 #0x0
} -> block1
block1 {
r34 = ICMP.s32.m1.ge r33, #0x8
BRANCHZ.i16.eq r34.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r32 = FMA.f32 r28, r32, r28
r1 = FMA.f32 r29, r1, r29
r2 = FMA.f32 r30, r2, r30
r3 = FMA.f32 r31, r3, r31
r4 = FMA.f32 r24, r4, r24
r5 = FMA.f32 r25, r5, r25
r6 = FMA.f32 r26, r6, r26
r7 = FMA.f32 r27, r7, r27
r8 = FMA.f32 r20, r8, r20
r9 = FMA.f32 r21, r9, r21
r10 = FMA.f32 r22, r10, r22
r11 = FMA.f32 r23, r11, r23
r12 = FMA.f32 r16, r12, r16
r13 = FMA.f32 r17, r13, r17
r14 = FMA.f32 r18, r14, r18
r15 = FMA.f32 r19, r15, r19
r28 = FMA.f32 r32, r28, r32
r29 = FMA.f32 r1, r29, r1
r30 = FMA.f32 r2, r30, r2
r31 = FMA.f32 r3, r31, r3
r24 = FMA.f32 r4, r24, r4
r25 = FMA.f32 r5, r25, r5
r26 = FMA.f32 r6, r26, r6
r27 = FMA.f32 r7, r27, r7
r20 = FMA.f32 r8, r20, r8
r21 = FMA.f32 r9, r21, r9
r22 = FMA.f32 r10, r22, r10
r23 = FMA.f32 r11, r23, r11
r16 = FMA.f32 r12, r16, r12
r17 = FMA.f32 r13, r17, r13
r18 = FMA.f32 r14, r18, r14
r19 = FMA.f32 r15, r19, r15
r33 = IADD.s32 r33, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r1 = FADD.f32 r28, r20
r2 = FADD.f32 r29, r21
r3 = FADD.f32 r30, r22
r4 = FADD.f32 r31, r23
r5 = FADD.f32 r24, r16
r6 = FADD.f32 r25, r17
r7 = FADD.f32 r26, r18
r8 = FADD.f32 r27, r19
r1 = FADD.f32 r1, r5
r2 = FADD.f32 r2, r6
r3 = FADD.f32 r3, r7
r4 = FADD.f32 r4, r8
r1 = FADD.f32 r1, r3
r2 = FADD.f32 r2, r4
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = FADD.f32 r1, r2
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.x.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r1 = FADD.f32 fau.x, t0
+ r2 = FADD.f32 fau.x, t1
* NOP
+ _.h00 = MOV.i32 fau.y
* NOP
+ r3 = FADD.f32 fau.x, t1
400000003f800000 4040000000000008
id(0) nbb
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r4 = FADD.f32 fau.x, t0
+ r5 = FADD.f32 fau.x, t1
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r6 = FADD.f32 fau.x, t0
+ r7 = FADD.f32 fau.x, t1
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r8 = FADD.f32 fau.x, t0
+ r9 = FADD.f32 fau.x, t1
* NOP
+ _.h00 = MOV.i32 fau.y
* NOP
+ r10 = FADD.f32 fau.x, t1
40a0000040800000 40e0000040c00000 4110000041000000 4120000000000000
id(0) nbb
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r11 = FADD.f32 fau.x, t0
+ r12 = FADD.f32 fau.x, t1
* _.h00 = MOV.i32 fau.x
+ _.h00 = MOV.i32 fau.y
* r13 = FADD.f32 fau.x, t0
+ r14 = FADD.f32 fau.x, t1
* NOP
+ _.h00 = MOV.i32 fau.y
* r15 = FADD.f32 fau.x, t1
+ _.h00 = U32_TO_F32 r0
* r16 = FMA.f32 t1, fau.x, t.neg
+ r17 = MOV.i32 t
* NOP
+ r18 = MOV.i32 t0
4140000041300000 4160000041500000 417000002edbe6ff
id(0) nbb r_uncond
* r19 = MOV.i32 r16
+ r20 = MOV.i32 r16
* r21 = MOV.i32 r16
+ r22 = MOV.i32 r16
* r23 = MOV.i32 r16
+ r24 = MOV.i32 r16
* r25 = MOV.i32 r16
+ r26 = MOV.i32 r16
* r27 = MOV.i32 r16
+ r28 = MOV.i32 r16
* r29 = MOV.i32 r16
+ r30 = MOV.i32 r16
* r31 = MOV.i32 r16
+ r32 = MOV.i32 fau.x
* NOP
+ r33 = MOV.i32 fau.x
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r33, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000008
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ NOP
* r19 = FMA.f32 r15, r19, r15
+ NOP
id(0) nbb
* r32 = FMA.f32 r28, r32, r28
+ NOP
* r1 = FMA.f32 r29, r1, r29
+ NOP
* r2 = FMA.f32 r30, r2, r30
+ NOP
* r3 = FMA.f32 r31, r3, r31
+ NOP
* r4 = FMA.f32 r24, r4, r24
+ NOP
* r5 = FMA.f32 r25, r5, r25
+ NOP
* r6 = FMA.f32 r26, r6, r26
+ NOP
* r7 = FMA.f32 r27, r7, r27
+ NOP
id(0) nbb
* r8 = FMA.f32 r20, r8, r20
+ NOP
* r9 = FMA.f32 r21, r9, r21
+ NOP
* r10 = FMA.f32 r22, r10, r22
+ NOP
* r11 = FMA.f32 r23, r11, r23
+ NOP
* r12 = FMA.f32 r16, r12, r16
+ NOP
* r13 = FMA.f32 r17, r13, r17
+ NOP
* r14 = FMA.f32 r18, r14, r18
+ NOP
* r15 = FMA.f32 r19, r15, r19
+ NOP
id(0) nbb
* r28 = FMA.f32 r32, r28, r32
+ NOP
* r29 = FMA.f32 r1, r29, r1
+ NOP
* r30 = FMA.f32 r2, r30, r2
+ NOP
* r31 = FMA.f32 r3, r31, r3
+ NOP
* r24 = FMA.f32 r4, r24, r4
+ NOP
* r25 = FMA.f32 r5, r25, r5
+ NOP
* r26 = FMA.f32 r6, r26, r6
+ NOP
* r27 = FMA.f32 r7, r27, r7
+ NOP
id(0) nbb r_uncond no_prefetch pcrel(1)
* r20 = FMA.f32 r8, r20, r8
+ NOP
* r21 = FMA.f32 r9, r21, r9
+ NOP
* r22 = FMA.f32 r10, r22, r10
+ NOP
* r23 = FMA.f32 r11, r23, r11
+ NOP
* r16 = FMA.f32 r12, r16, r12
+ NOP
* r17 = FMA.f32 r13, r17, r13
+ NOP
* r18 = FMA.f32 r14, r18, r14
+ r33 = IADD.s32 r33, fau.x
* r19 = FMA.f32 r15, r19, r15
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) nbb
* NOP
+ r1 = FADD.f32 r28, r20
id(0) nbb
* NOP
+ r2 = FADD.f32 r29, r21
* NOP
+ r3 = FADD.f32 r30, r22
* NOP
+ r4 = FADD.f32 r31, r23
* NOP
+ r5 = FADD.f32 r24, r16
* NOP
+ r6 = FADD.f32 r25, r17
* NOP
+ r7 = FADD.f32 r26, r18
* NOP
+ r8 = FADD.f32 r27, r19
* NOP
+ r1 = FADD.f32 r1, r5
id(0) wait(0 ) nbb r_uncond
* NOP
+ r2 = FADD.f32 r2, r6
* NOP
+ r3 = FADD.f32 r3, r7
* NOP
+ r4 = FADD.f32 r4, r8
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = FADD.f32 r1, r3
* _.h00 = FADD.f32 r2, r4
+ r0 = IADD.s32 fau.x, t0
* r1 = FADD.f32 r1, t0
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*MOV.i32 t0, 0x3f800000 /* 1.000000 */
+MOV.i32 t1, 0x40000000 /* 2.000000 */
*FADD.f32 r1:t0, u1.w0, t0
+FADD.f32 r2:t1, u1.w0, t1
*NOP t0
+MOV.i32 t1, 0x40400000 /* 3.000000 */
*NOP t0
+FADD.f32 r3:t1, u1.w0, t1
}
clause_6:
ds(0) nbb ncph
{
*MOV.i32 t0, 0x40800000 /* 4.000000 */
+MOV.i32 t1, 0x40a00000 /* 5.000000 */
*FADD.f32 r4:t0, u1.w0, t0
+FADD.f32 r5:t1, u1.w0, t1
*MOV.i32 t0, 0x40c00000 /* 6.000000 */
+MOV.i32 t1, 0x40e00000 /* 7.000000 */
*FADD.f32 r6:t0, u1.w0, t0
+FADD.f32 r7:t1, u1.w0, t1
*MOV.i32 t0, 0x41000000 /* 8.000000 */
+MOV.i32 t1, 0x41100000 /* 9.000000 */
*FADD.f32 r8:t0, u1.w0, t0
+FADD.f32 r9:t1, u1.w0, t1
*NOP t0
+MOV.i32 t1, 0x41200000 /* 10.000000 */
*NOP t0
+FADD.f32 r10:t1, u1.w0, t1
}
clause_14:
ds(0) nbb ncph
{
*MOV.i32 t0, 0x41300000 /* 11.000000 */
+MOV.i32 t1, 0x41400000 /* 12.000000 */
*FADD.f32 r11:t0, u1.w0, t0
+FADD.f32 r12:t1, u1.w0, t1
*MOV.i32 t0, 0x41500000 /* 13.000000 */
+MOV.i32 t1, 0x41600000 /* 14.000000 */
*FADD.f32 r13:t0, u1.w0, t0
+FADD.f32 r14:t1, u1.w0, t1
*NOP t0
+MOV.i32 t1, 0x41700000 /* 15.000000 */
*FADD.f32 r15:t0, u1.w0, t1
+U32_TO_F32 t1, r0
*FMA.f32 r16:t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+MOV.i32 r17:t1, t
*NOP t0
+MOV.i32 r18:t1, t0
}
clause_21:
ds(0) nbb r_uncond ncph
{
*MOV.i32 r19:t0, r16
+MOV.i32 r20:t1, r16
*MOV.i32 r21:t0, r16
+MOV.i32 r22:t1, r16
*MOV.i32 r23:t0, r16
+MOV.i32 r24:t1, r16
*MOV.i32 r25:t0, r16
+MOV.i32 r26:t1, r16
*MOV.i32 r27:t0, r16
+MOV.i32 r28:t1, r16
*MOV.i32 r29:t0, r16
+MOV.i32 r30:t1, r16
*MOV.i32 r31:t0, r16
+MOV.i32 r32:t1, u1.w0
*NOP t0
+MOV.i32 r33:t1, #0.x
}
clause_27:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r33, 0x00000008 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_32
}
clause_30:
ds(0) nbb
{
*NOP t0
+JUMP t1, clause_225
}
clause_32:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_38:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_44:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_50:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_56:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_62:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_68:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_74:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_80:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_86:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_92:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_98:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_104:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_110:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_116:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_122:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_128:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_134:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_140:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_146:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_152:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_158:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_164:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_170:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_176:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_182:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_188:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_194:
ds(0) nbb ncph
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+NOP t1
*FMA.f32 r19:t0, r15, r19, r15
+NOP t1
}
clause_200:
ds(0) nbb ncph
{
*FMA.f32 r32:t0, r28, r32, r28
+NOP t1
*FMA.f32 r1:t0, r29, r1, r29
+NOP t1
*FMA.f32 r2:t0, r30, r2, r30
+NOP t1
*FMA.f32 r3:t0, r31, r3, r31
+NOP t1
*FMA.f32 r4:t0, r24, r4, r24
+NOP t1
*FMA.f32 r5:t0, r25, r5, r25
+NOP t1
*FMA.f32 r6:t0, r26, r6, r26
+NOP t1
*FMA.f32 r7:t0, r27, r7, r27
+NOP t1
}
clause_206:
ds(0) nbb ncph
{
*FMA.f32 r8:t0, r20, r8, r20
+NOP t1
*FMA.f32 r9:t0, r21, r9, r21
+NOP t1
*FMA.f32 r10:t0, r22, r10, r22
+NOP t1
*FMA.f32 r11:t0, r23, r11, r23
+NOP t1
*FMA.f32 r12:t0, r16, r12, r16
+NOP t1
*FMA.f32 r13:t0, r17, r13, r17
+NOP t1
*FMA.f32 r14:t0, r18, r14, r18
+NOP t1
*FMA.f32 r15:t0, r19, r15, r19
+NOP t1
}
clause_212:
ds(0) nbb ncph
{
*FMA.f32 r28:t0, r32, r28, r32
+NOP t1
*FMA.f32 r29:t0, r1, r29, r1
+NOP t1
*FMA.f32 r30:t0, r2, r30, r2
+NOP t1
*FMA.f32 r31:t0, r3, r31, r3
+NOP t1
*FMA.f32 r24:t0, r4, r24, r4
+NOP t1
*FMA.f32 r25:t0, r5, r25, r5
+NOP t1
*FMA.f32 r26:t0, r6, r26, r6
+NOP t1
*FMA.f32 r27:t0, r7, r27, r7
+NOP t1
}
clause_218:
ds(0) nbb r_uncond
{
*FMA.f32 r20:t0, r8, r20, r8
+NOP t1
*FMA.f32 r21:t0, r9, r21, r9
+NOP t1
*FMA.f32 r22:t0, r10, r22, r10
+NOP t1
*FMA.f32 r23:t0, r11, r23, r11
+NOP t1
*FMA.f32 r16:t0, r12, r16, r12
+NOP t1
*FMA.f32 r17:t0, r13, r17, r13
+NOP t1
*FMA.f32 r18:t0, r14, r18, r14
+IADD.s32 r33:t1, r33, 0x00000001 /* 0.000000 */
*FMA.f32 r19:t0, r15, r19, r15
+JUMP t1, clause_27
}
clause_225:
ds(0) nbb ncph
{
*NOP t0
+FADD.f32 r1:t1, r28, r20
}
clause_226:
ds(0) nbb ncph next_store dwb(0)
{
*NOP t0
+FADD.f32 r2:t1, r29, r21
*NOP t0
+FADD.f32 r3:t1, r30, r22
*NOP t0
+FADD.f32 r4:t1, r31, r23
*NOP t0
+FADD.f32 r5:t1, r24, r16
*NOP t0
+FADD.f32 r6:t1, r25, r17
*NOP t0
+FADD.f32 r7:t1, r26, r18
*NOP t0
+FADD.f32 r8:t1, r27, r19
*NOP t0
+FADD.f32 r1:t1, r1, r5
}
clause_232:
ds(0) eos store
{
*NOP t0
+FADD.f32 r2:t1, r2, r6
*NOP t0
+FADD.f32 r3:t1, r3, r7
*NOP t0
+FADD.f32 r4:t1, r4, r8
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+FADD.f32 r1:t1, r1, r3
*FADD.f32 t0, r2, r4
+IADD.s32 r0:t1, u0.w0, t0
*FADD.f32 r1:t0, r1, t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
25a14253 compute_sp_v16_float 21.759 GFLOPs 12.337ms
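The 21.759 GFLOPs figure is consistent with the code dumped above: each invocation runs the 8-iteration loop of 256 dependent FMA.f32 ops (2 flops each), i.e. 4096 flops. The dispatch size is not printed in this log; assuming a 65,536-invocation dispatch (256 groups of 256), a quick Python sketch reproduces the number:
invocations = 256 * 256            # assumed dispatch size, not shown in this log
flops_per_inv = 8 * 256 * 2        # 8 loop iterations x 256 FMA.f32 x 2 flops per FMA
print(invocations * flops_per_inv / 12.337e-3 / 1e9)   # ~21.76, matching the reported GFLOPs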
compute shader ----------
#define KERNEL compute_sp_v1
#define LOCAL_SIZE_X 256
#define DATATYPE int
#define vec2 ivec2
#define vec4 ivec4
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 111
void compute_sp_v1()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
DATATYPE x = _A;
DATATYPE y = DATATYPE(float(id) * SCALE);
for(int i=0; i<128; i++)
{
MAD_16(x, y);
}
ptr[id] = y;
}
void main() {compute_sp_v1();}
----------
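With DATATYPE defined to int, each mad(a,b,c) = a*b+c is lowered to a separate IMUL.i32/IADD.s32 pair in the dumps below (e.g. ssa_19 = imul followed by ssa_20 = iadd in the NIR, and the IMUL.i32 + IADD.s32 tuples in clause_10), rather than the single FMA.f32 per mad seen in the float kernels above.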
shader: MESA_SHADER_COMPUTE
source_sha1: {0xbbd9d65d, 0x1def99df, 0x82d13ce1, 0x50fbde77, 0x01b473fb}
name: GLSL12
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp int[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp int _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_4 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_5 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_6 = ishl ssa_4.y, ssa_5
vec1 32 ssa_7 = iadd ssa_4.x, ssa_6
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_63 = insert_u16 ssa_4.z, ssa_1
vec1 32 ssa_10 = iadd ssa_7, ssa_63
vec1 32 ssa_12 = u2f32 ssa_10
vec1 32 ssa_2 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_13 = fmul ssa_12, ssa_2
vec1 32 ssa_14 = f2i32 ssa_13
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_3 = load_const (0x00000080 = 0.000000)
vec1 32 ssa_11 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_15 = phi block_0: ssa_14, block_4: ssa_50
vec1 32 ssa_16 = phi block_0: ssa_11, block_4: ssa_48
vec1 32 ssa_17 = phi block_0: ssa_0, block_4: ssa_51
vec1 32 ssa_18 = ige32 ssa_17, ssa_3
/* succs: block_2 block_3 */
if ssa_18 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_19 = imul ssa_15, ssa_16
vec1 32 ssa_20 = iadd ssa_19, ssa_15
vec1 32 ssa_21 = imul ssa_20, ssa_15
vec1 32 ssa_22 = iadd ssa_21, ssa_20
vec1 32 ssa_23 = imul ssa_22, ssa_20
vec1 32 ssa_24 = iadd ssa_23, ssa_22
vec1 32 ssa_25 = imul ssa_24, ssa_22
vec1 32 ssa_26 = iadd ssa_25, ssa_24
vec1 32 ssa_27 = imul ssa_26, ssa_24
vec1 32 ssa_28 = iadd ssa_27, ssa_26
vec1 32 ssa_29 = imul ssa_28, ssa_26
vec1 32 ssa_30 = iadd ssa_29, ssa_28
vec1 32 ssa_31 = imul ssa_30, ssa_28
vec1 32 ssa_32 = iadd ssa_31, ssa_30
vec1 32 ssa_33 = imul ssa_32, ssa_30
vec1 32 ssa_34 = iadd ssa_33, ssa_32
vec1 32 ssa_35 = imul ssa_34, ssa_32
vec1 32 ssa_36 = iadd ssa_35, ssa_34
vec1 32 ssa_37 = imul ssa_36, ssa_34
vec1 32 ssa_38 = iadd ssa_37, ssa_36
vec1 32 ssa_39 = imul ssa_38, ssa_36
vec1 32 ssa_40 = iadd ssa_39, ssa_38
vec1 32 ssa_41 = imul ssa_40, ssa_38
vec1 32 ssa_42 = iadd ssa_41, ssa_40
vec1 32 ssa_43 = imul ssa_42, ssa_40
vec1 32 ssa_44 = iadd ssa_43, ssa_42
vec1 32 ssa_45 = imul ssa_44, ssa_42
vec1 32 ssa_46 = iadd ssa_45, ssa_44
vec1 32 ssa_47 = imul ssa_46, ssa_44
vec1 32 ssa_48 = iadd ssa_47, ssa_46
vec1 32 ssa_49 = imul ssa_48, ssa_46
vec1 32 ssa_50 = iadd ssa_49, ssa_48
vec1 32 ssa_51 = iadd ssa_17, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_52 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_53 = ishl ssa_10, ssa_52
vec1 64 ssa_54 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_55 = unpack_64_2x32_split_x ssa_54
vec1 32 ssa_56 = unpack_64_2x32_split_y ssa_54
vec1 32 ssa_57 = iadd ssa_55, ssa_53
vec1 32 ssa_58 = ult32 ssa_57, ssa_55
vec1 32 ssa_59 = b2i32 ssa_58
vec1 32 ssa_60 = iadd ssa_59, ssa_56
vec1 64 ssa_61 = pack_64_2x32_split ssa_57, ssa_60
intrinsic store_global (ssa_15, ssa_61) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
66 = MOV.i32 r62
65 = MOV.i32 r61
64 = MOV.i32 r60
6 = LSHIFT_OR.i32 65, #0x0, #0x8.b0
7 = IADD.s32 64, 6
63 = MKVEC.v2i16 #0x0.h00, 66.h00
10 = IADD.s32 7, 63
12 = U32_TO_F32 10
13 = FMA.f32 12, #0x2edbe6ff, #0x0.neg
14 = F32_TO_S32.rtz 13
} -> block1
block1 {
15 = PHI 14, 50
16 = PHI u1, 48
17 = PHI #0x0, 51
18 = ICMP.s32.m1.ge 17, #0x80
BRANCHZ.i16.eq 18.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
19 = IMUL.i32 15, 16
20 = IADD.s32 19, 15
21 = IMUL.i32 20, 15
22 = IADD.s32 21, 20
23 = IMUL.i32 22, 20
24 = IADD.s32 23, 22
25 = IMUL.i32 24, 22
26 = IADD.s32 25, 24
27 = IMUL.i32 26, 24
28 = IADD.s32 27, 26
29 = IMUL.i32 28, 26
30 = IADD.s32 29, 28
31 = IMUL.i32 30, 28
32 = IADD.s32 31, 30
33 = IMUL.i32 32, 30
34 = IADD.s32 33, 32
35 = IMUL.i32 34, 32
36 = IADD.s32 35, 34
37 = IMUL.i32 36, 34
38 = IADD.s32 37, 36
39 = IMUL.i32 38, 36
40 = IADD.s32 39, 38
41 = IMUL.i32 40, 38
42 = IADD.s32 41, 40
43 = IMUL.i32 42, 40
44 = IADD.s32 43, 42
45 = IMUL.i32 44, 42
46 = IADD.s32 45, 44
47 = IMUL.i32 46, 44
48 = IADD.s32 47, 46
49 = IMUL.i32 48, 46
50 = IADD.s32 49, 48
51 = IADD.s32 17, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
53 = LSHIFT_OR.i32 10, #0x0, #0x2.b0
57 = IADD.s32 u0, 53
59 = ICMP.u32.i1.lt 57, u0
60 = IADD.s32 59, u0[1]
STORE.i32 15, 57, 60, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = U32_TO_F32 r0
r1 = FMA.f32 r1, #0x2edbe6ff, #0x0.neg
r1 = F32_TO_S32.rtz r1
r2 = MOV.i32 u1
r3 = MOV.i32 #0x0
} -> block1
block1 {
r4 = ICMP.s32.m1.ge r3, #0x80
BRANCHZ.i16.eq r4.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r2 = IMUL.i32 r1, r2
r2 = IADD.s32 r2, r1
r1 = IMUL.i32 r2, r1
r1 = IADD.s32 r1, r2
r3 = IADD.s32 r3, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* r2 = MOV.i32 fau.x
+ _.h00 = U32_TO_F32 t1
* _.h00 = FMA.f32 t1, fau.y, t.neg
+ r3 = MOV.i32 fau.x
* NOP
+ r1 = F32_TO_S32.rtz t0
2edbe6ff00000000 800000000
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r3, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000080
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* _.h00 = IMUL.i32 r1, r2
+ r2 = IADD.s32 t, r1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ NOP
* NOP
+ r2 = IADD.s32 t0, r1
id(0) nbb
* _.h00 = IMUL.i32 r2, r1
+ r1 = IADD.s32 t, r2
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ NOP
* NOP
+ r1 = IADD.s32 t0, r2
id(0) nbb r_uncond no_prefetch pcrel(1)
* _.h00 = IMUL.i32 r1, r2
+ r2 = IADD.s32 t, r1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* NOP
+ r3 = IADD.s32 r3, fau.x
* NOP
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ NOP
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb r_uncond ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*MOV.i32 r2:t0, u1.w0
+U32_TO_F32 t1, t1
*FMA.f32 t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+MOV.i32 r3:t1, 0x00000000 /* 0.000000 */
*NOP t0
+F32_TO_S32.rtz r1:t1, t0
}
clause_5:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r3, 0x00000080 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_10
}
clause_8:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_26
}
clause_10:
ds(0) nbb ncph
{
*IMUL.i32 t0, r1, r2
+IADD.s32 r2:t1, t, r1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r2
+NOP t1
*NOP t0
+IADD.s32 r2:t1, t0, r1
}
clause_13:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r1
+IADD.s32 r1:t1, t, r2
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r1
+NOP t1
*NOP t0
+IADD.s32 r1:t1, t0, r2
}
clause_19:
ds(0) nbb r_uncond
{
*IMUL.i32 t0, r1, r2
+IADD.s32 r2:t1, t, r1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*NOP t0
+IADD.s32 r3:t1, r3, 0x00000001 /* 0.000000 */
*NOP t0
+JUMP t1, clause_5
}
clause_26:
ds(0) eos store
{
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+NOP t1
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
e20eea22 compute_sp_v1_int 18.450 GFLOPs 14.550ms
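The same arithmetic applies to compute_sp_v1_int, except each mad is an IMUL.i32 + IADD.s32 pair, so the "GFLOPs" label here effectively measures integer multiply-add throughput. A sketch assuming the same 65,536-invocation dispatch:
invocations = 256 * 256            # assumed dispatch size, not shown in this log
ops_per_inv = 128 * 16 * 2         # 128 loop iterations x 16 integer mads x 2 ops per mad
print(invocations * ops_per_inv / 14.550e-3 / 1e9)     # ~18.45, matching the reported figure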
compute shader ----------
#define KERNEL compute_sp_v2
#define LOCAL_SIZE_X 256
#define DATATYPE int
#define vec2 ivec2
#define vec4 ivec4
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 128
void compute_sp_v2()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
vec2 x = vec2(_A, (_A+DATATYPE(1)));
vec2 y = vec2((float(id) * SCALE), (float(id) * SCALE));
for(int i=0; i<64; i++)
{
MAD_16(x, y);
}
ptr[id] = (y.x) + (y.y);
}
void main() {compute_sp_v2();}
----------
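compute_sp_v2 is the 2-wide integer variant: vec2 is redefined to ivec2, so x and y each carry two independent dependent chains, and the loop count drops from 128 to 64 so the mad count per invocation stays the same as v1. The NIR below reflects this with four loop-carried values (the two lanes of x and y) plus the counter, and the two chains interleaved in block_4.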
shader: MESA_SHADER_COMPUTE
source_sha1: {0xa0084485, 0xd8844526, 0x94836c67, 0x8e6882bd, 0xb77caae3}
name: GLSL14
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp int[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp int _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_4 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_5 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_6 = ishl ssa_4.y, ssa_5
vec1 32 ssa_7 = iadd ssa_4.x, ssa_6
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_99 = insert_u16 ssa_4.z, ssa_1
vec1 32 ssa_10 = iadd ssa_7, ssa_99
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_11 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_12 = iadd ssa_11, ssa_1
vec1 32 ssa_13 = u2f32 ssa_10
vec1 32 ssa_2 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_14 = fmul ssa_13, ssa_2
vec1 32 ssa_15 = f2i32 ssa_14
vec1 32 ssa_3 = load_const (0x00000040 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_16 = phi block_0: ssa_15, block_4: ssa_84
vec1 32 ssa_17 = phi block_0: ssa_15, block_4: ssa_85
vec1 32 ssa_18 = phi block_0: ssa_11, block_4: ssa_80
vec1 32 ssa_19 = phi block_0: ssa_12, block_4: ssa_81
vec1 32 ssa_20 = phi block_0: ssa_0, block_4: ssa_86
vec1 32 ssa_21 = ige32 ssa_20, ssa_3
/* succs: block_2 block_3 */
if ssa_21 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_22 = imul ssa_16, ssa_18
vec1 32 ssa_23 = imul ssa_17, ssa_19
vec1 32 ssa_24 = iadd ssa_22, ssa_16
vec1 32 ssa_25 = iadd ssa_23, ssa_17
vec1 32 ssa_26 = imul ssa_24, ssa_16
vec1 32 ssa_27 = imul ssa_25, ssa_17
vec1 32 ssa_28 = iadd ssa_26, ssa_24
vec1 32 ssa_29 = iadd ssa_27, ssa_25
vec1 32 ssa_30 = imul ssa_28, ssa_24
vec1 32 ssa_31 = imul ssa_29, ssa_25
vec1 32 ssa_32 = iadd ssa_30, ssa_28
vec1 32 ssa_33 = iadd ssa_31, ssa_29
vec1 32 ssa_34 = imul ssa_32, ssa_28
vec1 32 ssa_35 = imul ssa_33, ssa_29
vec1 32 ssa_36 = iadd ssa_34, ssa_32
vec1 32 ssa_37 = iadd ssa_35, ssa_33
vec1 32 ssa_38 = imul ssa_36, ssa_32
vec1 32 ssa_39 = imul ssa_37, ssa_33
vec1 32 ssa_40 = iadd ssa_38, ssa_36
vec1 32 ssa_41 = iadd ssa_39, ssa_37
vec1 32 ssa_42 = imul ssa_40, ssa_36
vec1 32 ssa_43 = imul ssa_41, ssa_37
vec1 32 ssa_44 = iadd ssa_42, ssa_40
vec1 32 ssa_45 = iadd ssa_43, ssa_41
vec1 32 ssa_46 = imul ssa_44, ssa_40
vec1 32 ssa_47 = imul ssa_45, ssa_41
vec1 32 ssa_48 = iadd ssa_46, ssa_44
vec1 32 ssa_49 = iadd ssa_47, ssa_45
vec1 32 ssa_50 = imul ssa_48, ssa_44
vec1 32 ssa_51 = imul ssa_49, ssa_45
vec1 32 ssa_52 = iadd ssa_50, ssa_48
vec1 32 ssa_53 = iadd ssa_51, ssa_49
vec1 32 ssa_54 = imul ssa_52, ssa_48
vec1 32 ssa_55 = imul ssa_53, ssa_49
vec1 32 ssa_56 = iadd ssa_54, ssa_52
vec1 32 ssa_57 = iadd ssa_55, ssa_53
vec1 32 ssa_58 = imul ssa_56, ssa_52
vec1 32 ssa_59 = imul ssa_57, ssa_53
vec1 32 ssa_60 = iadd ssa_58, ssa_56
vec1 32 ssa_61 = iadd ssa_59, ssa_57
vec1 32 ssa_62 = imul ssa_60, ssa_56
vec1 32 ssa_63 = imul ssa_61, ssa_57
vec1 32 ssa_64 = iadd ssa_62, ssa_60
vec1 32 ssa_65 = iadd ssa_63, ssa_61
vec1 32 ssa_66 = imul ssa_64, ssa_60
vec1 32 ssa_67 = imul ssa_65, ssa_61
vec1 32 ssa_68 = iadd ssa_66, ssa_64
vec1 32 ssa_69 = iadd ssa_67, ssa_65
vec1 32 ssa_70 = imul ssa_68, ssa_64
vec1 32 ssa_71 = imul ssa_69, ssa_65
vec1 32 ssa_72 = iadd ssa_70, ssa_68
vec1 32 ssa_73 = iadd ssa_71, ssa_69
vec1 32 ssa_74 = imul ssa_72, ssa_68
vec1 32 ssa_75 = imul ssa_73, ssa_69
vec1 32 ssa_76 = iadd ssa_74, ssa_72
vec1 32 ssa_77 = iadd ssa_75, ssa_73
vec1 32 ssa_78 = imul ssa_76, ssa_72
vec1 32 ssa_79 = imul ssa_77, ssa_73
vec1 32 ssa_80 = iadd ssa_78, ssa_76
vec1 32 ssa_81 = iadd ssa_79, ssa_77
vec1 32 ssa_82 = imul ssa_80, ssa_76
vec1 32 ssa_83 = imul ssa_81, ssa_77
vec1 32 ssa_84 = iadd ssa_82, ssa_80
vec1 32 ssa_85 = iadd ssa_83, ssa_81
vec1 32 ssa_86 = iadd ssa_20, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_87 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_88 = ishl ssa_10, ssa_87
vec1 32 ssa_89 = iadd ssa_16, ssa_17
vec1 64 ssa_90 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_91 = unpack_64_2x32_split_x ssa_90
vec1 32 ssa_92 = unpack_64_2x32_split_y ssa_90
vec1 32 ssa_93 = iadd ssa_91, ssa_88
vec1 32 ssa_94 = ult32 ssa_93, ssa_91
vec1 32 ssa_95 = b2i32 ssa_94
vec1 32 ssa_96 = iadd ssa_95, ssa_92
vec1 64 ssa_97 = pack_64_2x32_split ssa_93, ssa_96
intrinsic store_global (ssa_89, ssa_97) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
102 = MOV.i32 r62
101 = MOV.i32 r61
100 = MOV.i32 r60
6 = LSHIFT_OR.i32 101, #0x0, #0x8.b0
7 = IADD.s32 100, 6
99 = MKVEC.v2i16 #0x0.h00, 102.h00
10 = IADD.s32 7, 99
12 = IADD.s32 u1, #0x1
13 = U32_TO_F32 10
14 = FMA.f32 13, #0x2edbe6ff, #0x0.neg
15 = F32_TO_S32.rtz 14
} -> block1
block1 {
16 = PHI 15, 84
17 = PHI 15, 85
18 = PHI u1, 80
19 = PHI 12, 81
20 = PHI #0x0, 86
21 = ICMP.s32.m1.ge 20, #0x40
BRANCHZ.i16.eq 21.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
22 = IMUL.i32 16, 18
23 = IMUL.i32 17, 19
24 = IADD.s32 22, 16
25 = IADD.s32 23, 17
26 = IMUL.i32 24, 16
27 = IMUL.i32 25, 17
28 = IADD.s32 26, 24
29 = IADD.s32 27, 25
30 = IMUL.i32 28, 24
31 = IMUL.i32 29, 25
32 = IADD.s32 30, 28
33 = IADD.s32 31, 29
34 = IMUL.i32 32, 28
35 = IMUL.i32 33, 29
36 = IADD.s32 34, 32
37 = IADD.s32 35, 33
38 = IMUL.i32 36, 32
39 = IMUL.i32 37, 33
40 = IADD.s32 38, 36
41 = IADD.s32 39, 37
42 = IMUL.i32 40, 36
43 = IMUL.i32 41, 37
44 = IADD.s32 42, 40
45 = IADD.s32 43, 41
46 = IMUL.i32 44, 40
47 = IMUL.i32 45, 41
48 = IADD.s32 46, 44
49 = IADD.s32 47, 45
50 = IMUL.i32 48, 44
51 = IMUL.i32 49, 45
52 = IADD.s32 50, 48
53 = IADD.s32 51, 49
54 = IMUL.i32 52, 48
55 = IMUL.i32 53, 49
56 = IADD.s32 54, 52
57 = IADD.s32 55, 53
58 = IMUL.i32 56, 52
59 = IMUL.i32 57, 53
60 = IADD.s32 58, 56
61 = IADD.s32 59, 57
62 = IMUL.i32 60, 56
63 = IMUL.i32 61, 57
64 = IADD.s32 62, 60
65 = IADD.s32 63, 61
66 = IMUL.i32 64, 60
67 = IMUL.i32 65, 61
68 = IADD.s32 66, 64
69 = IADD.s32 67, 65
70 = IMUL.i32 68, 64
71 = IMUL.i32 69, 65
72 = IADD.s32 70, 68
73 = IADD.s32 71, 69
74 = IMUL.i32 72, 68
75 = IMUL.i32 73, 69
76 = IADD.s32 74, 72
77 = IADD.s32 75, 73
78 = IMUL.i32 76, 72
79 = IMUL.i32 77, 73
80 = IADD.s32 78, 76
81 = IADD.s32 79, 77
82 = IMUL.i32 80, 76
83 = IMUL.i32 81, 77
84 = IADD.s32 82, 80
85 = IADD.s32 83, 81
86 = IADD.s32 20, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
88 = LSHIFT_OR.i32 10, #0x0, #0x2.b0
89 = IADD.s32 16, 17
93 = IADD.s32 u0, 88
95 = ICMP.u32.i1.lt 93, u0
96 = IADD.s32 95, u0[1]
STORE.i32 89, 93, 96, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x1
r1 = IADD.s32 u1, r1
r2 = U32_TO_F32 r0
r2 = FMA.f32 r2, #0x2edbe6ff, #0x0.neg
r2 = F32_TO_S32.rtz r2
r3 = MOV.i32 r2
r4 = MOV.i32 u1
r5 = MOV.i32 #0x0
} -> block1
block1 {
r6 = ICMP.s32.m1.ge r5, #0x40
BRANCHZ.i16.eq r6.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r4 = IMUL.i32 r2, r4
r1 = IMUL.i32 r3, r1
r4 = IADD.s32 r4, r2
r1 = IADD.s32 r1, r3
r2 = IMUL.i32 r4, r2
r3 = IMUL.i32 r1, r3
r2 = IADD.s32 r2, r4
r3 = IADD.s32 r3, r1
r5 = IADD.s32 r5, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = IADD.s32 r2, r3
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* NOP
+ r1 = MOV.i32 fau.x
* r4 = MOV.i32 fau.x
+ _.h00 = U32_TO_F32 r0
* _.h00 = FMA.f32 t1, fau.y, t.neg
+ r2 = F32_TO_S32.rtz t
* r3 = MOV.i32 t1
+ r5 = MOV.i32 fau.x
* NOP
+ r1 = IADD.s32 fau.x, r1
800000001 2edbe6ff00000000
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r5, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000040
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* _.h00 = IMUL.i32 r2, r4
+ r4 = IADD.s32 t, r2
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ NOP
* NOP
+ r4 = IADD.s32 t0, r2
id(0) nbb
* r2 = IMUL.i32 r4, r2
+ NOP
* _.h00 = IMUL.i32 r3, r1
+ r1 = IADD.s32 t, r3
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ NOP
* NOP
+ r3 = IADD.s32 t0, r1
id(0) nbb
* _.h00 = IMUL.i32 r3, r1
+ r1 = IADD.s32 t, r3
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* r1 = IMUL.i32 t1, r1
+ NOP
* NOP
+ r2 = IADD.s32 r2, r4
id(0) nbb
* _.h00 = IMUL.i32 r2, r4
+ r4 = IADD.s32 t, r2
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* NOP
+ r1 = IADD.s32 r1, r3
id(0) nbb r_uncond no_prefetch pcrel(1)
* _.h00 = IMUL.i32 r4, r2
+ r2 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r1, r3
+ r3 = IADD.s32 t, r1
* _.h00 = IMUL.i32 r2, r4
+ r4 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r1
+ r1 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r2
+ r2 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r1, r3
+ r3 = IADD.s32 t, r1
* NOP
+ r5 = IADD.s32 r5, fau.x
* NOP
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = IADD.s32 r2, r3
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb r_uncond ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*NOP t0
+MOV.i32 r1:t1, 0x00000001 /* 0.000000 */
*MOV.i32 r4:t0, u1.w0
+U32_TO_F32 t1, r0
*FMA.f32 t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+F32_TO_S32.rtz r2:t1, t
*MOV.i32 r3:t0, t1
+MOV.i32 r5:t1, #0.x
*NOP t0
+IADD.s32 r1:t1, u1.w0, r1
}
clause_6:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r5, 0x00000040 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_11
}
clause_9:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_41
}
clause_11:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r4
+IADD.s32 r4:t1, t, r2
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r4
+NOP t1
*NOP t0
+IADD.s32 r4:t1, t0, r2
}
clause_16:
ds(0) nbb ncph
{
*IMUL.i32 r2:t0, r4, r2
+NOP t1
*IMUL.i32 t0, r3, r1
+IADD.s32 r1:t1, t, r3
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r3
+NOP t1
*NOP t0
+IADD.s32 r3:t1, t0, r1
}
clause_22:
ds(0) nbb ncph
{
*IMUL.i32 t0, r3, r1
+IADD.s32 r1:t1, t, r3
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 r1:t0, t1, r1
+NOP t1
*NOP t0
+IADD.s32 r2:t1, r2, r4
}
clause_28:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r4
+IADD.s32 r4:t1, t, r2
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*NOP t0
+IADD.s32 r1:t1, r1, r3
}
clause_34:
ds(0) nbb r_uncond
{
*IMUL.i32 t0, r4, r2
+IADD.s32 r2:t1, t, r4
*IMUL.i32 t0, r1, r3
+IADD.s32 r3:t1, t, r1
*IMUL.i32 t0, r2, r4
+IADD.s32 r4:t1, t, r2
*IMUL.i32 t0, r3, r1
+IADD.s32 r1:t1, t, r3
*IMUL.i32 t0, r4, r2
+IADD.s32 r2:t1, t, r4
*IMUL.i32 t0, r1, r3
+IADD.s32 r3:t1, t, r1
*NOP t0
+IADD.s32 r5:t1, r5, 0x00000001 /* 0.000000 */
*NOP t0
+JUMP t1, clause_6
}
clause_41:
ds(0) eos store
{
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+IADD.s32 r1:t1, r2, r3
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
e20eea22 compute_sp_v2_int 20.068 GFLOPs 13.376ms
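The serial IMUL.i32/IADD.s32 pairs in the dump above are exactly what one MAD_4 step turns into once the preprocessor substitutes mad(a,b,c) as (a*b+c). A minimal sketch of a single expanded step, assuming an int DATATYPE as the integer ops in the IR suggest:

    // MAD_4(x, y) after macro expansion: four dependent integer mads,
    // lowered by the backend to the mul/add pairs seen in the clauses above.
    int x, y;          // loop-carried values from the kernel
    x = (y * x + y);
    y = (x * y + x);
    x = (y * x + y);
    y = (x * y + x);

Each mad depends on the result of the previous one, so the chain cannot be combined or reordered, which is the point of the "vector-width locked dependent code" comment in the kernel source.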
compute shader ----------
#define KERNEL compute_sp_v4
#define LOCAL_SIZE_X 256
#define DATATYPE int
#define vec2 ivec2
#define vec4 ivec4
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 145
void compute_sp_v4()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
vec4 x = vec4(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)));
vec4 y = vec4((float(id) * SCALE), (float(id) * SCALE), (float(id) * SCALE), (float(id) * SCALE));
for(int i=0; i<32; i++)
{
MAD_16(x, y);
}
ptr[id] = (y.x) + (y.y) + (y.z) + (y.w);
}
void main() {compute_sp_v4();}
----------
shader: MESA_SHADER_COMPUTE
source_sha1: {0x183901bc, 0xa5c63b78, 0x1c36bc35, 0xd2853d78, 0x1152fc21}
name: GLSL16
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp int[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp int _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_6 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_7 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_8 = ishl ssa_6.y, ssa_7
vec1 32 ssa_9 = iadd ssa_6.x, ssa_8
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_172 = insert_u16 ssa_6.z, ssa_1
vec1 32 ssa_12 = iadd ssa_9, ssa_172
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_13 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_14 = iadd ssa_13, ssa_1
vec1 32 ssa_2 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_15 = iadd ssa_13, ssa_2
vec1 32 ssa_3 = load_const (0x00000003 = 0.000000)
vec1 32 ssa_16 = iadd ssa_13, ssa_3
vec1 32 ssa_17 = u2f32 ssa_12
vec1 32 ssa_4 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_18 = fmul ssa_17, ssa_4
vec1 32 ssa_19 = f2i32 ssa_18
vec1 32 ssa_5 = load_const (0x00000020 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_20 = phi block_0: ssa_19, block_4: ssa_154
vec1 32 ssa_21 = phi block_0: ssa_19, block_4: ssa_155
vec1 32 ssa_22 = phi block_0: ssa_19, block_4: ssa_156
vec1 32 ssa_23 = phi block_0: ssa_19, block_4: ssa_157
vec1 32 ssa_24 = phi block_0: ssa_13, block_4: ssa_146
vec1 32 ssa_25 = phi block_0: ssa_14, block_4: ssa_147
vec1 32 ssa_26 = phi block_0: ssa_15, block_4: ssa_148
vec1 32 ssa_27 = phi block_0: ssa_16, block_4: ssa_149
vec1 32 ssa_28 = phi block_0: ssa_0, block_4: ssa_158
vec1 32 ssa_29 = ige32 ssa_28, ssa_5
/* succs: block_2 block_3 */
if ssa_29 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_30 = imul ssa_20, ssa_24
vec1 32 ssa_31 = imul ssa_21, ssa_25
vec1 32 ssa_32 = imul ssa_22, ssa_26
vec1 32 ssa_33 = imul ssa_23, ssa_27
vec1 32 ssa_34 = iadd ssa_30, ssa_20
vec1 32 ssa_35 = iadd ssa_31, ssa_21
vec1 32 ssa_36 = iadd ssa_32, ssa_22
vec1 32 ssa_37 = iadd ssa_33, ssa_23
vec1 32 ssa_38 = imul ssa_34, ssa_20
vec1 32 ssa_39 = imul ssa_35, ssa_21
vec1 32 ssa_40 = imul ssa_36, ssa_22
vec1 32 ssa_41 = imul ssa_37, ssa_23
vec1 32 ssa_42 = iadd ssa_38, ssa_34
vec1 32 ssa_43 = iadd ssa_39, ssa_35
vec1 32 ssa_44 = iadd ssa_40, ssa_36
vec1 32 ssa_45 = iadd ssa_41, ssa_37
vec1 32 ssa_46 = imul ssa_42, ssa_34
vec1 32 ssa_47 = imul ssa_43, ssa_35
vec1 32 ssa_48 = imul ssa_44, ssa_36
vec1 32 ssa_49 = imul ssa_45, ssa_37
vec1 32 ssa_50 = iadd ssa_46, ssa_42
vec1 32 ssa_51 = iadd ssa_47, ssa_43
vec1 32 ssa_52 = iadd ssa_48, ssa_44
vec1 32 ssa_53 = iadd ssa_49, ssa_45
vec1 32 ssa_54 = imul ssa_50, ssa_42
vec1 32 ssa_55 = imul ssa_51, ssa_43
vec1 32 ssa_56 = imul ssa_52, ssa_44
vec1 32 ssa_57 = imul ssa_53, ssa_45
vec1 32 ssa_58 = iadd ssa_54, ssa_50
vec1 32 ssa_59 = iadd ssa_55, ssa_51
vec1 32 ssa_60 = iadd ssa_56, ssa_52
vec1 32 ssa_61 = iadd ssa_57, ssa_53
vec1 32 ssa_62 = imul ssa_58, ssa_50
vec1 32 ssa_63 = imul ssa_59, ssa_51
vec1 32 ssa_64 = imul ssa_60, ssa_52
vec1 32 ssa_65 = imul ssa_61, ssa_53
vec1 32 ssa_66 = iadd ssa_62, ssa_58
vec1 32 ssa_67 = iadd ssa_63, ssa_59
vec1 32 ssa_68 = iadd ssa_64, ssa_60
vec1 32 ssa_69 = iadd ssa_65, ssa_61
vec1 32 ssa_70 = imul ssa_66, ssa_58
vec1 32 ssa_71 = imul ssa_67, ssa_59
vec1 32 ssa_72 = imul ssa_68, ssa_60
vec1 32 ssa_73 = imul ssa_69, ssa_61
vec1 32 ssa_74 = iadd ssa_70, ssa_66
vec1 32 ssa_75 = iadd ssa_71, ssa_67
vec1 32 ssa_76 = iadd ssa_72, ssa_68
vec1 32 ssa_77 = iadd ssa_73, ssa_69
vec1 32 ssa_78 = imul ssa_74, ssa_66
vec1 32 ssa_79 = imul ssa_75, ssa_67
vec1 32 ssa_80 = imul ssa_76, ssa_68
vec1 32 ssa_81 = imul ssa_77, ssa_69
vec1 32 ssa_82 = iadd ssa_78, ssa_74
vec1 32 ssa_83 = iadd ssa_79, ssa_75
vec1 32 ssa_84 = iadd ssa_80, ssa_76
vec1 32 ssa_85 = iadd ssa_81, ssa_77
vec1 32 ssa_86 = imul ssa_82, ssa_74
vec1 32 ssa_87 = imul ssa_83, ssa_75
vec1 32 ssa_88 = imul ssa_84, ssa_76
vec1 32 ssa_89 = imul ssa_85, ssa_77
vec1 32 ssa_90 = iadd ssa_86, ssa_82
vec1 32 ssa_91 = iadd ssa_87, ssa_83
vec1 32 ssa_92 = iadd ssa_88, ssa_84
vec1 32 ssa_93 = iadd ssa_89, ssa_85
vec1 32 ssa_94 = imul ssa_90, ssa_82
vec1 32 ssa_95 = imul ssa_91, ssa_83
vec1 32 ssa_96 = imul ssa_92, ssa_84
vec1 32 ssa_97 = imul ssa_93, ssa_85
vec1 32 ssa_98 = iadd ssa_94, ssa_90
vec1 32 ssa_99 = iadd ssa_95, ssa_91
vec1 32 ssa_100 = iadd ssa_96, ssa_92
vec1 32 ssa_101 = iadd ssa_97, ssa_93
vec1 32 ssa_102 = imul ssa_98, ssa_90
vec1 32 ssa_103 = imul ssa_99, ssa_91
vec1 32 ssa_104 = imul ssa_100, ssa_92
vec1 32 ssa_105 = imul ssa_101, ssa_93
vec1 32 ssa_106 = iadd ssa_102, ssa_98
vec1 32 ssa_107 = iadd ssa_103, ssa_99
vec1 32 ssa_108 = iadd ssa_104, ssa_100
vec1 32 ssa_109 = iadd ssa_105, ssa_101
vec1 32 ssa_110 = imul ssa_106, ssa_98
vec1 32 ssa_111 = imul ssa_107, ssa_99
vec1 32 ssa_112 = imul ssa_108, ssa_100
vec1 32 ssa_113 = imul ssa_109, ssa_101
vec1 32 ssa_114 = iadd ssa_110, ssa_106
vec1 32 ssa_115 = iadd ssa_111, ssa_107
vec1 32 ssa_116 = iadd ssa_112, ssa_108
vec1 32 ssa_117 = iadd ssa_113, ssa_109
vec1 32 ssa_118 = imul ssa_114, ssa_106
vec1 32 ssa_119 = imul ssa_115, ssa_107
vec1 32 ssa_120 = imul ssa_116, ssa_108
vec1 32 ssa_121 = imul ssa_117, ssa_109
vec1 32 ssa_122 = iadd ssa_118, ssa_114
vec1 32 ssa_123 = iadd ssa_119, ssa_115
vec1 32 ssa_124 = iadd ssa_120, ssa_116
vec1 32 ssa_125 = iadd ssa_121, ssa_117
vec1 32 ssa_126 = imul ssa_122, ssa_114
vec1 32 ssa_127 = imul ssa_123, ssa_115
vec1 32 ssa_128 = imul ssa_124, ssa_116
vec1 32 ssa_129 = imul ssa_125, ssa_117
vec1 32 ssa_130 = iadd ssa_126, ssa_122
vec1 32 ssa_131 = iadd ssa_127, ssa_123
vec1 32 ssa_132 = iadd ssa_128, ssa_124
vec1 32 ssa_133 = iadd ssa_129, ssa_125
vec1 32 ssa_134 = imul ssa_130, ssa_122
vec1 32 ssa_135 = imul ssa_131, ssa_123
vec1 32 ssa_136 = imul ssa_132, ssa_124
vec1 32 ssa_137 = imul ssa_133, ssa_125
vec1 32 ssa_138 = iadd ssa_134, ssa_130
vec1 32 ssa_139 = iadd ssa_135, ssa_131
vec1 32 ssa_140 = iadd ssa_136, ssa_132
vec1 32 ssa_141 = iadd ssa_137, ssa_133
vec1 32 ssa_142 = imul ssa_138, ssa_130
vec1 32 ssa_143 = imul ssa_139, ssa_131
vec1 32 ssa_144 = imul ssa_140, ssa_132
vec1 32 ssa_145 = imul ssa_141, ssa_133
vec1 32 ssa_146 = iadd ssa_142, ssa_138
vec1 32 ssa_147 = iadd ssa_143, ssa_139
vec1 32 ssa_148 = iadd ssa_144, ssa_140
vec1 32 ssa_149 = iadd ssa_145, ssa_141
vec1 32 ssa_150 = imul ssa_146, ssa_138
vec1 32 ssa_151 = imul ssa_147, ssa_139
vec1 32 ssa_152 = imul ssa_148, ssa_140
vec1 32 ssa_153 = imul ssa_149, ssa_141
vec1 32 ssa_154 = iadd ssa_150, ssa_146
vec1 32 ssa_155 = iadd ssa_151, ssa_147
vec1 32 ssa_156 = iadd ssa_152, ssa_148
vec1 32 ssa_157 = iadd ssa_153, ssa_149
vec1 32 ssa_158 = iadd ssa_28, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_159 = ishl ssa_12, ssa_2
vec1 32 ssa_160 = iadd ssa_20, ssa_21
vec1 32 ssa_161 = iadd ssa_160, ssa_22
vec1 32 ssa_162 = iadd ssa_161, ssa_23
vec1 64 ssa_163 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_164 = unpack_64_2x32_split_x ssa_163
vec1 32 ssa_165 = unpack_64_2x32_split_y ssa_163
vec1 32 ssa_166 = iadd ssa_164, ssa_159
vec1 32 ssa_167 = ult32 ssa_166, ssa_164
vec1 32 ssa_168 = b2i32 ssa_167
vec1 32 ssa_169 = iadd ssa_168, ssa_165
vec1 64 ssa_170 = pack_64_2x32_split ssa_166, ssa_169
intrinsic store_global (ssa_162, ssa_170) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
175 = MOV.i32 r62
174 = MOV.i32 r61
173 = MOV.i32 r60
8 = LSHIFT_OR.i32 174, #0x0, #0x8.b0
9 = IADD.s32 173, 8
172 = MKVEC.v2i16 #0x0.h00, 175.h00
12 = IADD.s32 9, 172
14 = IADD.s32 u1, #0x1
15 = IADD.s32 u1, #0x2
16 = IADD.s32 u1, #0x3
17 = U32_TO_F32 12
18 = FMA.f32 17, #0x2edbe6ff, #0x0.neg
19 = F32_TO_S32.rtz 18
} -> block1
block1 {
20 = PHI 19, 154
21 = PHI 19, 155
22 = PHI 19, 156
23 = PHI 19, 157
24 = PHI u1, 146
25 = PHI 14, 147
26 = PHI 15, 148
27 = PHI 16, 149
28 = PHI #0x0, 158
29 = ICMP.s32.m1.ge 28, #0x20
BRANCHZ.i16.eq 29.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
30 = IMUL.i32 20, 24
31 = IMUL.i32 21, 25
32 = IMUL.i32 22, 26
33 = IMUL.i32 23, 27
34 = IADD.s32 30, 20
35 = IADD.s32 31, 21
36 = IADD.s32 32, 22
37 = IADD.s32 33, 23
38 = IMUL.i32 34, 20
39 = IMUL.i32 35, 21
40 = IMUL.i32 36, 22
41 = IMUL.i32 37, 23
42 = IADD.s32 38, 34
43 = IADD.s32 39, 35
44 = IADD.s32 40, 36
45 = IADD.s32 41, 37
46 = IMUL.i32 42, 34
47 = IMUL.i32 43, 35
48 = IMUL.i32 44, 36
49 = IMUL.i32 45, 37
50 = IADD.s32 46, 42
51 = IADD.s32 47, 43
52 = IADD.s32 48, 44
53 = IADD.s32 49, 45
54 = IMUL.i32 50, 42
55 = IMUL.i32 51, 43
56 = IMUL.i32 52, 44
57 = IMUL.i32 53, 45
58 = IADD.s32 54, 50
59 = IADD.s32 55, 51
60 = IADD.s32 56, 52
61 = IADD.s32 57, 53
62 = IMUL.i32 58, 50
63 = IMUL.i32 59, 51
64 = IMUL.i32 60, 52
65 = IMUL.i32 61, 53
66 = IADD.s32 62, 58
67 = IADD.s32 63, 59
68 = IADD.s32 64, 60
69 = IADD.s32 65, 61
70 = IMUL.i32 66, 58
71 = IMUL.i32 67, 59
72 = IMUL.i32 68, 60
73 = IMUL.i32 69, 61
74 = IADD.s32 70, 66
75 = IADD.s32 71, 67
76 = IADD.s32 72, 68
77 = IADD.s32 73, 69
78 = IMUL.i32 74, 66
79 = IMUL.i32 75, 67
80 = IMUL.i32 76, 68
81 = IMUL.i32 77, 69
82 = IADD.s32 78, 74
83 = IADD.s32 79, 75
84 = IADD.s32 80, 76
85 = IADD.s32 81, 77
86 = IMUL.i32 82, 74
87 = IMUL.i32 83, 75
88 = IMUL.i32 84, 76
89 = IMUL.i32 85, 77
90 = IADD.s32 86, 82
91 = IADD.s32 87, 83
92 = IADD.s32 88, 84
93 = IADD.s32 89, 85
94 = IMUL.i32 90, 82
95 = IMUL.i32 91, 83
96 = IMUL.i32 92, 84
97 = IMUL.i32 93, 85
98 = IADD.s32 94, 90
99 = IADD.s32 95, 91
100 = IADD.s32 96, 92
101 = IADD.s32 97, 93
102 = IMUL.i32 98, 90
103 = IMUL.i32 99, 91
104 = IMUL.i32 100, 92
105 = IMUL.i32 101, 93
106 = IADD.s32 102, 98
107 = IADD.s32 103, 99
108 = IADD.s32 104, 100
109 = IADD.s32 105, 101
110 = IMUL.i32 106, 98
111 = IMUL.i32 107, 99
112 = IMUL.i32 108, 100
113 = IMUL.i32 109, 101
114 = IADD.s32 110, 106
115 = IADD.s32 111, 107
116 = IADD.s32 112, 108
117 = IADD.s32 113, 109
118 = IMUL.i32 114, 106
119 = IMUL.i32 115, 107
120 = IMUL.i32 116, 108
121 = IMUL.i32 117, 109
122 = IADD.s32 118, 114
123 = IADD.s32 119, 115
124 = IADD.s32 120, 116
125 = IADD.s32 121, 117
126 = IMUL.i32 122, 114
127 = IMUL.i32 123, 115
128 = IMUL.i32 124, 116
129 = IMUL.i32 125, 117
130 = IADD.s32 126, 122
131 = IADD.s32 127, 123
132 = IADD.s32 128, 124
133 = IADD.s32 129, 125
134 = IMUL.i32 130, 122
135 = IMUL.i32 131, 123
136 = IMUL.i32 132, 124
137 = IMUL.i32 133, 125
138 = IADD.s32 134, 130
139 = IADD.s32 135, 131
140 = IADD.s32 136, 132
141 = IADD.s32 137, 133
142 = IMUL.i32 138, 130
143 = IMUL.i32 139, 131
144 = IMUL.i32 140, 132
145 = IMUL.i32 141, 133
146 = IADD.s32 142, 138
147 = IADD.s32 143, 139
148 = IADD.s32 144, 140
149 = IADD.s32 145, 141
150 = IMUL.i32 146, 138
151 = IMUL.i32 147, 139
152 = IMUL.i32 148, 140
153 = IMUL.i32 149, 141
154 = IADD.s32 150, 146
155 = IADD.s32 151, 147
156 = IADD.s32 152, 148
157 = IADD.s32 153, 149
158 = IADD.s32 28, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
159 = LSHIFT_OR.i32 12, #0x0, #0x2.b0
160 = IADD.s32 20, 21
161 = IADD.s32 160, 22
162 = IADD.s32 161, 23
166 = IADD.s32 u0, 159
168 = ICMP.u32.i1.lt 166, u0
169 = IADD.s32 168, u0[1]
STORE.i32 162, 166, 169, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x1
r1 = IADD.s32 u1, r1
r2 = MOV.i32 #0x2
r2 = IADD.s32 u1, r2
r3 = MOV.i32 #0x3
r3 = IADD.s32 u1, r3
r4 = U32_TO_F32 r0
r4 = FMA.f32 r4, #0x2edbe6ff, #0x0.neg
r4 = F32_TO_S32.rtz r4
r5 = MOV.i32 r4
r6 = MOV.i32 r4
r7 = MOV.i32 r4
r8 = MOV.i32 u1
r9 = MOV.i32 #0x0
} -> block1
block1 {
r10 = ICMP.s32.m1.ge r9, #0x20
BRANCHZ.i16.eq r10.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r8 = IMUL.i32 r4, r8
r1 = IMUL.i32 r5, r1
r2 = IMUL.i32 r6, r2
r3 = IMUL.i32 r7, r3
r8 = IADD.s32 r8, r4
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r1, r5
r6 = IMUL.i32 r2, r6
r7 = IMUL.i32 r3, r7
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r1
r6 = IADD.s32 r6, r2
r7 = IADD.s32 r7, r3
r9 = IADD.s32 r9, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = IADD.s32 r4, r5
r1 = IADD.s32 r1, r6
r1 = IADD.s32 r1, r7
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
800000000
id(0) nbb r_uncond
* NOP
+ r1 = MOV.i32 fau.x
* r2 = MOV.i32 fau.y
+ _.h00 = U32_TO_F32 r0
* _.h00 = FMA.f32 t1, fau.y, t.neg
+ r4 = F32_TO_S32.rtz t
* r3 = MOV.i32 fau.x
+ r5 = MOV.i32 t1
* r6 = MOV.i32 r4
+ r7 = MOV.i32 r4
* r8 = MOV.i32 fau.x
+ r1 = IADD.s32 fau.x, r1
* r9 = MOV.i32 t
+ r2 = IADD.s32 fau.x, r2
* NOP
+ r3 = IADD.s32 fau.x, r3
200000001 2edbe6ff00000003
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r9, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000020
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r1 = IMUL.i32 r5, r1
+ NOP
* r2 = IMUL.i32 r6, r2
+ NOP
id(0) nbb
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* r4 = IMUL.i32 t1, r4
+ NOP
* _.h00 = IMUL.i32 r7, r3
+ r3 = IADD.s32 t, r7
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* r3 = IMUL.i32 t1, r3
+ NOP
* NOP
+ r1 = IADD.s32 r1, r5
id(0) nbb
* _.h00 = IMUL.i32 r1, r5
+ r5 = IADD.s32 t, r1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* r5 = IMUL.i32 t1, r5
+ NOP
* NOP
+ r4 = IADD.s32 r4, r8
id(0) nbb
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* r8 = IMUL.i32 t1, r8
+ NOP
* NOP
+ r3 = IADD.s32 r3, r7
id(0) nbb
* _.h00 = IMUL.i32 r3, r7
+ r7 = IADD.s32 t, r3
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* r7 = IMUL.i32 t1, r7
+ NOP
* NOP
+ r5 = IADD.s32 r5, r1
id(0) nbb
* _.h00 = IMUL.i32 r5, r1
+ r1 = IADD.s32 t, r5
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* r1 = IMUL.i32 t1, r1
+ NOP
* NOP
+ r2 = IADD.s32 r2, r6
id(0) nbb
* _.h00 = IMUL.i32 r2, r6
+ r6 = IADD.s32 t, r2
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* NOP
+ r8 = IADD.s32 r8, r4
id(0) nbb
* _.h00 = IMUL.i32 r6, r2
+ r2 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r2, r6
+ r6 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r6, r2
+ r2 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r2, r6
+ r6 = IADD.s32 t, r2
* NOP
+ r7 = IADD.s32 r7, r3
id(0) nbb
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r6, r2
+ r2 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r3
+ r3 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r2, r6
+ r6 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r7
+ r7 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* NOP
+ r1 = IADD.s32 r1, r5
id(0) nbb r_uncond no_prefetch pcrel(1)
* _.h00 = IMUL.i32 r6, r2
+ r2 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r3
+ r3 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r1, r5
+ r5 = IADD.s32 t, r1
* _.h00 = IMUL.i32 r2, r6
+ r6 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r7
+ r7 = IADD.s32 t, r3
* NOP
+ r9 = IADD.s32 r9, fau.x
* NOP
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) wait(0 ) nbb r_uncond
* NOP
+ _.h00 = IADD.s32 r4, r5
* NOP
+ _.h00 = IADD.s32 t1, r6
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = IADD.s32 t1, r7
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
}
clause_3:
ds(0) nbb r_uncond ncph
{
*NOP t0
+MOV.i32 r1:t1, 0x00000001 /* 0.000000 */
*MOV.i32 r2:t0, 0x00000002 /* 0.000000 */
+U32_TO_F32 t1, r0
*FMA.f32 t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+F32_TO_S32.rtz r4:t1, t
*MOV.i32 r3:t0, 0x00000003 /* 0.000000 */
+MOV.i32 r5:t1, t1
*MOV.i32 r6:t0, r4
+MOV.i32 r7:t1, r4
*MOV.i32 r8:t0, u1.w0
+IADD.s32 r1:t1, u1.w0, r1
*MOV.i32 r9:t0, #0
+IADD.s32 r2:t1, u1.w0, r2
*NOP t0
+IADD.s32 r3:t1, u1.w0, r3
}
clause_10:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r9, 0x00000020 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_15
}
clause_13:
ds(0) nbb next_store dwb(0)
{
*NOP t0
+JUMP t1, clause_72
}
clause_15:
ds(0) nbb ncph
{
*IMUL.i32 r1:t0, r5, r1
+NOP t1
*IMUL.i32 r2:t0, r6, r2
+NOP t1
}
clause_17:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 r4:t0, t1, r4
+NOP t1
*IMUL.i32 t0, r7, r3
+IADD.s32 r3:t1, t, r7
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 r3:t0, t1, r3
+NOP t1
*NOP t0
+IADD.s32 r1:t1, r1, r5
}
clause_23:
ds(0) nbb ncph
{
*IMUL.i32 t0, r1, r5
+IADD.s32 r5:t1, t, r1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 r5:t0, t1, r5
+NOP t1
*NOP t0
+IADD.s32 r4:t1, r4, r8
}
clause_29:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 r8:t0, t1, r8
+NOP t1
*NOP t0
+IADD.s32 r3:t1, r3, r7
}
clause_35:
ds(0) nbb ncph
{
*IMUL.i32 t0, r3, r7
+IADD.s32 r7:t1, t, r3
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 r7:t0, t1, r7
+NOP t1
*NOP t0
+IADD.s32 r5:t1, r5, r1
}
clause_41:
ds(0) nbb ncph
{
*IMUL.i32 t0, r5, r1
+IADD.s32 r1:t1, t, r5
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 r1:t0, t1, r1
+NOP t1
*NOP t0
+IADD.s32 r2:t1, r2, r6
}
clause_47:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r6
+IADD.s32 r6:t1, t, r2
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*NOP t0
+IADD.s32 r8:t1, r8, r4
}
clause_53:
ds(0) nbb ncph
{
*IMUL.i32 t0, r6, r2
+IADD.s32 r2:t1, t, r6
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r2, r6
+IADD.s32 r6:t1, t, r2
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, r6, r2
+IADD.s32 r2:t1, t, r6
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r2, r6
+IADD.s32 r6:t1, t, r2
*NOP t0
+IADD.s32 r7:t1, r7, r3
}
clause_59:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, r6, r2
+IADD.s32 r2:t1, t, r6
*IMUL.i32 t0, r7, r3
+IADD.s32 r3:t1, t, r7
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r2, r6
+IADD.s32 r6:t1, t, r2
*IMUL.i32 t0, r3, r7
+IADD.s32 r7:t1, t, r3
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*NOP t0
+IADD.s32 r1:t1, r1, r5
}
clause_65:
ds(0) nbb r_uncond
{
*IMUL.i32 t0, r6, r2
+IADD.s32 r2:t1, t, r6
*IMUL.i32 t0, r7, r3
+IADD.s32 r3:t1, t, r7
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r1, r5
+IADD.s32 r5:t1, t, r1
*IMUL.i32 t0, r2, r6
+IADD.s32 r6:t1, t, r2
*IMUL.i32 t0, r3, r7
+IADD.s32 r7:t1, t, r3
*NOP t0
+IADD.s32 r9:t1, r9, 0x00000001 /* 0.000000 */
*NOP t0
+JUMP t1, clause_10
}
clause_72:
ds(0) eos store
{
*NOP t0
+IADD.s32 t1, r4, r5
*NOP t0
+IADD.s32 t1, t1, r6
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+IADD.s32 r1:t1, t1, r7
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
e20eea22 compute_sp_v4_int 21.133 GFLOPs 12.702ms
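The v4 variant carries four independent mad chains per invocation: with vec4 redefined to ivec4, each MAD step is a componentwise integer mad, which is why the register-allocated code above interleaves four IMUL.i32/IADD.s32 chains over the register pairs (r4,r8), (r5,r1), (r6,r2), (r7,r3). A minimal sketch of one expanded step, assuming the macros from the listing:

    ivec4 x, y;          // four lanes of the dependent chain
    x = (y * x + y);     // componentwise integer mad
    y = (x * y + x);
    x = (y * x + y);
    y = (x * y + x);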
compute shader ----------
#define KERNEL compute_sp_v8
#define LOCAL_SIZE_X 256
#define DATATYPE int
#define vec2 ivec2
#define vec4 ivec4
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 162
void compute_sp_v8()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
vec8 x = VEC8(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)), (_A+DATATYPE(4)), (_A+DATATYPE(5)), (_A+DATATYPE(6)), (_A+DATATYPE(7)));
vec8 y = VEC8_S(DATATYPE(float(id) * SCALE));
#undef mad
#define mad mad8
for(int i=0; i<16; i++)
{
MAD_16(x, y);
}
vec4 s = y.d0 + y.d1;
vec2 t = s.xy + s.zw;
ptr[id] = t.x + t.y;
}
void main() {compute_sp_v8();}
----------
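For the v8 variant, mad is redefined to mad8, so each MAD step works on the two ivec4 halves of the vec8 struct, giving eight independent lanes per invocation. A minimal sketch of one expanded mad, assuming the macros from the listing above:

    // x = mad(y, x, y) with mad == mad8 expands to a componentwise
    // multiply then add on each ivec4 half of the struct.
    vec8 x, y;
    x = vec8(y.d0 * x.d0 + y.d0,
             y.d1 * x.d1 + y.d1);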
shader: MESA_SHADER_COMPUTE
source_sha1: {0x5c855ea2, 0x8971a97c, 0xa0adc9e8, 0x273dc174, 0x2821d310}
name: GLSL18
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp int[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp int _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_10 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_11 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_12 = ishl ssa_10.y, ssa_11
vec1 32 ssa_13 = iadd ssa_10.x, ssa_12
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_319 = insert_u16 ssa_10.z, ssa_1
vec1 32 ssa_15 = iadd ssa_13, ssa_319
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_16 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_17 = iadd ssa_16, ssa_1
vec1 32 ssa_2 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_18 = iadd ssa_16, ssa_2
vec1 32 ssa_3 = load_const (0x00000003 = 0.000000)
vec1 32 ssa_19 = iadd ssa_16, ssa_3
vec1 32 ssa_4 = load_const (0x00000004 = 0.000000)
vec1 32 ssa_20 = iadd ssa_16, ssa_4
vec1 32 ssa_5 = load_const (0x00000005 = 0.000000)
vec1 32 ssa_21 = iadd ssa_16, ssa_5
vec1 32 ssa_6 = load_const (0x00000006 = 0.000000)
vec1 32 ssa_22 = iadd ssa_16, ssa_6
vec1 32 ssa_7 = load_const (0x00000007 = 0.000000)
vec1 32 ssa_23 = iadd ssa_16, ssa_7
vec1 32 ssa_24 = u2f32 ssa_15
vec1 32 ssa_8 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_25 = fmul ssa_24, ssa_8
vec1 32 ssa_26 = f2i32 ssa_25
vec1 32 ssa_9 = load_const (0x00000010 = 0.000000)
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_27 = phi block_0: ssa_26, block_4: ssa_297
vec1 32 ssa_28 = phi block_0: ssa_26, block_4: ssa_298
vec1 32 ssa_29 = phi block_0: ssa_26, block_4: ssa_299
vec1 32 ssa_30 = phi block_0: ssa_26, block_4: ssa_300
vec1 32 ssa_31 = phi block_0: ssa_26, block_4: ssa_293
vec1 32 ssa_32 = phi block_0: ssa_26, block_4: ssa_294
vec1 32 ssa_33 = phi block_0: ssa_26, block_4: ssa_295
vec1 32 ssa_34 = phi block_0: ssa_26, block_4: ssa_296
vec1 32 ssa_35 = phi block_0: ssa_20, block_4: ssa_281
vec1 32 ssa_36 = phi block_0: ssa_21, block_4: ssa_282
vec1 32 ssa_37 = phi block_0: ssa_22, block_4: ssa_283
vec1 32 ssa_38 = phi block_0: ssa_23, block_4: ssa_284
vec1 32 ssa_39 = phi block_0: ssa_16, block_4: ssa_277
vec1 32 ssa_40 = phi block_0: ssa_17, block_4: ssa_278
vec1 32 ssa_41 = phi block_0: ssa_18, block_4: ssa_279
vec1 32 ssa_42 = phi block_0: ssa_19, block_4: ssa_280
vec1 32 ssa_43 = phi block_0: ssa_0, block_4: ssa_301
vec1 32 ssa_44 = ige32 ssa_43, ssa_9
/* succs: block_2 block_3 */
if ssa_44 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_45 = imul ssa_31, ssa_39
vec1 32 ssa_46 = imul ssa_32, ssa_40
vec1 32 ssa_47 = imul ssa_33, ssa_41
vec1 32 ssa_48 = imul ssa_34, ssa_42
vec1 32 ssa_49 = imul ssa_27, ssa_35
vec1 32 ssa_50 = imul ssa_28, ssa_36
vec1 32 ssa_51 = imul ssa_29, ssa_37
vec1 32 ssa_52 = imul ssa_30, ssa_38
vec1 32 ssa_53 = iadd ssa_45, ssa_31
vec1 32 ssa_54 = iadd ssa_46, ssa_32
vec1 32 ssa_55 = iadd ssa_47, ssa_33
vec1 32 ssa_56 = iadd ssa_48, ssa_34
vec1 32 ssa_57 = iadd ssa_49, ssa_27
vec1 32 ssa_58 = iadd ssa_50, ssa_28
vec1 32 ssa_59 = iadd ssa_51, ssa_29
vec1 32 ssa_60 = iadd ssa_52, ssa_30
vec1 32 ssa_61 = imul ssa_53, ssa_31
vec1 32 ssa_62 = imul ssa_54, ssa_32
vec1 32 ssa_63 = imul ssa_55, ssa_33
vec1 32 ssa_64 = imul ssa_56, ssa_34
vec1 32 ssa_65 = imul ssa_57, ssa_27
vec1 32 ssa_66 = imul ssa_58, ssa_28
vec1 32 ssa_67 = imul ssa_59, ssa_29
vec1 32 ssa_68 = imul ssa_60, ssa_30
vec1 32 ssa_69 = iadd ssa_61, ssa_53
vec1 32 ssa_70 = iadd ssa_62, ssa_54
vec1 32 ssa_71 = iadd ssa_63, ssa_55
vec1 32 ssa_72 = iadd ssa_64, ssa_56
vec1 32 ssa_73 = iadd ssa_65, ssa_57
vec1 32 ssa_74 = iadd ssa_66, ssa_58
vec1 32 ssa_75 = iadd ssa_67, ssa_59
vec1 32 ssa_76 = iadd ssa_68, ssa_60
vec1 32 ssa_77 = imul ssa_69, ssa_53
vec1 32 ssa_78 = imul ssa_70, ssa_54
vec1 32 ssa_79 = imul ssa_71, ssa_55
vec1 32 ssa_80 = imul ssa_72, ssa_56
vec1 32 ssa_81 = imul ssa_73, ssa_57
vec1 32 ssa_82 = imul ssa_74, ssa_58
vec1 32 ssa_83 = imul ssa_75, ssa_59
vec1 32 ssa_84 = imul ssa_76, ssa_60
vec1 32 ssa_85 = iadd ssa_77, ssa_69
vec1 32 ssa_86 = iadd ssa_78, ssa_70
vec1 32 ssa_87 = iadd ssa_79, ssa_71
vec1 32 ssa_88 = iadd ssa_80, ssa_72
vec1 32 ssa_89 = iadd ssa_81, ssa_73
vec1 32 ssa_90 = iadd ssa_82, ssa_74
vec1 32 ssa_91 = iadd ssa_83, ssa_75
vec1 32 ssa_92 = iadd ssa_84, ssa_76
vec1 32 ssa_93 = imul ssa_85, ssa_69
vec1 32 ssa_94 = imul ssa_86, ssa_70
vec1 32 ssa_95 = imul ssa_87, ssa_71
vec1 32 ssa_96 = imul ssa_88, ssa_72
vec1 32 ssa_97 = imul ssa_89, ssa_73
vec1 32 ssa_98 = imul ssa_90, ssa_74
vec1 32 ssa_99 = imul ssa_91, ssa_75
vec1 32 ssa_100 = imul ssa_92, ssa_76
vec1 32 ssa_101 = iadd ssa_93, ssa_85
vec1 32 ssa_102 = iadd ssa_94, ssa_86
vec1 32 ssa_103 = iadd ssa_95, ssa_87
vec1 32 ssa_104 = iadd ssa_96, ssa_88
vec1 32 ssa_105 = iadd ssa_97, ssa_89
vec1 32 ssa_106 = iadd ssa_98, ssa_90
vec1 32 ssa_107 = iadd ssa_99, ssa_91
vec1 32 ssa_108 = iadd ssa_100, ssa_92
vec1 32 ssa_109 = imul ssa_101, ssa_85
vec1 32 ssa_110 = imul ssa_102, ssa_86
vec1 32 ssa_111 = imul ssa_103, ssa_87
vec1 32 ssa_112 = imul ssa_104, ssa_88
vec1 32 ssa_113 = imul ssa_105, ssa_89
vec1 32 ssa_114 = imul ssa_106, ssa_90
vec1 32 ssa_115 = imul ssa_107, ssa_91
vec1 32 ssa_116 = imul ssa_108, ssa_92
vec1 32 ssa_117 = iadd ssa_109, ssa_101
vec1 32 ssa_118 = iadd ssa_110, ssa_102
vec1 32 ssa_119 = iadd ssa_111, ssa_103
vec1 32 ssa_120 = iadd ssa_112, ssa_104
vec1 32 ssa_121 = iadd ssa_113, ssa_105
vec1 32 ssa_122 = iadd ssa_114, ssa_106
vec1 32 ssa_123 = iadd ssa_115, ssa_107
vec1 32 ssa_124 = iadd ssa_116, ssa_108
vec1 32 ssa_125 = imul ssa_117, ssa_101
vec1 32 ssa_126 = imul ssa_118, ssa_102
vec1 32 ssa_127 = imul ssa_119, ssa_103
vec1 32 ssa_128 = imul ssa_120, ssa_104
vec1 32 ssa_129 = imul ssa_121, ssa_105
vec1 32 ssa_130 = imul ssa_122, ssa_106
vec1 32 ssa_131 = imul ssa_123, ssa_107
vec1 32 ssa_132 = imul ssa_124, ssa_108
vec1 32 ssa_133 = iadd ssa_125, ssa_117
vec1 32 ssa_134 = iadd ssa_126, ssa_118
vec1 32 ssa_135 = iadd ssa_127, ssa_119
vec1 32 ssa_136 = iadd ssa_128, ssa_120
vec1 32 ssa_137 = iadd ssa_129, ssa_121
vec1 32 ssa_138 = iadd ssa_130, ssa_122
vec1 32 ssa_139 = iadd ssa_131, ssa_123
vec1 32 ssa_140 = iadd ssa_132, ssa_124
vec1 32 ssa_141 = imul ssa_133, ssa_117
vec1 32 ssa_142 = imul ssa_134, ssa_118
vec1 32 ssa_143 = imul ssa_135, ssa_119
vec1 32 ssa_144 = imul ssa_136, ssa_120
vec1 32 ssa_145 = imul ssa_137, ssa_121
vec1 32 ssa_146 = imul ssa_138, ssa_122
vec1 32 ssa_147 = imul ssa_139, ssa_123
vec1 32 ssa_148 = imul ssa_140, ssa_124
vec1 32 ssa_149 = iadd ssa_141, ssa_133
vec1 32 ssa_150 = iadd ssa_142, ssa_134
vec1 32 ssa_151 = iadd ssa_143, ssa_135
vec1 32 ssa_152 = iadd ssa_144, ssa_136
vec1 32 ssa_153 = iadd ssa_145, ssa_137
vec1 32 ssa_154 = iadd ssa_146, ssa_138
vec1 32 ssa_155 = iadd ssa_147, ssa_139
vec1 32 ssa_156 = iadd ssa_148, ssa_140
vec1 32 ssa_157 = imul ssa_149, ssa_133
vec1 32 ssa_158 = imul ssa_150, ssa_134
vec1 32 ssa_159 = imul ssa_151, ssa_135
vec1 32 ssa_160 = imul ssa_152, ssa_136
vec1 32 ssa_161 = imul ssa_153, ssa_137
vec1 32 ssa_162 = imul ssa_154, ssa_138
vec1 32 ssa_163 = imul ssa_155, ssa_139
vec1 32 ssa_164 = imul ssa_156, ssa_140
vec1 32 ssa_165 = iadd ssa_157, ssa_149
vec1 32 ssa_166 = iadd ssa_158, ssa_150
vec1 32 ssa_167 = iadd ssa_159, ssa_151
vec1 32 ssa_168 = iadd ssa_160, ssa_152
vec1 32 ssa_169 = iadd ssa_161, ssa_153
vec1 32 ssa_170 = iadd ssa_162, ssa_154
vec1 32 ssa_171 = iadd ssa_163, ssa_155
vec1 32 ssa_172 = iadd ssa_164, ssa_156
vec1 32 ssa_173 = imul ssa_165, ssa_149
vec1 32 ssa_174 = imul ssa_166, ssa_150
vec1 32 ssa_175 = imul ssa_167, ssa_151
vec1 32 ssa_176 = imul ssa_168, ssa_152
vec1 32 ssa_177 = imul ssa_169, ssa_153
vec1 32 ssa_178 = imul ssa_170, ssa_154
vec1 32 ssa_179 = imul ssa_171, ssa_155
vec1 32 ssa_180 = imul ssa_172, ssa_156
vec1 32 ssa_181 = iadd ssa_173, ssa_165
vec1 32 ssa_182 = iadd ssa_174, ssa_166
vec1 32 ssa_183 = iadd ssa_175, ssa_167
vec1 32 ssa_184 = iadd ssa_176, ssa_168
vec1 32 ssa_185 = iadd ssa_177, ssa_169
vec1 32 ssa_186 = iadd ssa_178, ssa_170
vec1 32 ssa_187 = iadd ssa_179, ssa_171
vec1 32 ssa_188 = iadd ssa_180, ssa_172
vec1 32 ssa_189 = imul ssa_181, ssa_165
vec1 32 ssa_190 = imul ssa_182, ssa_166
vec1 32 ssa_191 = imul ssa_183, ssa_167
vec1 32 ssa_192 = imul ssa_184, ssa_168
vec1 32 ssa_193 = imul ssa_185, ssa_169
vec1 32 ssa_194 = imul ssa_186, ssa_170
vec1 32 ssa_195 = imul ssa_187, ssa_171
vec1 32 ssa_196 = imul ssa_188, ssa_172
vec1 32 ssa_197 = iadd ssa_189, ssa_181
vec1 32 ssa_198 = iadd ssa_190, ssa_182
vec1 32 ssa_199 = iadd ssa_191, ssa_183
vec1 32 ssa_200 = iadd ssa_192, ssa_184
vec1 32 ssa_201 = iadd ssa_193, ssa_185
vec1 32 ssa_202 = iadd ssa_194, ssa_186
vec1 32 ssa_203 = iadd ssa_195, ssa_187
vec1 32 ssa_204 = iadd ssa_196, ssa_188
vec1 32 ssa_205 = imul ssa_197, ssa_181
vec1 32 ssa_206 = imul ssa_198, ssa_182
vec1 32 ssa_207 = imul ssa_199, ssa_183
vec1 32 ssa_208 = imul ssa_200, ssa_184
vec1 32 ssa_209 = imul ssa_201, ssa_185
vec1 32 ssa_210 = imul ssa_202, ssa_186
vec1 32 ssa_211 = imul ssa_203, ssa_187
vec1 32 ssa_212 = imul ssa_204, ssa_188
vec1 32 ssa_213 = iadd ssa_205, ssa_197
vec1 32 ssa_214 = iadd ssa_206, ssa_198
vec1 32 ssa_215 = iadd ssa_207, ssa_199
vec1 32 ssa_216 = iadd ssa_208, ssa_200
vec1 32 ssa_217 = iadd ssa_209, ssa_201
vec1 32 ssa_218 = iadd ssa_210, ssa_202
vec1 32 ssa_219 = iadd ssa_211, ssa_203
vec1 32 ssa_220 = iadd ssa_212, ssa_204
vec1 32 ssa_221 = imul ssa_213, ssa_197
vec1 32 ssa_222 = imul ssa_214, ssa_198
vec1 32 ssa_223 = imul ssa_215, ssa_199
vec1 32 ssa_224 = imul ssa_216, ssa_200
vec1 32 ssa_225 = imul ssa_217, ssa_201
vec1 32 ssa_226 = imul ssa_218, ssa_202
vec1 32 ssa_227 = imul ssa_219, ssa_203
vec1 32 ssa_228 = imul ssa_220, ssa_204
vec1 32 ssa_229 = iadd ssa_221, ssa_213
vec1 32 ssa_230 = iadd ssa_222, ssa_214
vec1 32 ssa_231 = iadd ssa_223, ssa_215
vec1 32 ssa_232 = iadd ssa_224, ssa_216
vec1 32 ssa_233 = iadd ssa_225, ssa_217
vec1 32 ssa_234 = iadd ssa_226, ssa_218
vec1 32 ssa_235 = iadd ssa_227, ssa_219
vec1 32 ssa_236 = iadd ssa_228, ssa_220
vec1 32 ssa_237 = imul ssa_229, ssa_213
vec1 32 ssa_238 = imul ssa_230, ssa_214
vec1 32 ssa_239 = imul ssa_231, ssa_215
vec1 32 ssa_240 = imul ssa_232, ssa_216
vec1 32 ssa_241 = imul ssa_233, ssa_217
vec1 32 ssa_242 = imul ssa_234, ssa_218
vec1 32 ssa_243 = imul ssa_235, ssa_219
vec1 32 ssa_244 = imul ssa_236, ssa_220
vec1 32 ssa_245 = iadd ssa_237, ssa_229
vec1 32 ssa_246 = iadd ssa_238, ssa_230
vec1 32 ssa_247 = iadd ssa_239, ssa_231
vec1 32 ssa_248 = iadd ssa_240, ssa_232
vec1 32 ssa_249 = iadd ssa_241, ssa_233
vec1 32 ssa_250 = iadd ssa_242, ssa_234
vec1 32 ssa_251 = iadd ssa_243, ssa_235
vec1 32 ssa_252 = iadd ssa_244, ssa_236
vec1 32 ssa_253 = imul ssa_245, ssa_229
vec1 32 ssa_254 = imul ssa_246, ssa_230
vec1 32 ssa_255 = imul ssa_247, ssa_231
vec1 32 ssa_256 = imul ssa_248, ssa_232
vec1 32 ssa_257 = imul ssa_249, ssa_233
vec1 32 ssa_258 = imul ssa_250, ssa_234
vec1 32 ssa_259 = imul ssa_251, ssa_235
vec1 32 ssa_260 = imul ssa_252, ssa_236
vec1 32 ssa_261 = iadd ssa_253, ssa_245
vec1 32 ssa_262 = iadd ssa_254, ssa_246
vec1 32 ssa_263 = iadd ssa_255, ssa_247
vec1 32 ssa_264 = iadd ssa_256, ssa_248
vec1 32 ssa_265 = iadd ssa_257, ssa_249
vec1 32 ssa_266 = iadd ssa_258, ssa_250
vec1 32 ssa_267 = iadd ssa_259, ssa_251
vec1 32 ssa_268 = iadd ssa_260, ssa_252
vec1 32 ssa_269 = imul ssa_261, ssa_245
vec1 32 ssa_270 = imul ssa_262, ssa_246
vec1 32 ssa_271 = imul ssa_263, ssa_247
vec1 32 ssa_272 = imul ssa_264, ssa_248
vec1 32 ssa_273 = imul ssa_265, ssa_249
vec1 32 ssa_274 = imul ssa_266, ssa_250
vec1 32 ssa_275 = imul ssa_267, ssa_251
vec1 32 ssa_276 = imul ssa_268, ssa_252
vec1 32 ssa_277 = iadd ssa_269, ssa_261
vec1 32 ssa_278 = iadd ssa_270, ssa_262
vec1 32 ssa_279 = iadd ssa_271, ssa_263
vec1 32 ssa_280 = iadd ssa_272, ssa_264
vec1 32 ssa_281 = iadd ssa_273, ssa_265
vec1 32 ssa_282 = iadd ssa_274, ssa_266
vec1 32 ssa_283 = iadd ssa_275, ssa_267
vec1 32 ssa_284 = iadd ssa_276, ssa_268
vec1 32 ssa_285 = imul ssa_277, ssa_261
vec1 32 ssa_286 = imul ssa_278, ssa_262
vec1 32 ssa_287 = imul ssa_279, ssa_263
vec1 32 ssa_288 = imul ssa_280, ssa_264
vec1 32 ssa_289 = imul ssa_281, ssa_265
vec1 32 ssa_290 = imul ssa_282, ssa_266
vec1 32 ssa_291 = imul ssa_283, ssa_267
vec1 32 ssa_292 = imul ssa_284, ssa_268
vec1 32 ssa_293 = iadd ssa_285, ssa_277
vec1 32 ssa_294 = iadd ssa_286, ssa_278
vec1 32 ssa_295 = iadd ssa_287, ssa_279
vec1 32 ssa_296 = iadd ssa_288, ssa_280
vec1 32 ssa_297 = iadd ssa_289, ssa_281
vec1 32 ssa_298 = iadd ssa_290, ssa_282
vec1 32 ssa_299 = iadd ssa_291, ssa_283
vec1 32 ssa_300 = iadd ssa_292, ssa_284
vec1 32 ssa_301 = iadd ssa_43, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_302 = iadd ssa_31, ssa_27
vec1 32 ssa_303 = iadd ssa_32, ssa_28
vec1 32 ssa_304 = iadd ssa_33, ssa_29
vec1 32 ssa_305 = iadd ssa_34, ssa_30
vec1 32 ssa_306 = iadd ssa_302, ssa_304
vec1 32 ssa_307 = iadd ssa_303, ssa_305
vec1 32 ssa_308 = ishl ssa_15, ssa_2
vec1 32 ssa_309 = iadd ssa_306, ssa_307
vec1 64 ssa_310 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_311 = unpack_64_2x32_split_x ssa_310
vec1 32 ssa_312 = unpack_64_2x32_split_y ssa_310
vec1 32 ssa_313 = iadd ssa_311, ssa_308
vec1 32 ssa_314 = ult32 ssa_313, ssa_311
vec1 32 ssa_315 = b2i32 ssa_314
vec1 32 ssa_316 = iadd ssa_315, ssa_312
vec1 64 ssa_317 = pack_64_2x32_split ssa_313, ssa_316
intrinsic store_global (ssa_309, ssa_317) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
322 = MOV.i32 r62
321 = MOV.i32 r61
320 = MOV.i32 r60
12 = LSHIFT_OR.i32 321, #0x0, #0x8.b0
13 = IADD.s32 320, 12
319 = MKVEC.v2i16 #0x0.h00, 322.h00
15 = IADD.s32 13, 319
17 = IADD.s32 u1, #0x1
18 = IADD.s32 u1, #0x2
19 = IADD.s32 u1, #0x3
20 = IADD.s32 u1, #0x4
21 = IADD.s32 u1, #0x5
22 = IADD.s32 u1, #0x6
23 = IADD.s32 u1, #0x7
24 = U32_TO_F32 15
25 = FMA.f32 24, #0x2edbe6ff, #0x0.neg
26 = F32_TO_S32.rtz 25
} -> block1
block1 {
27 = PHI 26, 297
28 = PHI 26, 298
29 = PHI 26, 299
30 = PHI 26, 300
31 = PHI 26, 293
32 = PHI 26, 294
33 = PHI 26, 295
34 = PHI 26, 296
35 = PHI 20, 281
36 = PHI 21, 282
37 = PHI 22, 283
38 = PHI 23, 284
39 = PHI u1, 277
40 = PHI 17, 278
41 = PHI 18, 279
42 = PHI 19, 280
43 = PHI #0x0, 301
44 = ICMP.s32.m1.ge 43, #0x10
BRANCHZ.i16.eq 44.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
45 = IMUL.i32 31, 39
46 = IMUL.i32 32, 40
47 = IMUL.i32 33, 41
48 = IMUL.i32 34, 42
49 = IMUL.i32 27, 35
50 = IMUL.i32 28, 36
51 = IMUL.i32 29, 37
52 = IMUL.i32 30, 38
53 = IADD.s32 45, 31
54 = IADD.s32 46, 32
55 = IADD.s32 47, 33
56 = IADD.s32 48, 34
57 = IADD.s32 49, 27
58 = IADD.s32 50, 28
59 = IADD.s32 51, 29
60 = IADD.s32 52, 30
61 = IMUL.i32 53, 31
62 = IMUL.i32 54, 32
63 = IMUL.i32 55, 33
64 = IMUL.i32 56, 34
65 = IMUL.i32 57, 27
66 = IMUL.i32 58, 28
67 = IMUL.i32 59, 29
68 = IMUL.i32 60, 30
69 = IADD.s32 61, 53
70 = IADD.s32 62, 54
71 = IADD.s32 63, 55
72 = IADD.s32 64, 56
73 = IADD.s32 65, 57
74 = IADD.s32 66, 58
75 = IADD.s32 67, 59
76 = IADD.s32 68, 60
77 = IMUL.i32 69, 53
78 = IMUL.i32 70, 54
79 = IMUL.i32 71, 55
80 = IMUL.i32 72, 56
81 = IMUL.i32 73, 57
82 = IMUL.i32 74, 58
83 = IMUL.i32 75, 59
84 = IMUL.i32 76, 60
85 = IADD.s32 77, 69
86 = IADD.s32 78, 70
87 = IADD.s32 79, 71
88 = IADD.s32 80, 72
89 = IADD.s32 81, 73
90 = IADD.s32 82, 74
91 = IADD.s32 83, 75
92 = IADD.s32 84, 76
93 = IMUL.i32 85, 69
94 = IMUL.i32 86, 70
95 = IMUL.i32 87, 71
96 = IMUL.i32 88, 72
97 = IMUL.i32 89, 73
98 = IMUL.i32 90, 74
99 = IMUL.i32 91, 75
100 = IMUL.i32 92, 76
101 = IADD.s32 93, 85
102 = IADD.s32 94, 86
103 = IADD.s32 95, 87
104 = IADD.s32 96, 88
105 = IADD.s32 97, 89
106 = IADD.s32 98, 90
107 = IADD.s32 99, 91
108 = IADD.s32 100, 92
109 = IMUL.i32 101, 85
110 = IMUL.i32 102, 86
111 = IMUL.i32 103, 87
112 = IMUL.i32 104, 88
113 = IMUL.i32 105, 89
114 = IMUL.i32 106, 90
115 = IMUL.i32 107, 91
116 = IMUL.i32 108, 92
117 = IADD.s32 109, 101
118 = IADD.s32 110, 102
119 = IADD.s32 111, 103
120 = IADD.s32 112, 104
121 = IADD.s32 113, 105
122 = IADD.s32 114, 106
123 = IADD.s32 115, 107
124 = IADD.s32 116, 108
125 = IMUL.i32 117, 101
126 = IMUL.i32 118, 102
127 = IMUL.i32 119, 103
128 = IMUL.i32 120, 104
129 = IMUL.i32 121, 105
130 = IMUL.i32 122, 106
131 = IMUL.i32 123, 107
132 = IMUL.i32 124, 108
133 = IADD.s32 125, 117
134 = IADD.s32 126, 118
135 = IADD.s32 127, 119
136 = IADD.s32 128, 120
137 = IADD.s32 129, 121
138 = IADD.s32 130, 122
139 = IADD.s32 131, 123
140 = IADD.s32 132, 124
141 = IMUL.i32 133, 117
142 = IMUL.i32 134, 118
143 = IMUL.i32 135, 119
144 = IMUL.i32 136, 120
145 = IMUL.i32 137, 121
146 = IMUL.i32 138, 122
147 = IMUL.i32 139, 123
148 = IMUL.i32 140, 124
149 = IADD.s32 141, 133
150 = IADD.s32 142, 134
151 = IADD.s32 143, 135
152 = IADD.s32 144, 136
153 = IADD.s32 145, 137
154 = IADD.s32 146, 138
155 = IADD.s32 147, 139
156 = IADD.s32 148, 140
157 = IMUL.i32 149, 133
158 = IMUL.i32 150, 134
159 = IMUL.i32 151, 135
160 = IMUL.i32 152, 136
161 = IMUL.i32 153, 137
162 = IMUL.i32 154, 138
163 = IMUL.i32 155, 139
164 = IMUL.i32 156, 140
165 = IADD.s32 157, 149
166 = IADD.s32 158, 150
167 = IADD.s32 159, 151
168 = IADD.s32 160, 152
169 = IADD.s32 161, 153
170 = IADD.s32 162, 154
171 = IADD.s32 163, 155
172 = IADD.s32 164, 156
173 = IMUL.i32 165, 149
174 = IMUL.i32 166, 150
175 = IMUL.i32 167, 151
176 = IMUL.i32 168, 152
177 = IMUL.i32 169, 153
178 = IMUL.i32 170, 154
179 = IMUL.i32 171, 155
180 = IMUL.i32 172, 156
181 = IADD.s32 173, 165
182 = IADD.s32 174, 166
183 = IADD.s32 175, 167
184 = IADD.s32 176, 168
185 = IADD.s32 177, 169
186 = IADD.s32 178, 170
187 = IADD.s32 179, 171
188 = IADD.s32 180, 172
189 = IMUL.i32 181, 165
190 = IMUL.i32 182, 166
191 = IMUL.i32 183, 167
192 = IMUL.i32 184, 168
193 = IMUL.i32 185, 169
194 = IMUL.i32 186, 170
195 = IMUL.i32 187, 171
196 = IMUL.i32 188, 172
197 = IADD.s32 189, 181
198 = IADD.s32 190, 182
199 = IADD.s32 191, 183
200 = IADD.s32 192, 184
201 = IADD.s32 193, 185
202 = IADD.s32 194, 186
203 = IADD.s32 195, 187
204 = IADD.s32 196, 188
205 = IMUL.i32 197, 181
206 = IMUL.i32 198, 182
207 = IMUL.i32 199, 183
208 = IMUL.i32 200, 184
209 = IMUL.i32 201, 185
210 = IMUL.i32 202, 186
211 = IMUL.i32 203, 187
212 = IMUL.i32 204, 188
213 = IADD.s32 205, 197
214 = IADD.s32 206, 198
215 = IADD.s32 207, 199
216 = IADD.s32 208, 200
217 = IADD.s32 209, 201
218 = IADD.s32 210, 202
219 = IADD.s32 211, 203
220 = IADD.s32 212, 204
221 = IMUL.i32 213, 197
222 = IMUL.i32 214, 198
223 = IMUL.i32 215, 199
224 = IMUL.i32 216, 200
225 = IMUL.i32 217, 201
226 = IMUL.i32 218, 202
227 = IMUL.i32 219, 203
228 = IMUL.i32 220, 204
229 = IADD.s32 221, 213
230 = IADD.s32 222, 214
231 = IADD.s32 223, 215
232 = IADD.s32 224, 216
233 = IADD.s32 225, 217
234 = IADD.s32 226, 218
235 = IADD.s32 227, 219
236 = IADD.s32 228, 220
237 = IMUL.i32 229, 213
238 = IMUL.i32 230, 214
239 = IMUL.i32 231, 215
240 = IMUL.i32 232, 216
241 = IMUL.i32 233, 217
242 = IMUL.i32 234, 218
243 = IMUL.i32 235, 219
244 = IMUL.i32 236, 220
245 = IADD.s32 237, 229
246 = IADD.s32 238, 230
247 = IADD.s32 239, 231
248 = IADD.s32 240, 232
249 = IADD.s32 241, 233
250 = IADD.s32 242, 234
251 = IADD.s32 243, 235
252 = IADD.s32 244, 236
253 = IMUL.i32 245, 229
254 = IMUL.i32 246, 230
255 = IMUL.i32 247, 231
256 = IMUL.i32 248, 232
257 = IMUL.i32 249, 233
258 = IMUL.i32 250, 234
259 = IMUL.i32 251, 235
260 = IMUL.i32 252, 236
261 = IADD.s32 253, 245
262 = IADD.s32 254, 246
263 = IADD.s32 255, 247
264 = IADD.s32 256, 248
265 = IADD.s32 257, 249
266 = IADD.s32 258, 250
267 = IADD.s32 259, 251
268 = IADD.s32 260, 252
269 = IMUL.i32 261, 245
270 = IMUL.i32 262, 246
271 = IMUL.i32 263, 247
272 = IMUL.i32 264, 248
273 = IMUL.i32 265, 249
274 = IMUL.i32 266, 250
275 = IMUL.i32 267, 251
276 = IMUL.i32 268, 252
277 = IADD.s32 269, 261
278 = IADD.s32 270, 262
279 = IADD.s32 271, 263
280 = IADD.s32 272, 264
281 = IADD.s32 273, 265
282 = IADD.s32 274, 266
283 = IADD.s32 275, 267
284 = IADD.s32 276, 268
285 = IMUL.i32 277, 261
286 = IMUL.i32 278, 262
287 = IMUL.i32 279, 263
288 = IMUL.i32 280, 264
289 = IMUL.i32 281, 265
290 = IMUL.i32 282, 266
291 = IMUL.i32 283, 267
292 = IMUL.i32 284, 268
293 = IADD.s32 285, 277
294 = IADD.s32 286, 278
295 = IADD.s32 287, 279
296 = IADD.s32 288, 280
297 = IADD.s32 289, 281
298 = IADD.s32 290, 282
299 = IADD.s32 291, 283
300 = IADD.s32 292, 284
301 = IADD.s32 43, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
302 = IADD.s32 31, 27
303 = IADD.s32 32, 28
304 = IADD.s32 33, 29
305 = IADD.s32 34, 30
306 = IADD.s32 302, 304
307 = IADD.s32 303, 305
308 = LSHIFT_OR.i32 15, #0x0, #0x2.b0
309 = IADD.s32 306, 307
313 = IADD.s32 u0, 308
315 = ICMP.u32.i1.lt 313, u0
316 = IADD.s32 315, u0[1]
STORE.i32 309, 313, 316, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x1
r1 = IADD.s32 u1, r1
r2 = MOV.i32 #0x2
r2 = IADD.s32 u1, r2
r3 = MOV.i32 #0x3
r3 = IADD.s32 u1, r3
r4 = MOV.i32 #0x4
r4 = IADD.s32 u1, r4
r5 = MOV.i32 #0x5
r5 = IADD.s32 u1, r5
r6 = MOV.i32 #0x6
r6 = IADD.s32 u1, r6
r7 = MOV.i32 #0x7
r7 = IADD.s32 u1, r7
r8 = U32_TO_F32 r0
r8 = FMA.f32 r8, #0x2edbe6ff, #0x0.neg
r8 = F32_TO_S32.rtz r8
r9 = MOV.i32 r8
r10 = MOV.i32 r8
r11 = MOV.i32 r8
r12 = MOV.i32 r8
r13 = MOV.i32 r8
r14 = MOV.i32 r8
r15 = MOV.i32 r8
r48 = MOV.i32 u1
r49 = MOV.i32 #0x0
} -> block1
block1 {
r50 = ICMP.s32.m1.ge r49, #0x10
BRANCHZ.i16.eq r50.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r48 = IMUL.i32 r12, r48
r1 = IMUL.i32 r13, r1
r2 = IMUL.i32 r14, r2
r3 = IMUL.i32 r15, r3
r4 = IMUL.i32 r8, r4
r5 = IMUL.i32 r9, r5
r6 = IMUL.i32 r10, r6
r7 = IMUL.i32 r11, r7
r48 = IADD.s32 r48, r12
r1 = IADD.s32 r1, r13
r2 = IADD.s32 r2, r14
r3 = IADD.s32 r3, r15
r4 = IADD.s32 r4, r8
r5 = IADD.s32 r5, r9
r6 = IADD.s32 r6, r10
r7 = IADD.s32 r7, r11
r12 = IMUL.i32 r48, r12
r13 = IMUL.i32 r1, r13
r14 = IMUL.i32 r2, r14
r15 = IMUL.i32 r3, r15
r8 = IMUL.i32 r4, r8
r9 = IMUL.i32 r5, r9
r10 = IMUL.i32 r6, r10
r11 = IMUL.i32 r7, r11
r12 = IADD.s32 r12, r48
r13 = IADD.s32 r13, r1
r14 = IADD.s32 r14, r2
r15 = IADD.s32 r15, r3
r8 = IADD.s32 r8, r4
r9 = IADD.s32 r9, r5
r10 = IADD.s32 r10, r6
r11 = IADD.s32 r11, r7
r49 = IADD.s32 r49, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r1 = IADD.s32 r12, r8
r2 = IADD.s32 r13, r9
r3 = IADD.s32 r14, r10
r4 = IADD.s32 r15, r11
r1 = IADD.s32 r1, r3
r2 = IADD.s32 r2, r4
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = IADD.s32 r1, r2
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* r1 = MOV.i32 fau.x
+ r2 = MOV.i32 fau.y
* r3 = MOV.i32 fau.x
+ r4 = MOV.i32 fau.y
* r5 = MOV.i32 fau.x
+ r6 = MOV.i32 fau.y
* r7 = MOV.i32 fau.x
+ _.h00 = U32_TO_F32 r0
* _.h00 = FMA.f32 t1, fau.y, t.neg
+ r8 = F32_TO_S32.rtz t
* NOP
+ r9 = MOV.i32 t1
200000001 400000003 600000005 800000007 2edbe6ff00000000
id(0) nbb r_uncond
* r10 = MOV.i32 r8
+ r11 = MOV.i32 r8
* r12 = MOV.i32 r8
+ r1 = IADD.s32 fau.x, r1
* r13 = MOV.i32 r8
+ r2 = IADD.s32 fau.x, r2
* r14 = MOV.i32 r8
+ r3 = IADD.s32 fau.x, r3
* r15 = MOV.i32 r8
+ r4 = IADD.s32 fau.x, r4
* r48 = MOV.i32 fau.x
+ r5 = IADD.s32 fau.x, r5
* r49 = MOV.i32 t
+ r6 = IADD.s32 fau.x, r6
* NOP
+ r7 = IADD.s32 fau.x, r7
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r49, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000010
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r48 = IMUL.i32 r12, r48
+ NOP
* _.h00 = IMUL.i32 r14, r2
+ NOP
* NOP
+ r2 = IADD.s32 t0, r14
id(0) nbb
* r14 = IMUL.i32 r2, r14
+ NOP
* _.h00 = IMUL.i32 r15, r3
+ r3 = IADD.s32 t, r15
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* r3 = IMUL.i32 t1, r3
+ NOP
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* r4 = IMUL.i32 t1, r4
+ NOP
* NOP
+ r14 = IADD.s32 r14, r2
id(0) nbb
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* r14 = IMUL.i32 t1, r14
+ NOP
* _.h00 = IMUL.i32 r13, r1
+ r1 = IADD.s32 t, r13
* _.h00 = IMUL.i32 t1, r13
+ r13 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r13
+ r13 = IADD.s32 t, t1
* r1 = IMUL.i32 t1, r1
+ NOP
* NOP
+ r3 = IADD.s32 r3, r15
id(0) nbb
* _.h00 = IMUL.i32 r3, r15
+ r15 = IADD.s32 t, r3
* r3 = IMUL.i32 t1, r3
+ NOP
* _.h00 = IMUL.i32 r10, r6
+ r6 = IADD.s32 t, r10
* _.h00 = IMUL.i32 t1, r10
+ r10 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r10
+ r10 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* r10 = IMUL.i32 t1, r10
+ NOP
id(0) nbb
* _.h00 = IMUL.i32 r11, r7
+ r7 = IADD.s32 t, r11
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r7
+ r7 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* r7 = IMUL.i32 t1, r7
+ NOP
* NOP
+ r48 = IADD.s32 r48, r12
id(0) nbb
* _.h00 = IMUL.i32 r48, r12
+ r12 = IADD.s32 t, r48
* _.h00 = IMUL.i32 t1, r48
+ r48 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r48
+ r48 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r48
+ r48 = IADD.s32 t, t1
* r12 = IMUL.i32 t1, r12
+ NOP
* NOP
+ r4 = IADD.s32 r4, r8
id(0) nbb
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r4
+ r4 = IADD.s32 t, t1
* r8 = IMUL.i32 t1, r8
+ NOP
* NOP
+ r1 = IADD.s32 r1, r13
id(0) nbb
* _.h00 = IMUL.i32 r1, r13
+ r13 = IADD.s32 t, r1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r13
+ r13 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r13
+ r13 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* r13 = IMUL.i32 t1, r13
+ NOP
* NOP
+ r3 = IADD.s32 r3, r15
id(0) nbb
* _.h00 = IMUL.i32 r3, r15
+ r15 = IADD.s32 t, r3
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* r15 = IMUL.i32 t1, r15
+ NOP
* NOP
+ r10 = IADD.s32 r10, r6
id(0) nbb
* _.h00 = IMUL.i32 r10, r6
+ r6 = IADD.s32 t, r10
* _.h00 = IMUL.i32 t1, r10
+ r10 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r10
+ r10 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r10
+ r10 = IADD.s32 t, t1
* r6 = IMUL.i32 t1, r6
+ NOP
* NOP
+ r12 = IADD.s32 r12, r48
id(0) nbb
* _.h00 = IMUL.i32 r12, r48
+ r48 = IADD.s32 t, r12
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r48
+ r48 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r48
+ r48 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* r48 = IMUL.i32 t1, r48
+ NOP
* NOP
+ r13 = IADD.s32 r13, r1
id(0) nbb
* _.h00 = IMUL.i32 r13, r1
+ r1 = IADD.s32 t, r13
* _.h00 = IMUL.i32 t1, r13
+ r13 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* r13 = IMUL.i32 t1, r13
+ NOP
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* _.h00 = IMUL.i32 t1, r9
+ r9 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* NOP
+ r14 = IADD.s32 r14, r2
id(0) nbb
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* NOP
+ r7 = IADD.s32 r7, r11
id(0) nbb
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r7, r11
+ r11 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* _.h00 = IMUL.i32 r11, r7
+ r7 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* NOP
+ r8 = IADD.s32 r8, r4
id(0) nbb
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r7, r11
+ r11 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* _.h00 = IMUL.i32 r11, r7
+ r7 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* NOP
+ r15 = IADD.s32 r15, r3
id(0) nbb
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r7, r11
+ r11 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r15, r3
+ r3 = IADD.s32 t, r15
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* NOP
+ r6 = IADD.s32 r6, r10
id(0) nbb
* _.h00 = IMUL.i32 r11, r7
+ r7 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r15
+ r15 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r6, r10
+ r10 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r11
+ r11 = IADD.s32 t, r7
* NOP
+ r48 = IADD.s32 r48, r12
id(0) nbb
* _.h00 = IMUL.i32 r14, r2
+ r2 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r15, r3
+ r3 = IADD.s32 t, r15
* _.h00 = IMUL.i32 r8, r4
+ r4 = IADD.s32 t, r8
* _.h00 = IMUL.i32 r9, r5
+ r5 = IADD.s32 t, r9
* _.h00 = IMUL.i32 r10, r6
+ r6 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r11, r7
+ r7 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r48, r12
+ r12 = IADD.s32 t, r48
* NOP
+ r13 = IADD.s32 r13, r1
id(0) nbb r_uncond no_prefetch pcrel(1)
* _.h00 = IMUL.i32 r2, r14
+ r14 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r15
+ r15 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r8
+ r8 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r5, r9
+ r9 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r6, r10
+ r10 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r11
+ r11 = IADD.s32 t, r7
* NOP
+ r49 = IADD.s32 r49, fau.x
* NOP
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) nbb
* NOP
+ r1 = IADD.s32 r12, r8
* NOP
+ r2 = IADD.s32 r13, r9
* NOP
+ r3 = IADD.s32 r14, r10
id(0) wait(0 ) nbb r_uncond
* NOP
+ r4 = IADD.s32 r15, r11
* NOP
+ r1 = IADD.s32 r1, r3
* NOP
+ _.h00 = IADD.s32 r2, r4
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = IADD.s32 r1, t1
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*MOV.i32 r1:t0, 0x00000001 /* 0.000000 */
+MOV.i32 r2:t1, 0x00000002 /* 0.000000 */
*MOV.i32 r3:t0, 0x00000003 /* 0.000000 */
+MOV.i32 r4:t1, 0x00000004 /* 0.000000 */
*MOV.i32 r5:t0, 0x00000005 /* 0.000000 */
+MOV.i32 r6:t1, 0x00000006 /* 0.000000 */
*MOV.i32 r7:t0, 0x00000007 /* 0.000000 */
+U32_TO_F32 t1, r0
*FMA.f32 t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+F32_TO_S32.rtz r8:t1, t
*NOP t0
+MOV.i32 r9:t1, t1
}
clause_8:
ds(0) nbb r_uncond ncph
{
*MOV.i32 r10:t0, r8
+MOV.i32 r11:t1, r8
*MOV.i32 r12:t0, r8
+IADD.s32 r1:t1, u1.w0, r1
*MOV.i32 r13:t0, r8
+IADD.s32 r2:t1, u1.w0, r2
*MOV.i32 r14:t0, r8
+IADD.s32 r3:t1, u1.w0, r3
*MOV.i32 r15:t0, r8
+IADD.s32 r4:t1, u1.w0, r4
*MOV.i32 r48:t0, u1.w0
+IADD.s32 r5:t1, u1.w0, r5
*MOV.i32 r49:t0, #0
+IADD.s32 r6:t1, u1.w0, r6
*NOP t0
+IADD.s32 r7:t1, u1.w0, r7
}
clause_14:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r49, 0x00000010 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_19
}
clause_17:
ds(0) nbb
{
*NOP t0
+JUMP t1, clause_131
}
clause_19:
ds(0) nbb ncph
{
*IMUL.i32 r48:t0, r12, r48
+NOP t1
*IMUL.i32 t0, r14, r2
+NOP t1
*NOP t0
+IADD.s32 r2:t1, t0, r14
}
clause_22:
ds(0) nbb ncph
{
*IMUL.i32 r14:t0, r2, r14
+NOP t1
*IMUL.i32 t0, r15, r3
+IADD.s32 r3:t1, t, r15
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 r3:t0, t1, r3
+NOP t1
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 r4:t0, t1, r4
+NOP t1
*NOP t0
+IADD.s32 r14:t1, r14, r2
}
clause_28:
ds(0) nbb ncph
{
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 r14:t0, t1, r14
+NOP t1
*IMUL.i32 t0, r13, r1
+IADD.s32 r1:t1, t, r13
*IMUL.i32 t0, t1, r13
+IADD.s32 r13:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r13
+IADD.s32 r13:t1, t, t1
*IMUL.i32 r1:t0, t1, r1
+NOP t1
*NOP t0
+IADD.s32 r3:t1, r3, r15
}
clause_34:
ds(0) nbb ncph
{
*IMUL.i32 t0, r3, r15
+IADD.s32 r15:t1, t, r3
*IMUL.i32 r3:t0, t1, r3
+NOP t1
*IMUL.i32 t0, r10, r6
+IADD.s32 r6:t1, t, r10
*IMUL.i32 t0, t1, r10
+IADD.s32 r10:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r10
+IADD.s32 r10:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 r10:t0, t1, r10
+NOP t1
}
clause_40:
ds(0) nbb ncph
{
*IMUL.i32 t0, r11, r7
+IADD.s32 r7:t1, t, r11
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 t0, t1, r7
+IADD.s32 r7:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 r7:t0, t1, r7
+NOP t1
*NOP t0
+IADD.s32 r48:t1, r48, r12
}
clause_46:
ds(0) nbb ncph
{
*IMUL.i32 t0, r48, r12
+IADD.s32 r12:t1, t, r48
*IMUL.i32 t0, t1, r48
+IADD.s32 r48:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r48
+IADD.s32 r48:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r48
+IADD.s32 r48:t1, t, t1
*IMUL.i32 r12:t0, t1, r12
+NOP t1
*NOP t0
+IADD.s32 r4:t1, r4, r8
}
clause_52:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r4
+IADD.s32 r4:t1, t, t1
*IMUL.i32 r8:t0, t1, r8
+NOP t1
*NOP t0
+IADD.s32 r1:t1, r1, r13
}
clause_58:
ds(0) nbb ncph
{
*IMUL.i32 t0, r1, r13
+IADD.s32 r13:t1, t, r1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r13
+IADD.s32 r13:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r13
+IADD.s32 r13:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 r13:t0, t1, r13
+NOP t1
*NOP t0
+IADD.s32 r3:t1, r3, r15
}
clause_64:
ds(0) nbb ncph
{
*IMUL.i32 t0, r3, r15
+IADD.s32 r15:t1, t, r3
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 r15:t0, t1, r15
+NOP t1
*NOP t0
+IADD.s32 r10:t1, r10, r6
}
clause_70:
ds(0) nbb ncph
{
*IMUL.i32 t0, r10, r6
+IADD.s32 r6:t1, t, r10
*IMUL.i32 t0, t1, r10
+IADD.s32 r10:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r10
+IADD.s32 r10:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r10
+IADD.s32 r10:t1, t, t1
*IMUL.i32 r6:t0, t1, r6
+NOP t1
*NOP t0
+IADD.s32 r12:t1, r12, r48
}
clause_76:
ds(0) nbb ncph
{
*IMUL.i32 t0, r12, r48
+IADD.s32 r48:t1, t, r12
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r48
+IADD.s32 r48:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r48
+IADD.s32 r48:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 r48:t0, t1, r48
+NOP t1
*NOP t0
+IADD.s32 r13:t1, r13, r1
}
clause_82:
ds(0) nbb ncph
{
*IMUL.i32 t0, r13, r1
+IADD.s32 r1:t1, t, r13
*IMUL.i32 t0, t1, r13
+IADD.s32 r13:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 r13:t0, t1, r13
+NOP t1
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*IMUL.i32 t0, t1, r9
+IADD.s32 r9:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*NOP t0
+IADD.s32 r14:t1, r14, r2
}
clause_88:
ds(0) nbb ncph
{
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*NOP t0
+IADD.s32 r7:t1, r7, r11
}
clause_94:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r7, r11
+IADD.s32 r11:t1, t, r7
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*IMUL.i32 t0, r11, r7
+IADD.s32 r7:t1, t, r11
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*NOP t0
+IADD.s32 r8:t1, r8, r4
}
clause_100:
ds(0) nbb ncph
{
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r7, r11
+IADD.s32 r11:t1, t, r7
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*IMUL.i32 t0, r11, r7
+IADD.s32 r7:t1, t, r11
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*NOP t0
+IADD.s32 r15:t1, r15, r3
}
clause_106:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r7, r11
+IADD.s32 r11:t1, t, r7
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r15, r3
+IADD.s32 r3:t1, t, r15
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*NOP t0
+IADD.s32 r6:t1, r6, r10
}
clause_112:
ds(0) nbb ncph
{
*IMUL.i32 t0, r11, r7
+IADD.s32 r7:t1, t, r11
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*IMUL.i32 t0, r3, r15
+IADD.s32 r15:t1, t, r3
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r6, r10
+IADD.s32 r10:t1, t, r6
*IMUL.i32 t0, r7, r11
+IADD.s32 r11:t1, t, r7
*NOP t0
+IADD.s32 r48:t1, r48, r12
}
clause_118:
ds(0) nbb ncph
{
*IMUL.i32 t0, r14, r2
+IADD.s32 r2:t1, t, r14
*IMUL.i32 t0, r15, r3
+IADD.s32 r3:t1, t, r15
*IMUL.i32 t0, r8, r4
+IADD.s32 r4:t1, t, r8
*IMUL.i32 t0, r9, r5
+IADD.s32 r5:t1, t, r9
*IMUL.i32 t0, r10, r6
+IADD.s32 r6:t1, t, r10
*IMUL.i32 t0, r11, r7
+IADD.s32 r7:t1, t, r11
*IMUL.i32 t0, r48, r12
+IADD.s32 r12:t1, t, r48
*NOP t0
+IADD.s32 r13:t1, r13, r1
}
clause_124:
ds(0) nbb r_uncond
{
*IMUL.i32 t0, r2, r14
+IADD.s32 r14:t1, t, r2
*IMUL.i32 t0, r3, r15
+IADD.s32 r15:t1, t, r3
*IMUL.i32 t0, r4, r8
+IADD.s32 r8:t1, t, r4
*IMUL.i32 t0, r5, r9
+IADD.s32 r9:t1, t, r5
*IMUL.i32 t0, r6, r10
+IADD.s32 r10:t1, t, r6
*IMUL.i32 t0, r7, r11
+IADD.s32 r11:t1, t, r7
*NOP t0
+IADD.s32 r49:t1, r49, 0x00000001 /* 0.000000 */
*NOP t0
+JUMP t1, clause_14
}
clause_131:
ds(0) nbb ncph next_store dwb(0)
{
*NOP t0
+IADD.s32 r1:t1, r12, r8
*NOP t0
+IADD.s32 r2:t1, r13, r9
*NOP t0
+IADD.s32 r3:t1, r14, r10
}
clause_134:
ds(0) eos store
{
*NOP t0
+IADD.s32 r4:t1, r15, r11
*NOP t0
+IADD.s32 r1:t1, r1, r3
*NOP t0
+IADD.s32 t1, r2, r4
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+IADD.s32 r1:t1, r1, t1
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
e20eea22 compute_sp_v8_int 21.294 GFLOPs 12.606ms
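(Sanity check on that figure, inferred from the dump above rather than printed by the tool: block4 of compute_sp_v8_int issues 128 IMUL + 128 IADD per iteration, i.e. 16 multiply-add steps across 8 dependent 32-bit chains, and block1 loops while r49 < 0x10, so each invocation executes 16 * 256 = 4096 integer ops. 21.294 Gops/s * 12.606 ms is about 268.4 M ops, which works out to roughly 65536 invocations of work per timed run, consistent with a 256 x 256 dispatch of this 256-wide workgroup. The actual global size is an assumption here; it is not echoed anywhere in the log.)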
compute shader ----------
#define KERNEL compute_sp_v16
#define LOCAL_SIZE_X 256
#define DATATYPE int
#define vec2 ivec2
#define vec4 ivec4
#line 64
// Avoiding auto-vectorize by using vector-width locked dependent code
layout(local_size_x = LOCAL_SIZE_X) in;
#undef MAD_4
#undef MAD_16
#undef MAD_64
#define mad(a,b,c) (a*b+c)
#define MAD_4(x, y) x = mad(y, x, y); y = mad(x, y, x); x = mad(y, x, y); y = mad(x, y, x);
#define MAD_16(x, y) MAD_4(x, y); MAD_4(x, y); MAD_4(x, y); MAD_4(x, y);
#define MAD_64(x, y) MAD_16(x, y); MAD_16(x, y); MAD_16(x, y); MAD_16(x, y);
struct vec8 {
vec4 d0, d1;
};
#define VEC8(x0,x1,x2,x3,x4,x5,x6,x7) vec8(vec4(x0,x1,x2,x3), vec4(x4,x5,x6,x7))
#define VEC8_S(x) vec8(vec4(x,x,x,x), vec4(x,x,x,x))
#define VEC8_ADD(a, b) (vec8(a.d0 + b.d0, a.d1 + b.d1))
#define VEC8_MUL(a, b) (vec8(a.d0 * b.d0, a.d1 * b.d1))
struct vec16 {
vec8 d0,d1;
};
#define VEC16(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15) vec16(VEC8(x0,x1,x2,x3,x4,x5,x6,x7), VEC8(x8,x9,x10,x11,x12,x13,x14,x15))
#define VEC16_S(x) vec16(VEC8_S(x), VEC8_S(x));
#define VEC16_ADD(a, b) (vec16(VEC8_ADD(a.d0, b.d0), VEC8_ADD(a.d1, b.d1)))
#define VEC16_MUL(a, b) (vec16(VEC8_MUL(a.d0, b.d0), VEC8_MUL(a.d1, b.d1)))
#define mad8(a,b,c) (VEC8_ADD(VEC8_MUL(a,b),c))
#define mad16(a,b,c) (VEC16_ADD(VEC16_MUL(a,b),c))
layout(location = 1) uniform DATATYPE _A;
#define SCALE 1e-10
layout(std430, binding = 0) restrict writeonly buffer outbuffer {
DATATYPE ptr[];
};
#line 184
void compute_sp_v16()
{
uint id = gl_GlobalInvocationID[0] + gl_GlobalInvocationID[1] * 256u + gl_GlobalInvocationID[2] * 256u * 256u;
vec16 x = VEC16(_A, (_A+DATATYPE(1)), (_A+DATATYPE(2)), (_A+DATATYPE(3)), (_A+DATATYPE(4)), (_A+DATATYPE(5)), (_A+DATATYPE(6)), (_A+DATATYPE(7)),
(_A+DATATYPE(8)), (_A+DATATYPE(9)), (_A+DATATYPE(10)), (_A+DATATYPE(11)), (_A+DATATYPE(12)), (_A+DATATYPE(13)), (_A+DATATYPE(14)), (_A+DATATYPE(15)));
vec16 y = VEC16_S(DATATYPE((float(id) * SCALE)));
#undef mad
#define mad mad16
for(int i=0; i<8; i++)
{
MAD_16(x, y);
}
vec8 u = VEC8_ADD(y.d0, y.d1);
vec4 s = u.d0 + u.d1;
vec2 t = s.xy + s.zw;
ptr[id] = t.x + t.y;
}
void main() {compute_sp_v16();}
----------
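A note on reading the IR below (a sketch based on the macros in the source above, not compiler output): mad is redefined to mad16, which is VEC16_ADD(VEC16_MUL(a,b),c), so after preprocessing every MAD_4(x, y) statement becomes four dependent multiply-add steps applied to all 16 lanes of the vec16 structs. Per 32-bit lane the pattern is:

    // one MAD_4(x, y) step, shown for a single lane of x and y
    x = y * x + y;   // becomes one imul + one iadd in the NIR
    y = x * y + x;   // depends on the x computed just above
    x = y * x + y;
    y = x * y + x;

MAD_16 repeats this four times, and the loop in compute_sp_v16 runs 8 iterations, which is what produces the long dependent imul/iadd chains (ssa_76 through ssa_587 in block_4 of the NIR dump below) replicated across 16 parallel lanes.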
shader: MESA_SHADER_COMPUTE
source_sha1: {0x330fb7af, 0x4ad9a35d, 0x58354a59, 0xe4dc307b, 0x9dc86d72}
name: GLSL20
workgroup-size: 256, 1, 1
shared-size: 0
inputs: 0
outputs: 0
uniforms: 1
ubos: 1
shared: 0
ray queries: 0
decl_var ssbo INTERP_MODE_NONE restrict writeonly highp int[] ptr (0, 0, 0)
decl_var uniform INTERP_MODE_NONE highp int _A (1, 0, 0)
decl_var ubo INTERP_MODE_NONE vec4[1] uniform_0 (0, 0, 0)
decl_function main (0 params)
impl main {
block block_0:
/* preds: */
vec3 32 ssa_17 = intrinsic load_global_invocation_id () ()
vec1 32 ssa_8 = load_const (0x00000008 = 0.000000)
vec1 32 ssa_18 = ishl ssa_17.y, ssa_8
vec1 32 ssa_19 = iadd ssa_17.x, ssa_18
vec1 32 ssa_1 = load_const (0x00000001 = 0.000000)
vec1 32 ssa_614 = insert_u16 ssa_17.z, ssa_1
vec1 32 ssa_22 = iadd ssa_19, ssa_614
vec1 32 ssa_0 = load_const (0x00000000 = 0.000000)
vec1 32 ssa_23 = intrinsic load_ubo (ssa_0, ssa_0) (access=0, align_mul=1073741824, align_offset=0, range_base=0, range=4)
vec1 32 ssa_24 = iadd ssa_23, ssa_1
vec1 32 ssa_2 = load_const (0x00000002 = 0.000000)
vec1 32 ssa_25 = iadd ssa_23, ssa_2
vec1 32 ssa_3 = load_const (0x00000003 = 0.000000)
vec1 32 ssa_26 = iadd ssa_23, ssa_3
vec1 32 ssa_4 = load_const (0x00000004 = 0.000000)
vec1 32 ssa_27 = iadd ssa_23, ssa_4
vec1 32 ssa_5 = load_const (0x00000005 = 0.000000)
vec1 32 ssa_28 = iadd ssa_23, ssa_5
vec1 32 ssa_6 = load_const (0x00000006 = 0.000000)
vec1 32 ssa_29 = iadd ssa_23, ssa_6
vec1 32 ssa_7 = load_const (0x00000007 = 0.000000)
vec1 32 ssa_30 = iadd ssa_23, ssa_7
vec1 32 ssa_31 = iadd ssa_23, ssa_8
vec1 32 ssa_9 = load_const (0x00000009 = 0.000000)
vec1 32 ssa_32 = iadd ssa_23, ssa_9
vec1 32 ssa_10 = load_const (0x0000000a = 0.000000)
vec1 32 ssa_33 = iadd ssa_23, ssa_10
vec1 32 ssa_11 = load_const (0x0000000b = 0.000000)
vec1 32 ssa_34 = iadd ssa_23, ssa_11
vec1 32 ssa_12 = load_const (0x0000000c = 0.000000)
vec1 32 ssa_35 = iadd ssa_23, ssa_12
vec1 32 ssa_13 = load_const (0x0000000d = 0.000000)
vec1 32 ssa_36 = iadd ssa_23, ssa_13
vec1 32 ssa_14 = load_const (0x0000000e = 0.000000)
vec1 32 ssa_37 = iadd ssa_23, ssa_14
vec1 32 ssa_15 = load_const (0x0000000f = 0.000000)
vec1 32 ssa_38 = iadd ssa_23, ssa_15
vec1 32 ssa_39 = u2f32 ssa_22
vec1 32 ssa_16 = load_const (0x2edbe6ff = 0.000000)
vec1 32 ssa_40 = fmul ssa_39, ssa_16
vec1 32 ssa_41 = f2i32 ssa_40
/* succs: block_1 */
loop {
block block_1:
/* preds: block_0 block_4 */
vec1 32 ssa_42 = phi block_0: ssa_41, block_4: ssa_584
vec1 32 ssa_43 = phi block_0: ssa_41, block_4: ssa_585
vec1 32 ssa_44 = phi block_0: ssa_41, block_4: ssa_586
vec1 32 ssa_45 = phi block_0: ssa_41, block_4: ssa_587
vec1 32 ssa_46 = phi block_0: ssa_41, block_4: ssa_580
vec1 32 ssa_47 = phi block_0: ssa_41, block_4: ssa_581
vec1 32 ssa_48 = phi block_0: ssa_41, block_4: ssa_582
vec1 32 ssa_49 = phi block_0: ssa_41, block_4: ssa_583
vec1 32 ssa_50 = phi block_0: ssa_41, block_4: ssa_568
vec1 32 ssa_51 = phi block_0: ssa_41, block_4: ssa_569
vec1 32 ssa_52 = phi block_0: ssa_41, block_4: ssa_570
vec1 32 ssa_53 = phi block_0: ssa_41, block_4: ssa_571
vec1 32 ssa_54 = phi block_0: ssa_41, block_4: ssa_564
vec1 32 ssa_55 = phi block_0: ssa_41, block_4: ssa_565
vec1 32 ssa_56 = phi block_0: ssa_41, block_4: ssa_566
vec1 32 ssa_57 = phi block_0: ssa_41, block_4: ssa_567
vec1 32 ssa_58 = phi block_0: ssa_35, block_4: ssa_552
vec1 32 ssa_59 = phi block_0: ssa_36, block_4: ssa_553
vec1 32 ssa_60 = phi block_0: ssa_37, block_4: ssa_554
vec1 32 ssa_61 = phi block_0: ssa_38, block_4: ssa_555
vec1 32 ssa_62 = phi block_0: ssa_31, block_4: ssa_548
vec1 32 ssa_63 = phi block_0: ssa_32, block_4: ssa_549
vec1 32 ssa_64 = phi block_0: ssa_33, block_4: ssa_550
vec1 32 ssa_65 = phi block_0: ssa_34, block_4: ssa_551
vec1 32 ssa_66 = phi block_0: ssa_27, block_4: ssa_536
vec1 32 ssa_67 = phi block_0: ssa_28, block_4: ssa_537
vec1 32 ssa_68 = phi block_0: ssa_29, block_4: ssa_538
vec1 32 ssa_69 = phi block_0: ssa_30, block_4: ssa_539
vec1 32 ssa_70 = phi block_0: ssa_23, block_4: ssa_532
vec1 32 ssa_71 = phi block_0: ssa_24, block_4: ssa_533
vec1 32 ssa_72 = phi block_0: ssa_25, block_4: ssa_534
vec1 32 ssa_73 = phi block_0: ssa_26, block_4: ssa_535
vec1 32 ssa_74 = phi block_0: ssa_0, block_4: ssa_588
vec1 32 ssa_75 = ige32 ssa_74, ssa_8
/* succs: block_2 block_3 */
if ssa_75 {
block block_2:
/* preds: block_1 */
break
/* succs: block_5 */
} else {
block block_3:
/* preds: block_1 */
/* succs: block_4 */
}
block block_4:
/* preds: block_3 */
vec1 32 ssa_76 = imul ssa_54, ssa_70
vec1 32 ssa_77 = imul ssa_55, ssa_71
vec1 32 ssa_78 = imul ssa_56, ssa_72
vec1 32 ssa_79 = imul ssa_57, ssa_73
vec1 32 ssa_80 = imul ssa_50, ssa_66
vec1 32 ssa_81 = imul ssa_51, ssa_67
vec1 32 ssa_82 = imul ssa_52, ssa_68
vec1 32 ssa_83 = imul ssa_53, ssa_69
vec1 32 ssa_84 = iadd ssa_76, ssa_54
vec1 32 ssa_85 = iadd ssa_77, ssa_55
vec1 32 ssa_86 = iadd ssa_78, ssa_56
vec1 32 ssa_87 = iadd ssa_79, ssa_57
vec1 32 ssa_88 = iadd ssa_80, ssa_50
vec1 32 ssa_89 = iadd ssa_81, ssa_51
vec1 32 ssa_90 = iadd ssa_82, ssa_52
vec1 32 ssa_91 = iadd ssa_83, ssa_53
vec1 32 ssa_92 = imul ssa_46, ssa_62
vec1 32 ssa_93 = imul ssa_47, ssa_63
vec1 32 ssa_94 = imul ssa_48, ssa_64
vec1 32 ssa_95 = imul ssa_49, ssa_65
vec1 32 ssa_96 = imul ssa_42, ssa_58
vec1 32 ssa_97 = imul ssa_43, ssa_59
vec1 32 ssa_98 = imul ssa_44, ssa_60
vec1 32 ssa_99 = imul ssa_45, ssa_61
vec1 32 ssa_100 = iadd ssa_92, ssa_46
vec1 32 ssa_101 = iadd ssa_93, ssa_47
vec1 32 ssa_102 = iadd ssa_94, ssa_48
vec1 32 ssa_103 = iadd ssa_95, ssa_49
vec1 32 ssa_104 = iadd ssa_96, ssa_42
vec1 32 ssa_105 = iadd ssa_97, ssa_43
vec1 32 ssa_106 = iadd ssa_98, ssa_44
vec1 32 ssa_107 = iadd ssa_99, ssa_45
vec1 32 ssa_108 = imul ssa_84, ssa_54
vec1 32 ssa_109 = imul ssa_85, ssa_55
vec1 32 ssa_110 = imul ssa_86, ssa_56
vec1 32 ssa_111 = imul ssa_87, ssa_57
vec1 32 ssa_112 = imul ssa_88, ssa_50
vec1 32 ssa_113 = imul ssa_89, ssa_51
vec1 32 ssa_114 = imul ssa_90, ssa_52
vec1 32 ssa_115 = imul ssa_91, ssa_53
vec1 32 ssa_116 = iadd ssa_108, ssa_84
vec1 32 ssa_117 = iadd ssa_109, ssa_85
vec1 32 ssa_118 = iadd ssa_110, ssa_86
vec1 32 ssa_119 = iadd ssa_111, ssa_87
vec1 32 ssa_120 = iadd ssa_112, ssa_88
vec1 32 ssa_121 = iadd ssa_113, ssa_89
vec1 32 ssa_122 = iadd ssa_114, ssa_90
vec1 32 ssa_123 = iadd ssa_115, ssa_91
vec1 32 ssa_124 = imul ssa_100, ssa_46
vec1 32 ssa_125 = imul ssa_101, ssa_47
vec1 32 ssa_126 = imul ssa_102, ssa_48
vec1 32 ssa_127 = imul ssa_103, ssa_49
vec1 32 ssa_128 = imul ssa_104, ssa_42
vec1 32 ssa_129 = imul ssa_105, ssa_43
vec1 32 ssa_130 = imul ssa_106, ssa_44
vec1 32 ssa_131 = imul ssa_107, ssa_45
vec1 32 ssa_132 = iadd ssa_124, ssa_100
vec1 32 ssa_133 = iadd ssa_125, ssa_101
vec1 32 ssa_134 = iadd ssa_126, ssa_102
vec1 32 ssa_135 = iadd ssa_127, ssa_103
vec1 32 ssa_136 = iadd ssa_128, ssa_104
vec1 32 ssa_137 = iadd ssa_129, ssa_105
vec1 32 ssa_138 = iadd ssa_130, ssa_106
vec1 32 ssa_139 = iadd ssa_131, ssa_107
vec1 32 ssa_140 = imul ssa_116, ssa_84
vec1 32 ssa_141 = imul ssa_117, ssa_85
vec1 32 ssa_142 = imul ssa_118, ssa_86
vec1 32 ssa_143 = imul ssa_119, ssa_87
vec1 32 ssa_144 = imul ssa_120, ssa_88
vec1 32 ssa_145 = imul ssa_121, ssa_89
vec1 32 ssa_146 = imul ssa_122, ssa_90
vec1 32 ssa_147 = imul ssa_123, ssa_91
vec1 32 ssa_148 = iadd ssa_140, ssa_116
vec1 32 ssa_149 = iadd ssa_141, ssa_117
vec1 32 ssa_150 = iadd ssa_142, ssa_118
vec1 32 ssa_151 = iadd ssa_143, ssa_119
vec1 32 ssa_152 = iadd ssa_144, ssa_120
vec1 32 ssa_153 = iadd ssa_145, ssa_121
vec1 32 ssa_154 = iadd ssa_146, ssa_122
vec1 32 ssa_155 = iadd ssa_147, ssa_123
vec1 32 ssa_156 = imul ssa_132, ssa_100
vec1 32 ssa_157 = imul ssa_133, ssa_101
vec1 32 ssa_158 = imul ssa_134, ssa_102
vec1 32 ssa_159 = imul ssa_135, ssa_103
vec1 32 ssa_160 = imul ssa_136, ssa_104
vec1 32 ssa_161 = imul ssa_137, ssa_105
vec1 32 ssa_162 = imul ssa_138, ssa_106
vec1 32 ssa_163 = imul ssa_139, ssa_107
vec1 32 ssa_164 = iadd ssa_156, ssa_132
vec1 32 ssa_165 = iadd ssa_157, ssa_133
vec1 32 ssa_166 = iadd ssa_158, ssa_134
vec1 32 ssa_167 = iadd ssa_159, ssa_135
vec1 32 ssa_168 = iadd ssa_160, ssa_136
vec1 32 ssa_169 = iadd ssa_161, ssa_137
vec1 32 ssa_170 = iadd ssa_162, ssa_138
vec1 32 ssa_171 = iadd ssa_163, ssa_139
vec1 32 ssa_172 = imul ssa_148, ssa_116
vec1 32 ssa_173 = imul ssa_149, ssa_117
vec1 32 ssa_174 = imul ssa_150, ssa_118
vec1 32 ssa_175 = imul ssa_151, ssa_119
vec1 32 ssa_176 = imul ssa_152, ssa_120
vec1 32 ssa_177 = imul ssa_153, ssa_121
vec1 32 ssa_178 = imul ssa_154, ssa_122
vec1 32 ssa_179 = imul ssa_155, ssa_123
vec1 32 ssa_180 = iadd ssa_172, ssa_148
vec1 32 ssa_181 = iadd ssa_173, ssa_149
vec1 32 ssa_182 = iadd ssa_174, ssa_150
vec1 32 ssa_183 = iadd ssa_175, ssa_151
vec1 32 ssa_184 = iadd ssa_176, ssa_152
vec1 32 ssa_185 = iadd ssa_177, ssa_153
vec1 32 ssa_186 = iadd ssa_178, ssa_154
vec1 32 ssa_187 = iadd ssa_179, ssa_155
vec1 32 ssa_188 = imul ssa_164, ssa_132
vec1 32 ssa_189 = imul ssa_165, ssa_133
vec1 32 ssa_190 = imul ssa_166, ssa_134
vec1 32 ssa_191 = imul ssa_167, ssa_135
vec1 32 ssa_192 = imul ssa_168, ssa_136
vec1 32 ssa_193 = imul ssa_169, ssa_137
vec1 32 ssa_194 = imul ssa_170, ssa_138
vec1 32 ssa_195 = imul ssa_171, ssa_139
vec1 32 ssa_196 = iadd ssa_188, ssa_164
vec1 32 ssa_197 = iadd ssa_189, ssa_165
vec1 32 ssa_198 = iadd ssa_190, ssa_166
vec1 32 ssa_199 = iadd ssa_191, ssa_167
vec1 32 ssa_200 = iadd ssa_192, ssa_168
vec1 32 ssa_201 = iadd ssa_193, ssa_169
vec1 32 ssa_202 = iadd ssa_194, ssa_170
vec1 32 ssa_203 = iadd ssa_195, ssa_171
vec1 32 ssa_204 = imul ssa_180, ssa_148
vec1 32 ssa_205 = imul ssa_181, ssa_149
vec1 32 ssa_206 = imul ssa_182, ssa_150
vec1 32 ssa_207 = imul ssa_183, ssa_151
vec1 32 ssa_208 = imul ssa_184, ssa_152
vec1 32 ssa_209 = imul ssa_185, ssa_153
vec1 32 ssa_210 = imul ssa_186, ssa_154
vec1 32 ssa_211 = imul ssa_187, ssa_155
vec1 32 ssa_212 = iadd ssa_204, ssa_180
vec1 32 ssa_213 = iadd ssa_205, ssa_181
vec1 32 ssa_214 = iadd ssa_206, ssa_182
vec1 32 ssa_215 = iadd ssa_207, ssa_183
vec1 32 ssa_216 = iadd ssa_208, ssa_184
vec1 32 ssa_217 = iadd ssa_209, ssa_185
vec1 32 ssa_218 = iadd ssa_210, ssa_186
vec1 32 ssa_219 = iadd ssa_211, ssa_187
vec1 32 ssa_220 = imul ssa_196, ssa_164
vec1 32 ssa_221 = imul ssa_197, ssa_165
vec1 32 ssa_222 = imul ssa_198, ssa_166
vec1 32 ssa_223 = imul ssa_199, ssa_167
vec1 32 ssa_224 = imul ssa_200, ssa_168
vec1 32 ssa_225 = imul ssa_201, ssa_169
vec1 32 ssa_226 = imul ssa_202, ssa_170
vec1 32 ssa_227 = imul ssa_203, ssa_171
vec1 32 ssa_228 = iadd ssa_220, ssa_196
vec1 32 ssa_229 = iadd ssa_221, ssa_197
vec1 32 ssa_230 = iadd ssa_222, ssa_198
vec1 32 ssa_231 = iadd ssa_223, ssa_199
vec1 32 ssa_232 = iadd ssa_224, ssa_200
vec1 32 ssa_233 = iadd ssa_225, ssa_201
vec1 32 ssa_234 = iadd ssa_226, ssa_202
vec1 32 ssa_235 = iadd ssa_227, ssa_203
vec1 32 ssa_236 = imul ssa_212, ssa_180
vec1 32 ssa_237 = imul ssa_213, ssa_181
vec1 32 ssa_238 = imul ssa_214, ssa_182
vec1 32 ssa_239 = imul ssa_215, ssa_183
vec1 32 ssa_240 = imul ssa_216, ssa_184
vec1 32 ssa_241 = imul ssa_217, ssa_185
vec1 32 ssa_242 = imul ssa_218, ssa_186
vec1 32 ssa_243 = imul ssa_219, ssa_187
vec1 32 ssa_244 = iadd ssa_236, ssa_212
vec1 32 ssa_245 = iadd ssa_237, ssa_213
vec1 32 ssa_246 = iadd ssa_238, ssa_214
vec1 32 ssa_247 = iadd ssa_239, ssa_215
vec1 32 ssa_248 = iadd ssa_240, ssa_216
vec1 32 ssa_249 = iadd ssa_241, ssa_217
vec1 32 ssa_250 = iadd ssa_242, ssa_218
vec1 32 ssa_251 = iadd ssa_243, ssa_219
vec1 32 ssa_252 = imul ssa_228, ssa_196
vec1 32 ssa_253 = imul ssa_229, ssa_197
vec1 32 ssa_254 = imul ssa_230, ssa_198
vec1 32 ssa_255 = imul ssa_231, ssa_199
vec1 32 ssa_256 = imul ssa_232, ssa_200
vec1 32 ssa_257 = imul ssa_233, ssa_201
vec1 32 ssa_258 = imul ssa_234, ssa_202
vec1 32 ssa_259 = imul ssa_235, ssa_203
vec1 32 ssa_260 = iadd ssa_252, ssa_228
vec1 32 ssa_261 = iadd ssa_253, ssa_229
vec1 32 ssa_262 = iadd ssa_254, ssa_230
vec1 32 ssa_263 = iadd ssa_255, ssa_231
vec1 32 ssa_264 = iadd ssa_256, ssa_232
vec1 32 ssa_265 = iadd ssa_257, ssa_233
vec1 32 ssa_266 = iadd ssa_258, ssa_234
vec1 32 ssa_267 = iadd ssa_259, ssa_235
vec1 32 ssa_268 = imul ssa_244, ssa_212
vec1 32 ssa_269 = imul ssa_245, ssa_213
vec1 32 ssa_270 = imul ssa_246, ssa_214
vec1 32 ssa_271 = imul ssa_247, ssa_215
vec1 32 ssa_272 = imul ssa_248, ssa_216
vec1 32 ssa_273 = imul ssa_249, ssa_217
vec1 32 ssa_274 = imul ssa_250, ssa_218
vec1 32 ssa_275 = imul ssa_251, ssa_219
vec1 32 ssa_276 = iadd ssa_268, ssa_244
vec1 32 ssa_277 = iadd ssa_269, ssa_245
vec1 32 ssa_278 = iadd ssa_270, ssa_246
vec1 32 ssa_279 = iadd ssa_271, ssa_247
vec1 32 ssa_280 = iadd ssa_272, ssa_248
vec1 32 ssa_281 = iadd ssa_273, ssa_249
vec1 32 ssa_282 = iadd ssa_274, ssa_250
vec1 32 ssa_283 = iadd ssa_275, ssa_251
vec1 32 ssa_284 = imul ssa_260, ssa_228
vec1 32 ssa_285 = imul ssa_261, ssa_229
vec1 32 ssa_286 = imul ssa_262, ssa_230
vec1 32 ssa_287 = imul ssa_263, ssa_231
vec1 32 ssa_288 = imul ssa_264, ssa_232
vec1 32 ssa_289 = imul ssa_265, ssa_233
vec1 32 ssa_290 = imul ssa_266, ssa_234
vec1 32 ssa_291 = imul ssa_267, ssa_235
vec1 32 ssa_292 = iadd ssa_284, ssa_260
vec1 32 ssa_293 = iadd ssa_285, ssa_261
vec1 32 ssa_294 = iadd ssa_286, ssa_262
vec1 32 ssa_295 = iadd ssa_287, ssa_263
vec1 32 ssa_296 = iadd ssa_288, ssa_264
vec1 32 ssa_297 = iadd ssa_289, ssa_265
vec1 32 ssa_298 = iadd ssa_290, ssa_266
vec1 32 ssa_299 = iadd ssa_291, ssa_267
vec1 32 ssa_300 = imul ssa_276, ssa_244
vec1 32 ssa_301 = imul ssa_277, ssa_245
vec1 32 ssa_302 = imul ssa_278, ssa_246
vec1 32 ssa_303 = imul ssa_279, ssa_247
vec1 32 ssa_304 = imul ssa_280, ssa_248
vec1 32 ssa_305 = imul ssa_281, ssa_249
vec1 32 ssa_306 = imul ssa_282, ssa_250
vec1 32 ssa_307 = imul ssa_283, ssa_251
vec1 32 ssa_308 = iadd ssa_300, ssa_276
vec1 32 ssa_309 = iadd ssa_301, ssa_277
vec1 32 ssa_310 = iadd ssa_302, ssa_278
vec1 32 ssa_311 = iadd ssa_303, ssa_279
vec1 32 ssa_312 = iadd ssa_304, ssa_280
vec1 32 ssa_313 = iadd ssa_305, ssa_281
vec1 32 ssa_314 = iadd ssa_306, ssa_282
vec1 32 ssa_315 = iadd ssa_307, ssa_283
vec1 32 ssa_316 = imul ssa_292, ssa_260
vec1 32 ssa_317 = imul ssa_293, ssa_261
vec1 32 ssa_318 = imul ssa_294, ssa_262
vec1 32 ssa_319 = imul ssa_295, ssa_263
vec1 32 ssa_320 = imul ssa_296, ssa_264
vec1 32 ssa_321 = imul ssa_297, ssa_265
vec1 32 ssa_322 = imul ssa_298, ssa_266
vec1 32 ssa_323 = imul ssa_299, ssa_267
vec1 32 ssa_324 = iadd ssa_316, ssa_292
vec1 32 ssa_325 = iadd ssa_317, ssa_293
vec1 32 ssa_326 = iadd ssa_318, ssa_294
vec1 32 ssa_327 = iadd ssa_319, ssa_295
vec1 32 ssa_328 = iadd ssa_320, ssa_296
vec1 32 ssa_329 = iadd ssa_321, ssa_297
vec1 32 ssa_330 = iadd ssa_322, ssa_298
vec1 32 ssa_331 = iadd ssa_323, ssa_299
vec1 32 ssa_332 = imul ssa_308, ssa_276
vec1 32 ssa_333 = imul ssa_309, ssa_277
vec1 32 ssa_334 = imul ssa_310, ssa_278
vec1 32 ssa_335 = imul ssa_311, ssa_279
vec1 32 ssa_336 = imul ssa_312, ssa_280
vec1 32 ssa_337 = imul ssa_313, ssa_281
vec1 32 ssa_338 = imul ssa_314, ssa_282
vec1 32 ssa_339 = imul ssa_315, ssa_283
vec1 32 ssa_340 = iadd ssa_332, ssa_308
vec1 32 ssa_341 = iadd ssa_333, ssa_309
vec1 32 ssa_342 = iadd ssa_334, ssa_310
vec1 32 ssa_343 = iadd ssa_335, ssa_311
vec1 32 ssa_344 = iadd ssa_336, ssa_312
vec1 32 ssa_345 = iadd ssa_337, ssa_313
vec1 32 ssa_346 = iadd ssa_338, ssa_314
vec1 32 ssa_347 = iadd ssa_339, ssa_315
vec1 32 ssa_348 = imul ssa_324, ssa_292
vec1 32 ssa_349 = imul ssa_325, ssa_293
vec1 32 ssa_350 = imul ssa_326, ssa_294
vec1 32 ssa_351 = imul ssa_327, ssa_295
vec1 32 ssa_352 = imul ssa_328, ssa_296
vec1 32 ssa_353 = imul ssa_329, ssa_297
vec1 32 ssa_354 = imul ssa_330, ssa_298
vec1 32 ssa_355 = imul ssa_331, ssa_299
vec1 32 ssa_356 = iadd ssa_348, ssa_324
vec1 32 ssa_357 = iadd ssa_349, ssa_325
vec1 32 ssa_358 = iadd ssa_350, ssa_326
vec1 32 ssa_359 = iadd ssa_351, ssa_327
vec1 32 ssa_360 = iadd ssa_352, ssa_328
vec1 32 ssa_361 = iadd ssa_353, ssa_329
vec1 32 ssa_362 = iadd ssa_354, ssa_330
vec1 32 ssa_363 = iadd ssa_355, ssa_331
vec1 32 ssa_364 = imul ssa_340, ssa_308
vec1 32 ssa_365 = imul ssa_341, ssa_309
vec1 32 ssa_366 = imul ssa_342, ssa_310
vec1 32 ssa_367 = imul ssa_343, ssa_311
vec1 32 ssa_368 = imul ssa_344, ssa_312
vec1 32 ssa_369 = imul ssa_345, ssa_313
vec1 32 ssa_370 = imul ssa_346, ssa_314
vec1 32 ssa_371 = imul ssa_347, ssa_315
vec1 32 ssa_372 = iadd ssa_364, ssa_340
vec1 32 ssa_373 = iadd ssa_365, ssa_341
vec1 32 ssa_374 = iadd ssa_366, ssa_342
vec1 32 ssa_375 = iadd ssa_367, ssa_343
vec1 32 ssa_376 = iadd ssa_368, ssa_344
vec1 32 ssa_377 = iadd ssa_369, ssa_345
vec1 32 ssa_378 = iadd ssa_370, ssa_346
vec1 32 ssa_379 = iadd ssa_371, ssa_347
vec1 32 ssa_380 = imul ssa_356, ssa_324
vec1 32 ssa_381 = imul ssa_357, ssa_325
vec1 32 ssa_382 = imul ssa_358, ssa_326
vec1 32 ssa_383 = imul ssa_359, ssa_327
vec1 32 ssa_384 = imul ssa_360, ssa_328
vec1 32 ssa_385 = imul ssa_361, ssa_329
vec1 32 ssa_386 = imul ssa_362, ssa_330
vec1 32 ssa_387 = imul ssa_363, ssa_331
vec1 32 ssa_388 = iadd ssa_380, ssa_356
vec1 32 ssa_389 = iadd ssa_381, ssa_357
vec1 32 ssa_390 = iadd ssa_382, ssa_358
vec1 32 ssa_391 = iadd ssa_383, ssa_359
vec1 32 ssa_392 = iadd ssa_384, ssa_360
vec1 32 ssa_393 = iadd ssa_385, ssa_361
vec1 32 ssa_394 = iadd ssa_386, ssa_362
vec1 32 ssa_395 = iadd ssa_387, ssa_363
vec1 32 ssa_396 = imul ssa_372, ssa_340
vec1 32 ssa_397 = imul ssa_373, ssa_341
vec1 32 ssa_398 = imul ssa_374, ssa_342
vec1 32 ssa_399 = imul ssa_375, ssa_343
vec1 32 ssa_400 = imul ssa_376, ssa_344
vec1 32 ssa_401 = imul ssa_377, ssa_345
vec1 32 ssa_402 = imul ssa_378, ssa_346
vec1 32 ssa_403 = imul ssa_379, ssa_347
vec1 32 ssa_404 = iadd ssa_396, ssa_372
vec1 32 ssa_405 = iadd ssa_397, ssa_373
vec1 32 ssa_406 = iadd ssa_398, ssa_374
vec1 32 ssa_407 = iadd ssa_399, ssa_375
vec1 32 ssa_408 = iadd ssa_400, ssa_376
vec1 32 ssa_409 = iadd ssa_401, ssa_377
vec1 32 ssa_410 = iadd ssa_402, ssa_378
vec1 32 ssa_411 = iadd ssa_403, ssa_379
vec1 32 ssa_412 = imul ssa_388, ssa_356
vec1 32 ssa_413 = imul ssa_389, ssa_357
vec1 32 ssa_414 = imul ssa_390, ssa_358
vec1 32 ssa_415 = imul ssa_391, ssa_359
vec1 32 ssa_416 = imul ssa_392, ssa_360
vec1 32 ssa_417 = imul ssa_393, ssa_361
vec1 32 ssa_418 = imul ssa_394, ssa_362
vec1 32 ssa_419 = imul ssa_395, ssa_363
vec1 32 ssa_420 = iadd ssa_412, ssa_388
vec1 32 ssa_421 = iadd ssa_413, ssa_389
vec1 32 ssa_422 = iadd ssa_414, ssa_390
vec1 32 ssa_423 = iadd ssa_415, ssa_391
vec1 32 ssa_424 = iadd ssa_416, ssa_392
vec1 32 ssa_425 = iadd ssa_417, ssa_393
vec1 32 ssa_426 = iadd ssa_418, ssa_394
vec1 32 ssa_427 = iadd ssa_419, ssa_395
vec1 32 ssa_428 = imul ssa_404, ssa_372
vec1 32 ssa_429 = imul ssa_405, ssa_373
vec1 32 ssa_430 = imul ssa_406, ssa_374
vec1 32 ssa_431 = imul ssa_407, ssa_375
vec1 32 ssa_432 = imul ssa_408, ssa_376
vec1 32 ssa_433 = imul ssa_409, ssa_377
vec1 32 ssa_434 = imul ssa_410, ssa_378
vec1 32 ssa_435 = imul ssa_411, ssa_379
vec1 32 ssa_436 = iadd ssa_428, ssa_404
vec1 32 ssa_437 = iadd ssa_429, ssa_405
vec1 32 ssa_438 = iadd ssa_430, ssa_406
vec1 32 ssa_439 = iadd ssa_431, ssa_407
vec1 32 ssa_440 = iadd ssa_432, ssa_408
vec1 32 ssa_441 = iadd ssa_433, ssa_409
vec1 32 ssa_442 = iadd ssa_434, ssa_410
vec1 32 ssa_443 = iadd ssa_435, ssa_411
vec1 32 ssa_444 = imul ssa_420, ssa_388
vec1 32 ssa_445 = imul ssa_421, ssa_389
vec1 32 ssa_446 = imul ssa_422, ssa_390
vec1 32 ssa_447 = imul ssa_423, ssa_391
vec1 32 ssa_448 = imul ssa_424, ssa_392
vec1 32 ssa_449 = imul ssa_425, ssa_393
vec1 32 ssa_450 = imul ssa_426, ssa_394
vec1 32 ssa_451 = imul ssa_427, ssa_395
vec1 32 ssa_452 = iadd ssa_444, ssa_420
vec1 32 ssa_453 = iadd ssa_445, ssa_421
vec1 32 ssa_454 = iadd ssa_446, ssa_422
vec1 32 ssa_455 = iadd ssa_447, ssa_423
vec1 32 ssa_456 = iadd ssa_448, ssa_424
vec1 32 ssa_457 = iadd ssa_449, ssa_425
vec1 32 ssa_458 = iadd ssa_450, ssa_426
vec1 32 ssa_459 = iadd ssa_451, ssa_427
vec1 32 ssa_460 = imul ssa_436, ssa_404
vec1 32 ssa_461 = imul ssa_437, ssa_405
vec1 32 ssa_462 = imul ssa_438, ssa_406
vec1 32 ssa_463 = imul ssa_439, ssa_407
vec1 32 ssa_464 = imul ssa_440, ssa_408
vec1 32 ssa_465 = imul ssa_441, ssa_409
vec1 32 ssa_466 = imul ssa_442, ssa_410
vec1 32 ssa_467 = imul ssa_443, ssa_411
vec1 32 ssa_468 = iadd ssa_460, ssa_436
vec1 32 ssa_469 = iadd ssa_461, ssa_437
vec1 32 ssa_470 = iadd ssa_462, ssa_438
vec1 32 ssa_471 = iadd ssa_463, ssa_439
vec1 32 ssa_472 = iadd ssa_464, ssa_440
vec1 32 ssa_473 = iadd ssa_465, ssa_441
vec1 32 ssa_474 = iadd ssa_466, ssa_442
vec1 32 ssa_475 = iadd ssa_467, ssa_443
vec1 32 ssa_476 = imul ssa_452, ssa_420
vec1 32 ssa_477 = imul ssa_453, ssa_421
vec1 32 ssa_478 = imul ssa_454, ssa_422
vec1 32 ssa_479 = imul ssa_455, ssa_423
vec1 32 ssa_480 = imul ssa_456, ssa_424
vec1 32 ssa_481 = imul ssa_457, ssa_425
vec1 32 ssa_482 = imul ssa_458, ssa_426
vec1 32 ssa_483 = imul ssa_459, ssa_427
vec1 32 ssa_484 = iadd ssa_476, ssa_452
vec1 32 ssa_485 = iadd ssa_477, ssa_453
vec1 32 ssa_486 = iadd ssa_478, ssa_454
vec1 32 ssa_487 = iadd ssa_479, ssa_455
vec1 32 ssa_488 = iadd ssa_480, ssa_456
vec1 32 ssa_489 = iadd ssa_481, ssa_457
vec1 32 ssa_490 = iadd ssa_482, ssa_458
vec1 32 ssa_491 = iadd ssa_483, ssa_459
vec1 32 ssa_492 = imul ssa_468, ssa_436
vec1 32 ssa_493 = imul ssa_469, ssa_437
vec1 32 ssa_494 = imul ssa_470, ssa_438
vec1 32 ssa_495 = imul ssa_471, ssa_439
vec1 32 ssa_496 = imul ssa_472, ssa_440
vec1 32 ssa_497 = imul ssa_473, ssa_441
vec1 32 ssa_498 = imul ssa_474, ssa_442
vec1 32 ssa_499 = imul ssa_475, ssa_443
vec1 32 ssa_500 = iadd ssa_492, ssa_468
vec1 32 ssa_501 = iadd ssa_493, ssa_469
vec1 32 ssa_502 = iadd ssa_494, ssa_470
vec1 32 ssa_503 = iadd ssa_495, ssa_471
vec1 32 ssa_504 = iadd ssa_496, ssa_472
vec1 32 ssa_505 = iadd ssa_497, ssa_473
vec1 32 ssa_506 = iadd ssa_498, ssa_474
vec1 32 ssa_507 = iadd ssa_499, ssa_475
vec1 32 ssa_508 = imul ssa_484, ssa_452
vec1 32 ssa_509 = imul ssa_485, ssa_453
vec1 32 ssa_510 = imul ssa_486, ssa_454
vec1 32 ssa_511 = imul ssa_487, ssa_455
vec1 32 ssa_512 = imul ssa_488, ssa_456
vec1 32 ssa_513 = imul ssa_489, ssa_457
vec1 32 ssa_514 = imul ssa_490, ssa_458
vec1 32 ssa_515 = imul ssa_491, ssa_459
vec1 32 ssa_516 = iadd ssa_508, ssa_484
vec1 32 ssa_517 = iadd ssa_509, ssa_485
vec1 32 ssa_518 = iadd ssa_510, ssa_486
vec1 32 ssa_519 = iadd ssa_511, ssa_487
vec1 32 ssa_520 = iadd ssa_512, ssa_488
vec1 32 ssa_521 = iadd ssa_513, ssa_489
vec1 32 ssa_522 = iadd ssa_514, ssa_490
vec1 32 ssa_523 = iadd ssa_515, ssa_491
vec1 32 ssa_524 = imul ssa_500, ssa_468
vec1 32 ssa_525 = imul ssa_501, ssa_469
vec1 32 ssa_526 = imul ssa_502, ssa_470
vec1 32 ssa_527 = imul ssa_503, ssa_471
vec1 32 ssa_528 = imul ssa_504, ssa_472
vec1 32 ssa_529 = imul ssa_505, ssa_473
vec1 32 ssa_530 = imul ssa_506, ssa_474
vec1 32 ssa_531 = imul ssa_507, ssa_475
vec1 32 ssa_532 = iadd ssa_524, ssa_500
vec1 32 ssa_533 = iadd ssa_525, ssa_501
vec1 32 ssa_534 = iadd ssa_526, ssa_502
vec1 32 ssa_535 = iadd ssa_527, ssa_503
vec1 32 ssa_536 = iadd ssa_528, ssa_504
vec1 32 ssa_537 = iadd ssa_529, ssa_505
vec1 32 ssa_538 = iadd ssa_530, ssa_506
vec1 32 ssa_539 = iadd ssa_531, ssa_507
vec1 32 ssa_540 = imul ssa_516, ssa_484
vec1 32 ssa_541 = imul ssa_517, ssa_485
vec1 32 ssa_542 = imul ssa_518, ssa_486
vec1 32 ssa_543 = imul ssa_519, ssa_487
vec1 32 ssa_544 = imul ssa_520, ssa_488
vec1 32 ssa_545 = imul ssa_521, ssa_489
vec1 32 ssa_546 = imul ssa_522, ssa_490
vec1 32 ssa_547 = imul ssa_523, ssa_491
vec1 32 ssa_548 = iadd ssa_540, ssa_516
vec1 32 ssa_549 = iadd ssa_541, ssa_517
vec1 32 ssa_550 = iadd ssa_542, ssa_518
vec1 32 ssa_551 = iadd ssa_543, ssa_519
vec1 32 ssa_552 = iadd ssa_544, ssa_520
vec1 32 ssa_553 = iadd ssa_545, ssa_521
vec1 32 ssa_554 = iadd ssa_546, ssa_522
vec1 32 ssa_555 = iadd ssa_547, ssa_523
vec1 32 ssa_556 = imul ssa_532, ssa_500
vec1 32 ssa_557 = imul ssa_533, ssa_501
vec1 32 ssa_558 = imul ssa_534, ssa_502
vec1 32 ssa_559 = imul ssa_535, ssa_503
vec1 32 ssa_560 = imul ssa_536, ssa_504
vec1 32 ssa_561 = imul ssa_537, ssa_505
vec1 32 ssa_562 = imul ssa_538, ssa_506
vec1 32 ssa_563 = imul ssa_539, ssa_507
vec1 32 ssa_564 = iadd ssa_556, ssa_532
vec1 32 ssa_565 = iadd ssa_557, ssa_533
vec1 32 ssa_566 = iadd ssa_558, ssa_534
vec1 32 ssa_567 = iadd ssa_559, ssa_535
vec1 32 ssa_568 = iadd ssa_560, ssa_536
vec1 32 ssa_569 = iadd ssa_561, ssa_537
vec1 32 ssa_570 = iadd ssa_562, ssa_538
vec1 32 ssa_571 = iadd ssa_563, ssa_539
vec1 32 ssa_572 = imul ssa_548, ssa_516
vec1 32 ssa_573 = imul ssa_549, ssa_517
vec1 32 ssa_574 = imul ssa_550, ssa_518
vec1 32 ssa_575 = imul ssa_551, ssa_519
vec1 32 ssa_576 = imul ssa_552, ssa_520
vec1 32 ssa_577 = imul ssa_553, ssa_521
vec1 32 ssa_578 = imul ssa_554, ssa_522
vec1 32 ssa_579 = imul ssa_555, ssa_523
vec1 32 ssa_580 = iadd ssa_572, ssa_548
vec1 32 ssa_581 = iadd ssa_573, ssa_549
vec1 32 ssa_582 = iadd ssa_574, ssa_550
vec1 32 ssa_583 = iadd ssa_575, ssa_551
vec1 32 ssa_584 = iadd ssa_576, ssa_552
vec1 32 ssa_585 = iadd ssa_577, ssa_553
vec1 32 ssa_586 = iadd ssa_578, ssa_554
vec1 32 ssa_587 = iadd ssa_579, ssa_555
vec1 32 ssa_588 = iadd ssa_74, ssa_1
/* succs: block_1 */
}
block block_5:
/* preds: block_2 */
vec1 32 ssa_589 = iadd ssa_54, ssa_46
vec1 32 ssa_590 = iadd ssa_55, ssa_47
vec1 32 ssa_591 = iadd ssa_56, ssa_48
vec1 32 ssa_592 = iadd ssa_57, ssa_49
vec1 32 ssa_593 = iadd ssa_50, ssa_42
vec1 32 ssa_594 = iadd ssa_51, ssa_43
vec1 32 ssa_595 = iadd ssa_52, ssa_44
vec1 32 ssa_596 = iadd ssa_53, ssa_45
vec1 32 ssa_597 = iadd ssa_589, ssa_593
vec1 32 ssa_598 = iadd ssa_590, ssa_594
vec1 32 ssa_599 = iadd ssa_591, ssa_595
vec1 32 ssa_600 = iadd ssa_592, ssa_596
vec1 32 ssa_601 = iadd ssa_597, ssa_599
vec1 32 ssa_602 = iadd ssa_598, ssa_600
vec1 32 ssa_603 = ishl ssa_22, ssa_2
vec1 32 ssa_604 = iadd ssa_601, ssa_602
vec1 64 ssa_605 = intrinsic load_ssbo_address (ssa_0) ()
vec1 32 ssa_606 = unpack_64_2x32_split_x ssa_605
vec1 32 ssa_607 = unpack_64_2x32_split_y ssa_605
vec1 32 ssa_608 = iadd ssa_606, ssa_603
vec1 32 ssa_609 = ult32 ssa_608, ssa_606
vec1 32 ssa_610 = b2i32 ssa_609
vec1 32 ssa_611 = iadd ssa_610, ssa_607
vec1 64 ssa_612 = pack_64_2x32_split ssa_608, ssa_611
intrinsic store_global (ssa_604, ssa_612) (wrmask=x /*1*/, access=0, align_mul=4, align_offset=0)
/* succs: block_6 */
block block_6:
}
block0 {
617 = MOV.i32 r62
616 = MOV.i32 r61
615 = MOV.i32 r60
18 = LSHIFT_OR.i32 616, #0x0, #0x8.b0
19 = IADD.s32 615, 18
614 = MKVEC.v2i16 #0x0.h00, 617.h00
22 = IADD.s32 19, 614
24 = IADD.s32 u1, #0x1
25 = IADD.s32 u1, #0x2
26 = IADD.s32 u1, #0x3
27 = IADD.s32 u1, #0x4
28 = IADD.s32 u1, #0x5
29 = IADD.s32 u1, #0x6
30 = IADD.s32 u1, #0x7
31 = IADD.s32 u1, #0x8
32 = IADD.s32 u1, #0x9
33 = IADD.s32 u1, #0xa
34 = IADD.s32 u1, #0xb
35 = IADD.s32 u1, #0xc
36 = IADD.s32 u1, #0xd
37 = IADD.s32 u1, #0xe
38 = IADD.s32 u1, #0xf
39 = U32_TO_F32 22
40 = FMA.f32 39, #0x2edbe6ff, #0x0.neg
41 = F32_TO_S32.rtz 40
} -> block1
block1 {
42 = PHI 41, 584
43 = PHI 41, 585
44 = PHI 41, 586
45 = PHI 41, 587
46 = PHI 41, 580
47 = PHI 41, 581
48 = PHI 41, 582
49 = PHI 41, 583
50 = PHI 41, 568
51 = PHI 41, 569
52 = PHI 41, 570
53 = PHI 41, 571
54 = PHI 41, 564
55 = PHI 41, 565
56 = PHI 41, 566
57 = PHI 41, 567
58 = PHI 35, 552
59 = PHI 36, 553
60 = PHI 37, 554
61 = PHI 38, 555
62 = PHI 31, 548
63 = PHI 32, 549
64 = PHI 33, 550
65 = PHI 34, 551
66 = PHI 27, 536
67 = PHI 28, 537
68 = PHI 29, 538
69 = PHI 30, 539
70 = PHI u1, 532
71 = PHI 24, 533
72 = PHI 25, 534
73 = PHI 26, 535
74 = PHI #0x0, 588
75 = ICMP.s32.m1.ge 74, #0x8
BRANCHZ.i16.eq 75.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
76 = IMUL.i32 54, 70
77 = IMUL.i32 55, 71
78 = IMUL.i32 56, 72
79 = IMUL.i32 57, 73
80 = IMUL.i32 50, 66
81 = IMUL.i32 51, 67
82 = IMUL.i32 52, 68
83 = IMUL.i32 53, 69
84 = IADD.s32 76, 54
85 = IADD.s32 77, 55
86 = IADD.s32 78, 56
87 = IADD.s32 79, 57
88 = IADD.s32 80, 50
89 = IADD.s32 81, 51
90 = IADD.s32 82, 52
91 = IADD.s32 83, 53
92 = IMUL.i32 46, 62
93 = IMUL.i32 47, 63
94 = IMUL.i32 48, 64
95 = IMUL.i32 49, 65
96 = IMUL.i32 42, 58
97 = IMUL.i32 43, 59
98 = IMUL.i32 44, 60
99 = IMUL.i32 45, 61
100 = IADD.s32 92, 46
101 = IADD.s32 93, 47
102 = IADD.s32 94, 48
103 = IADD.s32 95, 49
104 = IADD.s32 96, 42
105 = IADD.s32 97, 43
106 = IADD.s32 98, 44
107 = IADD.s32 99, 45
108 = IMUL.i32 84, 54
109 = IMUL.i32 85, 55
110 = IMUL.i32 86, 56
111 = IMUL.i32 87, 57
112 = IMUL.i32 88, 50
113 = IMUL.i32 89, 51
114 = IMUL.i32 90, 52
115 = IMUL.i32 91, 53
116 = IADD.s32 108, 84
117 = IADD.s32 109, 85
118 = IADD.s32 110, 86
119 = IADD.s32 111, 87
120 = IADD.s32 112, 88
121 = IADD.s32 113, 89
122 = IADD.s32 114, 90
123 = IADD.s32 115, 91
124 = IMUL.i32 100, 46
125 = IMUL.i32 101, 47
126 = IMUL.i32 102, 48
127 = IMUL.i32 103, 49
128 = IMUL.i32 104, 42
129 = IMUL.i32 105, 43
130 = IMUL.i32 106, 44
131 = IMUL.i32 107, 45
132 = IADD.s32 124, 100
133 = IADD.s32 125, 101
134 = IADD.s32 126, 102
135 = IADD.s32 127, 103
136 = IADD.s32 128, 104
137 = IADD.s32 129, 105
138 = IADD.s32 130, 106
139 = IADD.s32 131, 107
140 = IMUL.i32 116, 84
141 = IMUL.i32 117, 85
142 = IMUL.i32 118, 86
143 = IMUL.i32 119, 87
144 = IMUL.i32 120, 88
145 = IMUL.i32 121, 89
146 = IMUL.i32 122, 90
147 = IMUL.i32 123, 91
148 = IADD.s32 140, 116
149 = IADD.s32 141, 117
150 = IADD.s32 142, 118
151 = IADD.s32 143, 119
152 = IADD.s32 144, 120
153 = IADD.s32 145, 121
154 = IADD.s32 146, 122
155 = IADD.s32 147, 123
156 = IMUL.i32 132, 100
157 = IMUL.i32 133, 101
158 = IMUL.i32 134, 102
159 = IMUL.i32 135, 103
160 = IMUL.i32 136, 104
161 = IMUL.i32 137, 105
162 = IMUL.i32 138, 106
163 = IMUL.i32 139, 107
164 = IADD.s32 156, 132
165 = IADD.s32 157, 133
166 = IADD.s32 158, 134
167 = IADD.s32 159, 135
168 = IADD.s32 160, 136
169 = IADD.s32 161, 137
170 = IADD.s32 162, 138
171 = IADD.s32 163, 139
172 = IMUL.i32 148, 116
173 = IMUL.i32 149, 117
174 = IMUL.i32 150, 118
175 = IMUL.i32 151, 119
176 = IMUL.i32 152, 120
177 = IMUL.i32 153, 121
178 = IMUL.i32 154, 122
179 = IMUL.i32 155, 123
180 = IADD.s32 172, 148
181 = IADD.s32 173, 149
182 = IADD.s32 174, 150
183 = IADD.s32 175, 151
184 = IADD.s32 176, 152
185 = IADD.s32 177, 153
186 = IADD.s32 178, 154
187 = IADD.s32 179, 155
188 = IMUL.i32 164, 132
189 = IMUL.i32 165, 133
190 = IMUL.i32 166, 134
191 = IMUL.i32 167, 135
192 = IMUL.i32 168, 136
193 = IMUL.i32 169, 137
194 = IMUL.i32 170, 138
195 = IMUL.i32 171, 139
196 = IADD.s32 188, 164
197 = IADD.s32 189, 165
198 = IADD.s32 190, 166
199 = IADD.s32 191, 167
200 = IADD.s32 192, 168
201 = IADD.s32 193, 169
202 = IADD.s32 194, 170
203 = IADD.s32 195, 171
204 = IMUL.i32 180, 148
205 = IMUL.i32 181, 149
206 = IMUL.i32 182, 150
207 = IMUL.i32 183, 151
208 = IMUL.i32 184, 152
209 = IMUL.i32 185, 153
210 = IMUL.i32 186, 154
211 = IMUL.i32 187, 155
212 = IADD.s32 204, 180
213 = IADD.s32 205, 181
214 = IADD.s32 206, 182
215 = IADD.s32 207, 183
216 = IADD.s32 208, 184
217 = IADD.s32 209, 185
218 = IADD.s32 210, 186
219 = IADD.s32 211, 187
220 = IMUL.i32 196, 164
221 = IMUL.i32 197, 165
222 = IMUL.i32 198, 166
223 = IMUL.i32 199, 167
224 = IMUL.i32 200, 168
225 = IMUL.i32 201, 169
226 = IMUL.i32 202, 170
227 = IMUL.i32 203, 171
228 = IADD.s32 220, 196
229 = IADD.s32 221, 197
230 = IADD.s32 222, 198
231 = IADD.s32 223, 199
232 = IADD.s32 224, 200
233 = IADD.s32 225, 201
234 = IADD.s32 226, 202
235 = IADD.s32 227, 203
236 = IMUL.i32 212, 180
237 = IMUL.i32 213, 181
238 = IMUL.i32 214, 182
239 = IMUL.i32 215, 183
240 = IMUL.i32 216, 184
241 = IMUL.i32 217, 185
242 = IMUL.i32 218, 186
243 = IMUL.i32 219, 187
244 = IADD.s32 236, 212
245 = IADD.s32 237, 213
246 = IADD.s32 238, 214
247 = IADD.s32 239, 215
248 = IADD.s32 240, 216
249 = IADD.s32 241, 217
250 = IADD.s32 242, 218
251 = IADD.s32 243, 219
252 = IMUL.i32 228, 196
253 = IMUL.i32 229, 197
254 = IMUL.i32 230, 198
255 = IMUL.i32 231, 199
256 = IMUL.i32 232, 200
257 = IMUL.i32 233, 201
258 = IMUL.i32 234, 202
259 = IMUL.i32 235, 203
260 = IADD.s32 252, 228
261 = IADD.s32 253, 229
262 = IADD.s32 254, 230
263 = IADD.s32 255, 231
264 = IADD.s32 256, 232
265 = IADD.s32 257, 233
266 = IADD.s32 258, 234
267 = IADD.s32 259, 235
268 = IMUL.i32 244, 212
269 = IMUL.i32 245, 213
270 = IMUL.i32 246, 214
271 = IMUL.i32 247, 215
272 = IMUL.i32 248, 216
273 = IMUL.i32 249, 217
274 = IMUL.i32 250, 218
275 = IMUL.i32 251, 219
276 = IADD.s32 268, 244
277 = IADD.s32 269, 245
278 = IADD.s32 270, 246
279 = IADD.s32 271, 247
280 = IADD.s32 272, 248
281 = IADD.s32 273, 249
282 = IADD.s32 274, 250
283 = IADD.s32 275, 251
284 = IMUL.i32 260, 228
285 = IMUL.i32 261, 229
286 = IMUL.i32 262, 230
287 = IMUL.i32 263, 231
288 = IMUL.i32 264, 232
289 = IMUL.i32 265, 233
290 = IMUL.i32 266, 234
291 = IMUL.i32 267, 235
292 = IADD.s32 284, 260
293 = IADD.s32 285, 261
294 = IADD.s32 286, 262
295 = IADD.s32 287, 263
296 = IADD.s32 288, 264
297 = IADD.s32 289, 265
298 = IADD.s32 290, 266
299 = IADD.s32 291, 267
300 = IMUL.i32 276, 244
301 = IMUL.i32 277, 245
302 = IMUL.i32 278, 246
303 = IMUL.i32 279, 247
304 = IMUL.i32 280, 248
305 = IMUL.i32 281, 249
306 = IMUL.i32 282, 250
307 = IMUL.i32 283, 251
308 = IADD.s32 300, 276
309 = IADD.s32 301, 277
310 = IADD.s32 302, 278
311 = IADD.s32 303, 279
312 = IADD.s32 304, 280
313 = IADD.s32 305, 281
314 = IADD.s32 306, 282
315 = IADD.s32 307, 283
316 = IMUL.i32 292, 260
317 = IMUL.i32 293, 261
318 = IMUL.i32 294, 262
319 = IMUL.i32 295, 263
320 = IMUL.i32 296, 264
321 = IMUL.i32 297, 265
322 = IMUL.i32 298, 266
323 = IMUL.i32 299, 267
324 = IADD.s32 316, 292
325 = IADD.s32 317, 293
326 = IADD.s32 318, 294
327 = IADD.s32 319, 295
328 = IADD.s32 320, 296
329 = IADD.s32 321, 297
330 = IADD.s32 322, 298
331 = IADD.s32 323, 299
332 = IMUL.i32 308, 276
333 = IMUL.i32 309, 277
334 = IMUL.i32 310, 278
335 = IMUL.i32 311, 279
336 = IMUL.i32 312, 280
337 = IMUL.i32 313, 281
338 = IMUL.i32 314, 282
339 = IMUL.i32 315, 283
340 = IADD.s32 332, 308
341 = IADD.s32 333, 309
342 = IADD.s32 334, 310
343 = IADD.s32 335, 311
344 = IADD.s32 336, 312
345 = IADD.s32 337, 313
346 = IADD.s32 338, 314
347 = IADD.s32 339, 315
348 = IMUL.i32 324, 292
349 = IMUL.i32 325, 293
350 = IMUL.i32 326, 294
351 = IMUL.i32 327, 295
352 = IMUL.i32 328, 296
353 = IMUL.i32 329, 297
354 = IMUL.i32 330, 298
355 = IMUL.i32 331, 299
356 = IADD.s32 348, 324
357 = IADD.s32 349, 325
358 = IADD.s32 350, 326
359 = IADD.s32 351, 327
360 = IADD.s32 352, 328
361 = IADD.s32 353, 329
362 = IADD.s32 354, 330
363 = IADD.s32 355, 331
364 = IMUL.i32 340, 308
365 = IMUL.i32 341, 309
366 = IMUL.i32 342, 310
367 = IMUL.i32 343, 311
368 = IMUL.i32 344, 312
369 = IMUL.i32 345, 313
370 = IMUL.i32 346, 314
371 = IMUL.i32 347, 315
372 = IADD.s32 364, 340
373 = IADD.s32 365, 341
374 = IADD.s32 366, 342
375 = IADD.s32 367, 343
376 = IADD.s32 368, 344
377 = IADD.s32 369, 345
378 = IADD.s32 370, 346
379 = IADD.s32 371, 347
380 = IMUL.i32 356, 324
381 = IMUL.i32 357, 325
382 = IMUL.i32 358, 326
383 = IMUL.i32 359, 327
384 = IMUL.i32 360, 328
385 = IMUL.i32 361, 329
386 = IMUL.i32 362, 330
387 = IMUL.i32 363, 331
388 = IADD.s32 380, 356
389 = IADD.s32 381, 357
390 = IADD.s32 382, 358
391 = IADD.s32 383, 359
392 = IADD.s32 384, 360
393 = IADD.s32 385, 361
394 = IADD.s32 386, 362
395 = IADD.s32 387, 363
396 = IMUL.i32 372, 340
397 = IMUL.i32 373, 341
398 = IMUL.i32 374, 342
399 = IMUL.i32 375, 343
400 = IMUL.i32 376, 344
401 = IMUL.i32 377, 345
402 = IMUL.i32 378, 346
403 = IMUL.i32 379, 347
404 = IADD.s32 396, 372
405 = IADD.s32 397, 373
406 = IADD.s32 398, 374
407 = IADD.s32 399, 375
408 = IADD.s32 400, 376
409 = IADD.s32 401, 377
410 = IADD.s32 402, 378
411 = IADD.s32 403, 379
412 = IMUL.i32 388, 356
413 = IMUL.i32 389, 357
414 = IMUL.i32 390, 358
415 = IMUL.i32 391, 359
416 = IMUL.i32 392, 360
417 = IMUL.i32 393, 361
418 = IMUL.i32 394, 362
419 = IMUL.i32 395, 363
420 = IADD.s32 412, 388
421 = IADD.s32 413, 389
422 = IADD.s32 414, 390
423 = IADD.s32 415, 391
424 = IADD.s32 416, 392
425 = IADD.s32 417, 393
426 = IADD.s32 418, 394
427 = IADD.s32 419, 395
428 = IMUL.i32 404, 372
429 = IMUL.i32 405, 373
430 = IMUL.i32 406, 374
431 = IMUL.i32 407, 375
432 = IMUL.i32 408, 376
433 = IMUL.i32 409, 377
434 = IMUL.i32 410, 378
435 = IMUL.i32 411, 379
436 = IADD.s32 428, 404
437 = IADD.s32 429, 405
438 = IADD.s32 430, 406
439 = IADD.s32 431, 407
440 = IADD.s32 432, 408
441 = IADD.s32 433, 409
442 = IADD.s32 434, 410
443 = IADD.s32 435, 411
444 = IMUL.i32 420, 388
445 = IMUL.i32 421, 389
446 = IMUL.i32 422, 390
447 = IMUL.i32 423, 391
448 = IMUL.i32 424, 392
449 = IMUL.i32 425, 393
450 = IMUL.i32 426, 394
451 = IMUL.i32 427, 395
452 = IADD.s32 444, 420
453 = IADD.s32 445, 421
454 = IADD.s32 446, 422
455 = IADD.s32 447, 423
456 = IADD.s32 448, 424
457 = IADD.s32 449, 425
458 = IADD.s32 450, 426
459 = IADD.s32 451, 427
460 = IMUL.i32 436, 404
461 = IMUL.i32 437, 405
462 = IMUL.i32 438, 406
463 = IMUL.i32 439, 407
464 = IMUL.i32 440, 408
465 = IMUL.i32 441, 409
466 = IMUL.i32 442, 410
467 = IMUL.i32 443, 411
468 = IADD.s32 460, 436
469 = IADD.s32 461, 437
470 = IADD.s32 462, 438
471 = IADD.s32 463, 439
472 = IADD.s32 464, 440
473 = IADD.s32 465, 441
474 = IADD.s32 466, 442
475 = IADD.s32 467, 443
476 = IMUL.i32 452, 420
477 = IMUL.i32 453, 421
478 = IMUL.i32 454, 422
479 = IMUL.i32 455, 423
480 = IMUL.i32 456, 424
481 = IMUL.i32 457, 425
482 = IMUL.i32 458, 426
483 = IMUL.i32 459, 427
484 = IADD.s32 476, 452
485 = IADD.s32 477, 453
486 = IADD.s32 478, 454
487 = IADD.s32 479, 455
488 = IADD.s32 480, 456
489 = IADD.s32 481, 457
490 = IADD.s32 482, 458
491 = IADD.s32 483, 459
492 = IMUL.i32 468, 436
493 = IMUL.i32 469, 437
494 = IMUL.i32 470, 438
495 = IMUL.i32 471, 439
496 = IMUL.i32 472, 440
497 = IMUL.i32 473, 441
498 = IMUL.i32 474, 442
499 = IMUL.i32 475, 443
500 = IADD.s32 492, 468
501 = IADD.s32 493, 469
502 = IADD.s32 494, 470
503 = IADD.s32 495, 471
504 = IADD.s32 496, 472
505 = IADD.s32 497, 473
506 = IADD.s32 498, 474
507 = IADD.s32 499, 475
508 = IMUL.i32 484, 452
509 = IMUL.i32 485, 453
510 = IMUL.i32 486, 454
511 = IMUL.i32 487, 455
512 = IMUL.i32 488, 456
513 = IMUL.i32 489, 457
514 = IMUL.i32 490, 458
515 = IMUL.i32 491, 459
516 = IADD.s32 508, 484
517 = IADD.s32 509, 485
518 = IADD.s32 510, 486
519 = IADD.s32 511, 487
520 = IADD.s32 512, 488
521 = IADD.s32 513, 489
522 = IADD.s32 514, 490
523 = IADD.s32 515, 491
524 = IMUL.i32 500, 468
525 = IMUL.i32 501, 469
526 = IMUL.i32 502, 470
527 = IMUL.i32 503, 471
528 = IMUL.i32 504, 472
529 = IMUL.i32 505, 473
530 = IMUL.i32 506, 474
531 = IMUL.i32 507, 475
532 = IADD.s32 524, 500
533 = IADD.s32 525, 501
534 = IADD.s32 526, 502
535 = IADD.s32 527, 503
536 = IADD.s32 528, 504
537 = IADD.s32 529, 505
538 = IADD.s32 530, 506
539 = IADD.s32 531, 507
540 = IMUL.i32 516, 484
541 = IMUL.i32 517, 485
542 = IMUL.i32 518, 486
543 = IMUL.i32 519, 487
544 = IMUL.i32 520, 488
545 = IMUL.i32 521, 489
546 = IMUL.i32 522, 490
547 = IMUL.i32 523, 491
548 = IADD.s32 540, 516
549 = IADD.s32 541, 517
550 = IADD.s32 542, 518
551 = IADD.s32 543, 519
552 = IADD.s32 544, 520
553 = IADD.s32 545, 521
554 = IADD.s32 546, 522
555 = IADD.s32 547, 523
556 = IMUL.i32 532, 500
557 = IMUL.i32 533, 501
558 = IMUL.i32 534, 502
559 = IMUL.i32 535, 503
560 = IMUL.i32 536, 504
561 = IMUL.i32 537, 505
562 = IMUL.i32 538, 506
563 = IMUL.i32 539, 507
564 = IADD.s32 556, 532
565 = IADD.s32 557, 533
566 = IADD.s32 558, 534
567 = IADD.s32 559, 535
568 = IADD.s32 560, 536
569 = IADD.s32 561, 537
570 = IADD.s32 562, 538
571 = IADD.s32 563, 539
572 = IMUL.i32 548, 516
573 = IMUL.i32 549, 517
574 = IMUL.i32 550, 518
575 = IMUL.i32 551, 519
576 = IMUL.i32 552, 520
577 = IMUL.i32 553, 521
578 = IMUL.i32 554, 522
579 = IMUL.i32 555, 523
580 = IADD.s32 572, 548
581 = IADD.s32 573, 549
582 = IADD.s32 574, 550
583 = IADD.s32 575, 551
584 = IADD.s32 576, 552
585 = IADD.s32 577, 553
586 = IADD.s32 578, 554
587 = IADD.s32 579, 555
588 = IADD.s32 74, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
589 = IADD.s32 54, 46
590 = IADD.s32 55, 47
591 = IADD.s32 56, 48
592 = IADD.s32 57, 49
593 = IADD.s32 50, 42
594 = IADD.s32 51, 43
595 = IADD.s32 52, 44
596 = IADD.s32 53, 45
597 = IADD.s32 589, 593
598 = IADD.s32 590, 594
599 = IADD.s32 591, 595
600 = IADD.s32 592, 596
601 = IADD.s32 597, 599
602 = IADD.s32 598, 600
603 = LSHIFT_OR.i32 22, #0x0, #0x2.b0
604 = IADD.s32 601, 602
608 = IADD.s32 u0, 603
610 = ICMP.u32.i1.lt 608, u0
611 = IADD.s32 610, u0[1]
STORE.i32 604, 608, 611, byte_offset:0
} from block2
block0 {
r0 = LSHIFT_OR.i32 r61, #0x0, #0x8.b0
r0 = IADD.s32 r60, r0
r1 = MKVEC.v2i16 #0x0.h00, r62.h00
r0 = IADD.s32 r0, r1
r1 = MOV.i32 #0x1
r1 = IADD.s32 u1, r1
r2 = MOV.i32 #0x2
r2 = IADD.s32 u1, r2
r3 = MOV.i32 #0x3
r3 = IADD.s32 u1, r3
r4 = MOV.i32 #0x4
r4 = IADD.s32 u1, r4
r5 = MOV.i32 #0x5
r5 = IADD.s32 u1, r5
r6 = MOV.i32 #0x6
r6 = IADD.s32 u1, r6
r7 = MOV.i32 #0x7
r7 = IADD.s32 u1, r7
r8 = MOV.i32 #0x8
r8 = IADD.s32 u1, r8
r9 = MOV.i32 #0x9
r9 = IADD.s32 u1, r9
r10 = MOV.i32 #0xa
r10 = IADD.s32 u1, r10
r11 = MOV.i32 #0xb
r11 = IADD.s32 u1, r11
r12 = MOV.i32 #0xc
r12 = IADD.s32 u1, r12
r13 = MOV.i32 #0xd
r13 = IADD.s32 u1, r13
r14 = MOV.i32 #0xe
r14 = IADD.s32 u1, r14
r15 = MOV.i32 #0xf
r15 = IADD.s32 u1, r15
r16 = U32_TO_F32 r0
r16 = FMA.f32 r16, #0x2edbe6ff, #0x0.neg
r16 = F32_TO_S32.rtz r16
r17 = MOV.i32 r16
r18 = MOV.i32 r16
r19 = MOV.i32 r16
r20 = MOV.i32 r16
r21 = MOV.i32 r16
r22 = MOV.i32 r16
r23 = MOV.i32 r16
r24 = MOV.i32 r16
r25 = MOV.i32 r16
r26 = MOV.i32 r16
r27 = MOV.i32 r16
r28 = MOV.i32 r16
r29 = MOV.i32 r16
r30 = MOV.i32 r16
r31 = MOV.i32 r16
r32 = MOV.i32 u1
r33 = MOV.i32 #0x0
} -> block1
block1 {
r34 = ICMP.s32.m1.ge r33, #0x8
BRANCHZ.i16.eq r34.h00, #0x0 -> block3
} -> block3 block2 from block0 block4
block2 {
JUMP #0x0 -> block5
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r32 = IMUL.i32 r28, r32
r1 = IMUL.i32 r29, r1
r2 = IMUL.i32 r30, r2
r3 = IMUL.i32 r31, r3
r4 = IMUL.i32 r24, r4
r5 = IMUL.i32 r25, r5
r6 = IMUL.i32 r26, r6
r7 = IMUL.i32 r27, r7
r32 = IADD.s32 r32, r28
r1 = IADD.s32 r1, r29
r2 = IADD.s32 r2, r30
r3 = IADD.s32 r3, r31
r4 = IADD.s32 r4, r24
r5 = IADD.s32 r5, r25
r6 = IADD.s32 r6, r26
r7 = IADD.s32 r7, r27
r8 = IMUL.i32 r20, r8
r9 = IMUL.i32 r21, r9
r10 = IMUL.i32 r22, r10
r11 = IMUL.i32 r23, r11
r12 = IMUL.i32 r16, r12
r13 = IMUL.i32 r17, r13
r14 = IMUL.i32 r18, r14
r15 = IMUL.i32 r19, r15
r8 = IADD.s32 r8, r20
r9 = IADD.s32 r9, r21
r10 = IADD.s32 r10, r22
r11 = IADD.s32 r11, r23
r12 = IADD.s32 r12, r16
r13 = IADD.s32 r13, r17
r14 = IADD.s32 r14, r18
r15 = IADD.s32 r15, r19
r28 = IMUL.i32 r32, r28
r29 = IMUL.i32 r1, r29
r30 = IMUL.i32 r2, r30
r31 = IMUL.i32 r3, r31
r24 = IMUL.i32 r4, r24
r25 = IMUL.i32 r5, r25
r26 = IMUL.i32 r6, r26
r27 = IMUL.i32 r7, r27
r28 = IADD.s32 r28, r32
r29 = IADD.s32 r29, r1
r30 = IADD.s32 r30, r2
r31 = IADD.s32 r31, r3
r24 = IADD.s32 r24, r4
r25 = IADD.s32 r25, r5
r26 = IADD.s32 r26, r6
r27 = IADD.s32 r27, r7
r20 = IMUL.i32 r8, r20
r21 = IMUL.i32 r9, r21
r22 = IMUL.i32 r10, r22
r23 = IMUL.i32 r11, r23
r16 = IMUL.i32 r12, r16
r17 = IMUL.i32 r13, r17
r18 = IMUL.i32 r14, r18
r19 = IMUL.i32 r15, r19
r20 = IADD.s32 r20, r8
r21 = IADD.s32 r21, r9
r22 = IADD.s32 r22, r10
r23 = IADD.s32 r23, r11
r16 = IADD.s32 r16, r12
r17 = IADD.s32 r17, r13
r18 = IADD.s32 r18, r14
r19 = IADD.s32 r19, r15
r33 = IADD.s32 r33, #0x1
JUMP #0x0 -> block1
} -> block1 from block3
block5 {
r1 = IADD.s32 r28, r20
r2 = IADD.s32 r29, r21
r3 = IADD.s32 r30, r22
r4 = IADD.s32 r31, r23
r5 = IADD.s32 r24, r16
r6 = IADD.s32 r25, r17
r7 = IADD.s32 r26, r18
r8 = IADD.s32 r27, r19
r1 = IADD.s32 r1, r5
r2 = IADD.s32 r2, r6
r3 = IADD.s32 r3, r7
r4 = IADD.s32 r4, r8
r1 = IADD.s32 r1, r3
r2 = IADD.s32 r2, r4
r0 = LSHIFT_OR.i32 r0, #0x0, #0x2.b0
r1 = IADD.s32 r1, r2
r0 = IADD.s32 u0, r0
r2 = ICMP.u32.i1.lt r0, u0
r2 = IADD.s32 r2, u0[1]
STORE.i32 r1, r0, r2, byte_offset:0
} from block2
block0 {
id(0) nbb
* _.h00 = LSHIFT_OR.i32 r61, t, fau.y.b0
+ _.h00 = IADD.s32 r60, t
* _.h00 = MKVEC.v2i16 t.h00, r62.h00
+ r0 = IADD.s32 t1, t
* r1 = MOV.i32 fau.x
+ r2 = MOV.i32 fau.y
* r3 = MOV.i32 fau.x
+ r4 = MOV.i32 fau.y
* r5 = MOV.i32 fau.x
+ r6 = MOV.i32 fau.y
* NOP
+ r7 = MOV.i32 fau.x
200000001 400000003 600000005 800000007
id(0) nbb
* r8 = MOV.i32 fau.x
+ r9 = MOV.i32 fau.y
* r10 = MOV.i32 fau.x
+ r11 = MOV.i32 fau.y
* r12 = MOV.i32 fau.x
+ r13 = MOV.i32 fau.y
* r14 = MOV.i32 fau.x
+ _.h00 = U32_TO_F32 r0
* _.h00 = FMA.f32 t1, fau.y, t.neg
+ r16 = F32_TO_S32.rtz t
* r15 = MOV.i32 fau.y
+ r17 = MOV.i32 t1
* NOP
+ r18 = MOV.i32 r16
900000008 b0000000a d0000000c 2edbe6ff0000000e f00000000
id(0) nbb
* r19 = MOV.i32 r16
+ r20 = MOV.i32 r16
* r21 = MOV.i32 r16
+ r1 = IADD.s32 fau.x, r1
* r22 = MOV.i32 r16
+ r2 = IADD.s32 fau.x, r2
* r23 = MOV.i32 r16
+ r3 = IADD.s32 fau.x, r3
* r24 = MOV.i32 r16
+ r4 = IADD.s32 fau.x, r4
* r25 = MOV.i32 r16
+ r5 = IADD.s32 fau.x, r5
* r26 = MOV.i32 r16
+ r6 = IADD.s32 fau.x, r6
* NOP
+ r7 = IADD.s32 fau.x, r7
id(0) nbb r_uncond
* r27 = MOV.i32 r16
+ r8 = IADD.s32 fau.x, r8
* r28 = MOV.i32 r16
+ r9 = IADD.s32 fau.x, r9
* r29 = MOV.i32 r16
+ r10 = IADD.s32 fau.x, r10
* r30 = MOV.i32 r16
+ r11 = IADD.s32 fau.x, r11
* r31 = MOV.i32 r16
+ r12 = IADD.s32 fau.x, r12
* r32 = MOV.i32 fau.x
+ r13 = IADD.s32 fau.x, r13
* r33 = MOV.i32 t
+ r14 = IADD.s32 fau.x, r14
* NOP
+ r15 = IADD.s32 fau.x, r15
} -> block1
block1 {
id(0) nbb r_uncond pcrel(0)
* NOP
+ _.h00 = ICMP.s32.m1.ge r33, fau.x
* NOP
+ BRANCHZ.i16.eq t1.h00, fau.y -> block3
4000000000000008
} -> block3 block2 from block0 block4
block2 {
id(0) nbb no_prefetch pcrel(0)
* NOP
+ JUMP fau.y -> block5
4000000000000000
} -> block5 from block1
block3 {
} -> block4 from block1
block4 {
id(0) nbb
* r1 = IMUL.i32 r29, r1
+ NOP
* r8 = IMUL.i32 r20, r8
+ NOP
* r12 = IMUL.i32 r16, r12
+ NOP
* NOP
+ r1 = IADD.s32 r1, r29
id(0) nbb
* r29 = IMUL.i32 r1, r29
+ NOP
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* r17 = IMUL.i32 t1, r17
+ NOP
* _.h00 = IMUL.i32 r18, r14
+ r14 = IADD.s32 t, r18
* r18 = IMUL.i32 t1, r18
+ NOP
* _.h00 = IMUL.i32 r28, r32
+ r32 = IADD.s32 t, r28
* _.h00 = IMUL.i32 t1, r28
+ r28 = IADD.s32 t, t1
* r32 = IMUL.i32 t1, r32
+ NOP
id(0) nbb
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* _.h00 = IMUL.i32 t1, r27
+ r27 = IADD.s32 t, t1
* r7 = IMUL.i32 t1, r7
+ NOP
* _.h00 = IMUL.i32 r31, r3
+ r3 = IADD.s32 t, r31
* _.h00 = IMUL.i32 t1, r31
+ r31 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* r31 = IMUL.i32 t1, r31
+ NOP
* NOP
+ r18 = IADD.s32 r18, r14
id(0) nbb
* _.h00 = IMUL.i32 r18, r14
+ r14 = IADD.s32 t, r18
* r18 = IMUL.i32 t1, r18
+ NOP
* _.h00 = IMUL.i32 r30, r2
+ r2 = IADD.s32 t, r30
* _.h00 = IMUL.i32 t1, r30
+ r30 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r30
+ r30 = IADD.s32 t, t1
* r2 = IMUL.i32 t1, r2
+ NOP
* NOP
+ r7 = IADD.s32 r7, r27
id(0) nbb
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* r7 = IMUL.i32 t1, r7
+ NOP
* _.h00 = IMUL.i32 r21, r9
+ r9 = IADD.s32 t, r21
* _.h00 = IMUL.i32 t1, r21
+ r21 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r9
+ r9 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r21
+ r21 = IADD.s32 t, t1
* r9 = IMUL.i32 t1, r9
+ NOP
* NOP
+ r18 = IADD.s32 r18, r14
id(0) nbb
* r14 = IMUL.i32 r18, r14
+ NOP
* _.h00 = IMUL.i32 r25, r5
+ r5 = IADD.s32 t, r25
* _.h00 = IMUL.i32 t1, r25
+ r25 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r25
+ r25 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* r25 = IMUL.i32 t1, r25
+ NOP
* NOP
+ r9 = IADD.s32 r9, r21
id(0) nbb
* r21 = IMUL.i32 r9, r21
+ NOP
* _.h00 = IMUL.i32 r23, r11
+ r11 = IADD.s32 t, r23
* _.h00 = IMUL.i32 t1, r23
+ r23 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r23
+ r23 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* r23 = IMUL.i32 t1, r23
+ NOP
* NOP
+ r25 = IADD.s32 r25, r5
id(0) nbb
* r5 = IMUL.i32 r25, r5
+ NOP
* _.h00 = IMUL.i32 r26, r6
+ r6 = IADD.s32 t, r26
* _.h00 = IMUL.i32 t1, r26
+ r26 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r26
+ r26 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r6
+ r6 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r26
+ r26 = IADD.s32 t, t1
* r6 = IMUL.i32 t1, r6
+ NOP
id(0) nbb
* _.h00 = IMUL.i32 r19, r15
+ r15 = IADD.s32 t, r19
* _.h00 = IMUL.i32 t1, r19
+ r19 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r19
+ r19 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r19
+ r19 = IADD.s32 t, t1
* r15 = IMUL.i32 t1, r15
+ NOP
* NOP
+ r8 = IADD.s32 r8, r20
id(0) nbb
* _.h00 = IMUL.i32 r8, r20
+ r20 = IADD.s32 t, r8
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r20
+ r20 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r20
+ r20 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* r20 = IMUL.i32 t1, r20
+ NOP
* NOP
+ r12 = IADD.s32 r12, r16
id(0) nbb
* _.h00 = IMUL.i32 r12, r16
+ r16 = IADD.s32 t, r12
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r16
+ r16 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r16
+ r16 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r12
+ r12 = IADD.s32 t, t1
* r16 = IMUL.i32 t1, r16
+ NOP
* NOP
+ r29 = IADD.s32 r29, r1
id(0) nbb
* _.h00 = IMUL.i32 r29, r1
+ r1 = IADD.s32 t, r29
* _.h00 = IMUL.i32 t1, r29
+ r29 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r29
+ r29 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r29
+ r29 = IADD.s32 t, t1
* r1 = IMUL.i32 t1, r1
+ NOP
* NOP
+ r32 = IADD.s32 r32, r28
id(0) nbb
* _.h00 = IMUL.i32 r32, r28
+ r28 = IADD.s32 t, r32
* _.h00 = IMUL.i32 t1, r32
+ r32 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r28
+ r28 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r32
+ r32 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r28
+ r28 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r32
+ r32 = IADD.s32 t, t1
* r28 = IMUL.i32 t1, r28
+ NOP
* NOP
+ r31 = IADD.s32 r31, r3
id(0) nbb
* _.h00 = IMUL.i32 r31, r3
+ r3 = IADD.s32 t, r31
* _.h00 = IMUL.i32 t1, r31
+ r31 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r31
+ r31 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r3
+ r3 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r31
+ r31 = IADD.s32 t, t1
* r3 = IMUL.i32 t1, r3
+ NOP
* NOP
+ r2 = IADD.s32 r2, r30
id(0) nbb
* _.h00 = IMUL.i32 r2, r30
+ r30 = IADD.s32 t, r2
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r30
+ r30 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r30
+ r30 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r2
+ r2 = IADD.s32 t, t1
* r30 = IMUL.i32 t1, r30
+ NOP
* NOP
+ r14 = IADD.s32 r14, r18
id(0) nbb
* _.h00 = IMUL.i32 r14, r18
+ r18 = IADD.s32 t, r14
* _.h00 = IMUL.i32 t1, r14
+ r14 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r18
+ r18 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r14
+ r14 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r18
+ r18 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r14
+ r14 = IADD.s32 t, t1
* r18 = IMUL.i32 t1, r18
+ NOP
* NOP
+ r21 = IADD.s32 r21, r9
id(0) nbb
* _.h00 = IMUL.i32 r21, r9
+ r9 = IADD.s32 t, r21
* _.h00 = IMUL.i32 t1, r21
+ r21 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r9
+ r9 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r21
+ r21 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r9
+ r9 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r21
+ r21 = IADD.s32 t, t1
* r9 = IMUL.i32 t1, r9
+ NOP
* NOP
+ r23 = IADD.s32 r23, r11
id(0) nbb
* _.h00 = IMUL.i32 r23, r11
+ r11 = IADD.s32 t, r23
* _.h00 = IMUL.i32 t1, r23
+ r23 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r23
+ r23 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r11
+ r11 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r23
+ r23 = IADD.s32 t, t1
* r11 = IMUL.i32 t1, r11
+ NOP
* NOP
+ r5 = IADD.s32 r5, r25
id(0) nbb
* _.h00 = IMUL.i32 r5, r25
+ r25 = IADD.s32 t, r5
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r25
+ r25 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r25
+ r25 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r5
+ r5 = IADD.s32 t, t1
* r25 = IMUL.i32 t1, r25
+ NOP
* NOP
+ r15 = IADD.s32 r15, r19
id(0) nbb
* _.h00 = IMUL.i32 r15, r19
+ r19 = IADD.s32 t, r15
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r19
+ r19 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r19
+ r19 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r15
+ r15 = IADD.s32 t, t1
* r19 = IMUL.i32 t1, r19
+ NOP
* NOP
+ r20 = IADD.s32 r20, r8
id(0) nbb
* _.h00 = IMUL.i32 r20, r8
+ r8 = IADD.s32 t, r20
* _.h00 = IMUL.i32 t1, r20
+ r20 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r20
+ r20 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r8
+ r8 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r20
+ r20 = IADD.s32 t, t1
* r8 = IMUL.i32 t1, r8
+ NOP
* NOP
+ r1 = IADD.s32 r1, r29
id(0) nbb
* _.h00 = IMUL.i32 r1, r29
+ r29 = IADD.s32 t, r1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r29
+ r29 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r29
+ r29 = IADD.s32 t, t1
* _.h00 = IMUL.i32 t1, r1
+ r1 = IADD.s32 t, t1
* r29 = IMUL.i32 t1, r29
+ NOP
* NOP
+ r9 = IADD.s32 r9, r21
id(0) nbb
* _.h00 = IMUL.i32 r9, r21
+ r21 = IADD.s32 t, r9
* _.h00 = IMUL.i32 t1, r9
+ r9 = IADD.s32 t, t1
* r21 = IMUL.i32 t1, r21
+ NOP
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* NOP
+ r17 = IADD.s32 r17, r13
id(0) nbb
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* NOP
+ r7 = IADD.s32 r7, r27
id(0) nbb
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* NOP
+ r6 = IADD.s32 r6, r26
id(0) nbb
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r6, r26
+ r26 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* NOP
+ r16 = IADD.s32 r16, r12
id(0) nbb
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r26, r6
+ r6 = IADD.s32 t, r26
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r16, r12
+ r12 = IADD.s32 t, r16
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* NOP
+ r28 = IADD.s32 r28, r32
id(0) nbb
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r6, r26
+ r26 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r12, r16
+ r16 = IADD.s32 t, r12
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r28, r32
+ r32 = IADD.s32 t, r28
* NOP
+ r3 = IADD.s32 r3, r31
id(0) nbb
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r26, r6
+ r6 = IADD.s32 t, r26
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r16, r12
+ r12 = IADD.s32 t, r16
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r32, r28
+ r28 = IADD.s32 t, r32
* NOP
+ r30 = IADD.s32 r30, r2
id(0) nbb
* _.h00 = IMUL.i32 r3, r31
+ r31 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r6, r26
+ r26 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r12, r16
+ r16 = IADD.s32 t, r12
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* NOP
+ r18 = IADD.s32 r18, r14
id(0) nbb
* _.h00 = IMUL.i32 r28, r32
+ r32 = IADD.s32 t, r28
* _.h00 = IMUL.i32 r30, r2
+ r2 = IADD.s32 t, r30
* _.h00 = IMUL.i32 r31, r3
+ r3 = IADD.s32 t, r31
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r26, r6
+ r6 = IADD.s32 t, r26
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* NOP
+ r11 = IADD.s32 r11, r23
id(0) nbb
* _.h00 = IMUL.i32 r16, r12
+ r12 = IADD.s32 t, r16
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r18, r14
+ r14 = IADD.s32 t, r18
* _.h00 = IMUL.i32 r32, r28
+ r28 = IADD.s32 t, r32
* _.h00 = IMUL.i32 r2, r30
+ r30 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r31
+ r31 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* NOP
+ r25 = IADD.s32 r25, r5
id(0) nbb
* _.h00 = IMUL.i32 r6, r26
+ r26 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r11, r23
+ r23 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r12, r16
+ r16 = IADD.s32 t, r12
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r14, r18
+ r18 = IADD.s32 t, r14
* NOP
+ r19 = IADD.s32 r19, r15
id(0) nbb
* _.h00 = IMUL.i32 r28, r32
+ r32 = IADD.s32 t, r28
* _.h00 = IMUL.i32 r30, r2
+ r2 = IADD.s32 t, r30
* _.h00 = IMUL.i32 r31, r3
+ r3 = IADD.s32 t, r31
* _.h00 = IMUL.i32 r24, r4
+ r4 = IADD.s32 t, r24
* _.h00 = IMUL.i32 r25, r5
+ r5 = IADD.s32 t, r25
* _.h00 = IMUL.i32 r26, r6
+ r6 = IADD.s32 t, r26
* _.h00 = IMUL.i32 r27, r7
+ r7 = IADD.s32 t, r27
* NOP
+ r8 = IADD.s32 r8, r20
id(0) nbb
* _.h00 = IMUL.i32 r22, r10
+ r10 = IADD.s32 t, r22
* _.h00 = IMUL.i32 r23, r11
+ r11 = IADD.s32 t, r23
* _.h00 = IMUL.i32 r16, r12
+ r12 = IADD.s32 t, r16
* _.h00 = IMUL.i32 r17, r13
+ r13 = IADD.s32 t, r17
* _.h00 = IMUL.i32 r18, r14
+ r14 = IADD.s32 t, r18
* _.h00 = IMUL.i32 r19, r15
+ r15 = IADD.s32 t, r19
* _.h00 = IMUL.i32 r32, r28
+ r28 = IADD.s32 t, r32
* NOP
+ r29 = IADD.s32 r29, r1
id(0) nbb
* _.h00 = IMUL.i32 r2, r30
+ r30 = IADD.s32 t, r2
* _.h00 = IMUL.i32 r3, r31
+ r31 = IADD.s32 t, r3
* _.h00 = IMUL.i32 r4, r24
+ r24 = IADD.s32 t, r4
* _.h00 = IMUL.i32 r5, r25
+ r25 = IADD.s32 t, r5
* _.h00 = IMUL.i32 r6, r26
+ r26 = IADD.s32 t, r6
* _.h00 = IMUL.i32 r7, r27
+ r27 = IADD.s32 t, r7
* _.h00 = IMUL.i32 r8, r20
+ r20 = IADD.s32 t, r8
* NOP
+ r21 = IADD.s32 r21, r9
id(0) nbb r_uncond no_prefetch pcrel(1)
* _.h00 = IMUL.i32 r10, r22
+ r22 = IADD.s32 t, r10
* _.h00 = IMUL.i32 r11, r23
+ r23 = IADD.s32 t, r11
* _.h00 = IMUL.i32 r12, r16
+ r16 = IADD.s32 t, r12
* _.h00 = IMUL.i32 r13, r17
+ r17 = IADD.s32 t, r13
* _.h00 = IMUL.i32 r14, r18
+ r18 = IADD.s32 t, r14
* _.h00 = IMUL.i32 r15, r19
+ r19 = IADD.s32 t, r15
* NOP
+ r33 = IADD.s32 r33, fau.x
* NOP
+ JUMP fau.y -> block1
0 4000000000000001
} -> block1 from block3
block5 {
id(0) nbb
* NOP
+ r1 = IADD.s32 r28, r20
* NOP
+ r2 = IADD.s32 r29, r21
* NOP
+ r3 = IADD.s32 r30, r22
id(0) nbb
* NOP
+ r4 = IADD.s32 r31, r23
* NOP
+ r5 = IADD.s32 r24, r16
* NOP
+ r6 = IADD.s32 r25, r17
* NOP
+ r7 = IADD.s32 r26, r18
* NOP
+ r8 = IADD.s32 r27, r19
* NOP
+ r1 = IADD.s32 r1, r5
* NOP
+ r2 = IADD.s32 r2, r6
* NOP
+ r3 = IADD.s32 r3, r7
id(0) wait(0 ) nbb r_uncond
* NOP
+ r4 = IADD.s32 r4, r8
* NOP
+ r1 = IADD.s32 r1, r3
* NOP
+ _.h00 = IADD.s32 r2, r4
* _.h00 = LSHIFT_OR.i32 r0, t, fau.y.b0
+ r1 = IADD.s32 r1, t1
* NOP
+ r0 = IADD.s32 fau.x, t0
* NOP
+ _.h00 = ICMP.u32.i1.lt t1, fau.x
* NOP
+ _.h00 = IADD.s32 t1, fau.y
* NOP
+ STORE.i32 r1, r0, t1, byte_offset:0
200000000
} from block2
slot 0 reads: r1
clause_0:
ds(0) nbb ncph
{
*LSHIFT_OR.i32 t0, r61, #0, 0x00000008 /* 0.000000 */
+IADD.s32 t1, r60, t
*MKVEC.v2i16 t0, #0, r62
+IADD.s32 r0:t1, t1, t
*MOV.i32 r1:t0, 0x00000001 /* 0.000000 */
+MOV.i32 r2:t1, 0x00000002 /* 0.000000 */
*MOV.i32 r3:t0, 0x00000003 /* 0.000000 */
+MOV.i32 r4:t1, 0x00000004 /* 0.000000 */
*MOV.i32 r5:t0, 0x00000005 /* 0.000000 */
+MOV.i32 r6:t1, 0x00000006 /* 0.000000 */
*NOP t0
+MOV.i32 r7:t1, 0x00000007 /* 0.000000 */
}
clause_7:
ds(0) nbb ncph
{
*MOV.i32 r8:t0, 0x00000008 /* 0.000000 */
+MOV.i32 r9:t1, 0x00000009 /* 0.000000 */
*MOV.i32 r10:t0, 0x0000000a /* 0.000000 */
+MOV.i32 r11:t1, 0x0000000b /* 0.000000 */
*MOV.i32 r12:t0, 0x0000000c /* 0.000000 */
+MOV.i32 r13:t1, 0x0000000d /* 0.000000 */
*MOV.i32 r14:t0, 0x0000000e /* 0.000000 */
+U32_TO_F32 t1, r0
*FMA.f32 t0, t1, 0x2edbe6ff /* 0.000000 */, #0.neg
+F32_TO_S32.rtz r16:t1, t
*MOV.i32 r15:t0, 0x0000000f /* 0.000000 */
+MOV.i32 r17:t1, t1
*NOP t0
+MOV.i32 r18:t1, r16
}
clause_15:
ds(0) nbb ncph
{
*MOV.i32 r19:t0, r16
+MOV.i32 r20:t1, r16
*MOV.i32 r21:t0, r16
+IADD.s32 r1:t1, u1.w0, r1
*MOV.i32 r22:t0, r16
+IADD.s32 r2:t1, u1.w0, r2
*MOV.i32 r23:t0, r16
+IADD.s32 r3:t1, u1.w0, r3
*MOV.i32 r24:t0, r16
+IADD.s32 r4:t1, u1.w0, r4
*MOV.i32 r25:t0, r16
+IADD.s32 r5:t1, u1.w0, r5
*MOV.i32 r26:t0, r16
+IADD.s32 r6:t1, u1.w0, r6
*NOP t0
+IADD.s32 r7:t1, u1.w0, r7
}
clause_21:
ds(0) nbb r_uncond ncph
{
*MOV.i32 r27:t0, r16
+IADD.s32 r8:t1, u1.w0, r8
*MOV.i32 r28:t0, r16
+IADD.s32 r9:t1, u1.w0, r9
*MOV.i32 r29:t0, r16
+IADD.s32 r10:t1, u1.w0, r10
*MOV.i32 r30:t0, r16
+IADD.s32 r11:t1, u1.w0, r11
*MOV.i32 r31:t0, r16
+IADD.s32 r12:t1, u1.w0, r12
*MOV.i32 r32:t0, u1.w0
+IADD.s32 r13:t1, u1.w0, r13
*MOV.i32 r33:t0, #0
+IADD.s32 r14:t1, u1.w0, r14
*NOP t0
+IADD.s32 r15:t1, u1.w0, r15
}
clause_27:
ds(0) nbb r_uncond ncph
{
*NOP t0
+ICMP.s32.m1.ge t1, r33, 0x00000008 /* 0.000000 */
*NOP t0
+BRANCHZ.i16.eq t1, t1.h0, clause_32
}
clause_30:
ds(0) nbb
{
*NOP t0
+JUMP t1, clause_252
}
clause_32:
ds(0) nbb ncph
{
*IMUL.i32 r1:t0, r29, r1
+NOP t1
*IMUL.i32 r8:t0, r20, r8
+NOP t1
*IMUL.i32 r12:t0, r16, r12
+NOP t1
*NOP t0
+IADD.s32 r1:t1, r1, r29
}
clause_35:
ds(0) nbb ncph
{
*IMUL.i32 r29:t0, r1, r29
+NOP t1
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 r17:t0, t1, r17
+NOP t1
*IMUL.i32 t0, r18, r14
+IADD.s32 r14:t1, t, r18
*IMUL.i32 r18:t0, t1, r18
+NOP t1
*IMUL.i32 t0, r28, r32
+IADD.s32 r32:t1, t, r28
*IMUL.i32 t0, t1, r28
+IADD.s32 r28:t1, t, t1
*IMUL.i32 r32:t0, t1, r32
+NOP t1
}
clause_41:
ds(0) nbb ncph
{
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*IMUL.i32 t0, t1, r27
+IADD.s32 r27:t1, t, t1
*IMUL.i32 r7:t0, t1, r7
+NOP t1
*IMUL.i32 t0, r31, r3
+IADD.s32 r3:t1, t, r31
*IMUL.i32 t0, t1, r31
+IADD.s32 r31:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 r31:t0, t1, r31
+NOP t1
*NOP t0
+IADD.s32 r18:t1, r18, r14
}
clause_47:
ds(0) nbb ncph
{
*IMUL.i32 t0, r18, r14
+IADD.s32 r14:t1, t, r18
*IMUL.i32 r18:t0, t1, r18
+NOP t1
*IMUL.i32 t0, r30, r2
+IADD.s32 r2:t1, t, r30
*IMUL.i32 t0, t1, r30
+IADD.s32 r30:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r30
+IADD.s32 r30:t1, t, t1
*IMUL.i32 r2:t0, t1, r2
+NOP t1
*NOP t0
+IADD.s32 r7:t1, r7, r27
}
clause_53:
ds(0) nbb ncph
{
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 r7:t0, t1, r7
+NOP t1
*IMUL.i32 t0, r21, r9
+IADD.s32 r9:t1, t, r21
*IMUL.i32 t0, t1, r21
+IADD.s32 r21:t1, t, t1
*IMUL.i32 t0, t1, r9
+IADD.s32 r9:t1, t, t1
*IMUL.i32 t0, t1, r21
+IADD.s32 r21:t1, t, t1
*IMUL.i32 r9:t0, t1, r9
+NOP t1
*NOP t0
+IADD.s32 r18:t1, r18, r14
}
clause_59:
ds(0) nbb ncph
{
*IMUL.i32 r14:t0, r18, r14
+NOP t1
*IMUL.i32 t0, r25, r5
+IADD.s32 r5:t1, t, r25
*IMUL.i32 t0, t1, r25
+IADD.s32 r25:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r25
+IADD.s32 r25:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 r25:t0, t1, r25
+NOP t1
*NOP t0
+IADD.s32 r9:t1, r9, r21
}
clause_65:
ds(0) nbb ncph
{
*IMUL.i32 r21:t0, r9, r21
+NOP t1
*IMUL.i32 t0, r23, r11
+IADD.s32 r11:t1, t, r23
*IMUL.i32 t0, t1, r23
+IADD.s32 r23:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 t0, t1, r23
+IADD.s32 r23:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 r23:t0, t1, r23
+NOP t1
*NOP t0
+IADD.s32 r25:t1, r25, r5
}
clause_71:
ds(0) nbb ncph
{
*IMUL.i32 r5:t0, r25, r5
+NOP t1
*IMUL.i32 t0, r26, r6
+IADD.s32 r6:t1, t, r26
*IMUL.i32 t0, t1, r26
+IADD.s32 r26:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r26
+IADD.s32 r26:t1, t, t1
*IMUL.i32 t0, t1, r6
+IADD.s32 r6:t1, t, t1
*IMUL.i32 t0, t1, r26
+IADD.s32 r26:t1, t, t1
*IMUL.i32 r6:t0, t1, r6
+NOP t1
}
clause_77:
ds(0) nbb ncph
{
*IMUL.i32 t0, r19, r15
+IADD.s32 r15:t1, t, r19
*IMUL.i32 t0, t1, r19
+IADD.s32 r19:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r19
+IADD.s32 r19:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r19
+IADD.s32 r19:t1, t, t1
*IMUL.i32 r15:t0, t1, r15
+NOP t1
*NOP t0
+IADD.s32 r8:t1, r8, r20
}
clause_83:
ds(0) nbb ncph
{
*IMUL.i32 t0, r8, r20
+IADD.s32 r20:t1, t, r8
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r20
+IADD.s32 r20:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r20
+IADD.s32 r20:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 r20:t0, t1, r20
+NOP t1
*NOP t0
+IADD.s32 r12:t1, r12, r16
}
clause_89:
ds(0) nbb ncph
{
*IMUL.i32 t0, r12, r16
+IADD.s32 r16:t1, t, r12
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r16
+IADD.s32 r16:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 t0, t1, r16
+IADD.s32 r16:t1, t, t1
*IMUL.i32 t0, t1, r12
+IADD.s32 r12:t1, t, t1
*IMUL.i32 r16:t0, t1, r16
+NOP t1
*NOP t0
+IADD.s32 r29:t1, r29, r1
}
clause_95:
ds(0) nbb ncph
{
*IMUL.i32 t0, r29, r1
+IADD.s32 r1:t1, t, r29
*IMUL.i32 t0, t1, r29
+IADD.s32 r29:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r29
+IADD.s32 r29:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r29
+IADD.s32 r29:t1, t, t1
*IMUL.i32 r1:t0, t1, r1
+NOP t1
*NOP t0
+IADD.s32 r32:t1, r32, r28
}
clause_101:
ds(0) nbb ncph
{
*IMUL.i32 t0, r32, r28
+IADD.s32 r28:t1, t, r32
*IMUL.i32 t0, t1, r32
+IADD.s32 r32:t1, t, t1
*IMUL.i32 t0, t1, r28
+IADD.s32 r28:t1, t, t1
*IMUL.i32 t0, t1, r32
+IADD.s32 r32:t1, t, t1
*IMUL.i32 t0, t1, r28
+IADD.s32 r28:t1, t, t1
*IMUL.i32 t0, t1, r32
+IADD.s32 r32:t1, t, t1
*IMUL.i32 r28:t0, t1, r28
+NOP t1
*NOP t0
+IADD.s32 r31:t1, r31, r3
}
clause_107:
ds(0) nbb ncph
{
*IMUL.i32 t0, r31, r3
+IADD.s32 r3:t1, t, r31
*IMUL.i32 t0, t1, r31
+IADD.s32 r31:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r31
+IADD.s32 r31:t1, t, t1
*IMUL.i32 t0, t1, r3
+IADD.s32 r3:t1, t, t1
*IMUL.i32 t0, t1, r31
+IADD.s32 r31:t1, t, t1
*IMUL.i32 r3:t0, t1, r3
+NOP t1
*NOP t0
+IADD.s32 r2:t1, r2, r30
}
clause_113:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r30
+IADD.s32 r30:t1, t, r2
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r30
+IADD.s32 r30:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 t0, t1, r30
+IADD.s32 r30:t1, t, t1
*IMUL.i32 t0, t1, r2
+IADD.s32 r2:t1, t, t1
*IMUL.i32 r30:t0, t1, r30
+NOP t1
*NOP t0
+IADD.s32 r14:t1, r14, r18
}
clause_119:
ds(0) nbb ncph
{
*IMUL.i32 t0, r14, r18
+IADD.s32 r18:t1, t, r14
*IMUL.i32 t0, t1, r14
+IADD.s32 r14:t1, t, t1
*IMUL.i32 t0, t1, r18
+IADD.s32 r18:t1, t, t1
*IMUL.i32 t0, t1, r14
+IADD.s32 r14:t1, t, t1
*IMUL.i32 t0, t1, r18
+IADD.s32 r18:t1, t, t1
*IMUL.i32 t0, t1, r14
+IADD.s32 r14:t1, t, t1
*IMUL.i32 r18:t0, t1, r18
+NOP t1
*NOP t0
+IADD.s32 r21:t1, r21, r9
}
clause_125:
ds(0) nbb ncph
{
*IMUL.i32 t0, r21, r9
+IADD.s32 r9:t1, t, r21
*IMUL.i32 t0, t1, r21
+IADD.s32 r21:t1, t, t1
*IMUL.i32 t0, t1, r9
+IADD.s32 r9:t1, t, t1
*IMUL.i32 t0, t1, r21
+IADD.s32 r21:t1, t, t1
*IMUL.i32 t0, t1, r9
+IADD.s32 r9:t1, t, t1
*IMUL.i32 t0, t1, r21
+IADD.s32 r21:t1, t, t1
*IMUL.i32 r9:t0, t1, r9
+NOP t1
*NOP t0
+IADD.s32 r23:t1, r23, r11
}
clause_131:
ds(0) nbb ncph
{
*IMUL.i32 t0, r23, r11
+IADD.s32 r11:t1, t, r23
*IMUL.i32 t0, t1, r23
+IADD.s32 r23:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 t0, t1, r23
+IADD.s32 r23:t1, t, t1
*IMUL.i32 t0, t1, r11
+IADD.s32 r11:t1, t, t1
*IMUL.i32 t0, t1, r23
+IADD.s32 r23:t1, t, t1
*IMUL.i32 r11:t0, t1, r11
+NOP t1
*NOP t0
+IADD.s32 r5:t1, r5, r25
}
clause_137:
ds(0) nbb ncph
{
*IMUL.i32 t0, r5, r25
+IADD.s32 r25:t1, t, r5
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r25
+IADD.s32 r25:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 t0, t1, r25
+IADD.s32 r25:t1, t, t1
*IMUL.i32 t0, t1, r5
+IADD.s32 r5:t1, t, t1
*IMUL.i32 r25:t0, t1, r25
+NOP t1
*NOP t0
+IADD.s32 r15:t1, r15, r19
}
clause_143:
ds(0) nbb ncph
{
*IMUL.i32 t0, r15, r19
+IADD.s32 r19:t1, t, r15
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r19
+IADD.s32 r19:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 t0, t1, r19
+IADD.s32 r19:t1, t, t1
*IMUL.i32 t0, t1, r15
+IADD.s32 r15:t1, t, t1
*IMUL.i32 r19:t0, t1, r19
+NOP t1
*NOP t0
+IADD.s32 r20:t1, r20, r8
}
clause_149:
ds(0) nbb ncph
{
*IMUL.i32 t0, r20, r8
+IADD.s32 r8:t1, t, r20
*IMUL.i32 t0, t1, r20
+IADD.s32 r20:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r20
+IADD.s32 r20:t1, t, t1
*IMUL.i32 t0, t1, r8
+IADD.s32 r8:t1, t, t1
*IMUL.i32 t0, t1, r20
+IADD.s32 r20:t1, t, t1
*IMUL.i32 r8:t0, t1, r8
+NOP t1
*NOP t0
+IADD.s32 r1:t1, r1, r29
}
clause_155:
ds(0) nbb ncph
{
*IMUL.i32 t0, r1, r29
+IADD.s32 r29:t1, t, r1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r29
+IADD.s32 r29:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 t0, t1, r29
+IADD.s32 r29:t1, t, t1
*IMUL.i32 t0, t1, r1
+IADD.s32 r1:t1, t, t1
*IMUL.i32 r29:t0, t1, r29
+NOP t1
*NOP t0
+IADD.s32 r9:t1, r9, r21
}
clause_161:
ds(0) nbb ncph
{
*IMUL.i32 t0, r9, r21
+IADD.s32 r21:t1, t, r9
*IMUL.i32 t0, t1, r9
+IADD.s32 r9:t1, t, t1
*IMUL.i32 r21:t0, t1, r21
+NOP t1
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*NOP t0
+IADD.s32 r17:t1, r17, r13
}
clause_167:
ds(0) nbb ncph
{
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*NOP t0
+IADD.s32 r7:t1, r7, r27
}
clause_173:
ds(0) nbb ncph
{
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*NOP t0
+IADD.s32 r6:t1, r6, r26
}
clause_179:
ds(0) nbb ncph
{
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r6, r26
+IADD.s32 r26:t1, t, r6
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*NOP t0
+IADD.s32 r16:t1, r16, r12
}
clause_185:
ds(0) nbb ncph
{
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r26, r6
+IADD.s32 r6:t1, t, r26
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r16, r12
+IADD.s32 r12:t1, t, r16
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*NOP t0
+IADD.s32 r28:t1, r28, r32
}
clause_191:
ds(0) nbb ncph
{
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r6, r26
+IADD.s32 r26:t1, t, r6
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r12, r16
+IADD.s32 r16:t1, t, r12
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r28, r32
+IADD.s32 r32:t1, t, r28
*NOP t0
+IADD.s32 r3:t1, r3, r31
}
clause_197:
ds(0) nbb ncph
{
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r26, r6
+IADD.s32 r6:t1, t, r26
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r16, r12
+IADD.s32 r12:t1, t, r16
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r32, r28
+IADD.s32 r28:t1, t, r32
*NOP t0
+IADD.s32 r30:t1, r30, r2
}
clause_203:
ds(0) nbb ncph
{
*IMUL.i32 t0, r3, r31
+IADD.s32 r31:t1, t, r3
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r6, r26
+IADD.s32 r26:t1, t, r6
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r12, r16
+IADD.s32 r16:t1, t, r12
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*NOP t0
+IADD.s32 r18:t1, r18, r14
}
clause_209:
ds(0) nbb ncph
{
*IMUL.i32 t0, r28, r32
+IADD.s32 r32:t1, t, r28
*IMUL.i32 t0, r30, r2
+IADD.s32 r2:t1, t, r30
*IMUL.i32 t0, r31, r3
+IADD.s32 r3:t1, t, r31
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r26, r6
+IADD.s32 r6:t1, t, r26
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*NOP t0
+IADD.s32 r11:t1, r11, r23
}
clause_215:
ds(0) nbb ncph
{
*IMUL.i32 t0, r16, r12
+IADD.s32 r12:t1, t, r16
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r18, r14
+IADD.s32 r14:t1, t, r18
*IMUL.i32 t0, r32, r28
+IADD.s32 r28:t1, t, r32
*IMUL.i32 t0, r2, r30
+IADD.s32 r30:t1, t, r2
*IMUL.i32 t0, r3, r31
+IADD.s32 r31:t1, t, r3
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*NOP t0
+IADD.s32 r25:t1, r25, r5
}
clause_221:
ds(0) nbb ncph
{
*IMUL.i32 t0, r6, r26
+IADD.s32 r26:t1, t, r6
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r11, r23
+IADD.s32 r23:t1, t, r11
*IMUL.i32 t0, r12, r16
+IADD.s32 r16:t1, t, r12
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r14, r18
+IADD.s32 r18:t1, t, r14
*NOP t0
+IADD.s32 r19:t1, r19, r15
}
clause_227:
ds(0) nbb ncph
{
*IMUL.i32 t0, r28, r32
+IADD.s32 r32:t1, t, r28
*IMUL.i32 t0, r30, r2
+IADD.s32 r2:t1, t, r30
*IMUL.i32 t0, r31, r3
+IADD.s32 r3:t1, t, r31
*IMUL.i32 t0, r24, r4
+IADD.s32 r4:t1, t, r24
*IMUL.i32 t0, r25, r5
+IADD.s32 r5:t1, t, r25
*IMUL.i32 t0, r26, r6
+IADD.s32 r6:t1, t, r26
*IMUL.i32 t0, r27, r7
+IADD.s32 r7:t1, t, r27
*NOP t0
+IADD.s32 r8:t1, r8, r20
}
clause_233:
ds(0) nbb ncph
{
*IMUL.i32 t0, r22, r10
+IADD.s32 r10:t1, t, r22
*IMUL.i32 t0, r23, r11
+IADD.s32 r11:t1, t, r23
*IMUL.i32 t0, r16, r12
+IADD.s32 r12:t1, t, r16
*IMUL.i32 t0, r17, r13
+IADD.s32 r13:t1, t, r17
*IMUL.i32 t0, r18, r14
+IADD.s32 r14:t1, t, r18
*IMUL.i32 t0, r19, r15
+IADD.s32 r15:t1, t, r19
*IMUL.i32 t0, r32, r28
+IADD.s32 r28:t1, t, r32
*NOP t0
+IADD.s32 r29:t1, r29, r1
}
clause_239:
ds(0) nbb ncph
{
*IMUL.i32 t0, r2, r30
+IADD.s32 r30:t1, t, r2
*IMUL.i32 t0, r3, r31
+IADD.s32 r31:t1, t, r3
*IMUL.i32 t0, r4, r24
+IADD.s32 r24:t1, t, r4
*IMUL.i32 t0, r5, r25
+IADD.s32 r25:t1, t, r5
*IMUL.i32 t0, r6, r26
+IADD.s32 r26:t1, t, r6
*IMUL.i32 t0, r7, r27
+IADD.s32 r27:t1, t, r7
*IMUL.i32 t0, r8, r20
+IADD.s32 r20:t1, t, r8
*NOP t0
+IADD.s32 r21:t1, r21, r9
}
clause_245:
ds(0) nbb r_uncond
{
*IMUL.i32 t0, r10, r22
+IADD.s32 r22:t1, t, r10
*IMUL.i32 t0, r11, r23
+IADD.s32 r23:t1, t, r11
*IMUL.i32 t0, r12, r16
+IADD.s32 r16:t1, t, r12
*IMUL.i32 t0, r13, r17
+IADD.s32 r17:t1, t, r13
*IMUL.i32 t0, r14, r18
+IADD.s32 r18:t1, t, r14
*IMUL.i32 t0, r15, r19
+IADD.s32 r19:t1, t, r15
*NOP t0
+IADD.s32 r33:t1, r33, 0x00000001 /* 0.000000 */
*NOP t0
+JUMP t1, clause_27
}
clause_252:
ds(0) nbb ncph
{
*NOP t0
+IADD.s32 r1:t1, r28, r20
*NOP t0
+IADD.s32 r2:t1, r29, r21
*NOP t0
+IADD.s32 r3:t1, r30, r22
}
clause_255:
ds(0) nbb ncph next_store dwb(0)
{
*NOP t0
+IADD.s32 r4:t1, r31, r23
*NOP t0
+IADD.s32 r5:t1, r24, r16
*NOP t0
+IADD.s32 r6:t1, r25, r17
*NOP t0
+IADD.s32 r7:t1, r26, r18
*NOP t0
+IADD.s32 r8:t1, r27, r19
*NOP t0
+IADD.s32 r1:t1, r1, r5
*NOP t0
+IADD.s32 r2:t1, r2, r6
*NOP t0
+IADD.s32 r3:t1, r3, r7
}
clause_261:
ds(0) eos store
{
*NOP t0
+IADD.s32 r4:t1, r4, r8
*NOP t0
+IADD.s32 r1:t1, r1, r3
*NOP t0
+IADD.s32 t1, r2, r4
*LSHIFT_OR.i32 t0, r0, #0, 0x00000002 /* 0.000000 */
+IADD.s32 r1:t1, r1, t1
*NOP t0
+IADD.s32 r0:t1, u0.w0, t0
*NOP t0
+ICMP.u32.gt t1, u0.w0, t1
*NOP t0
+IADD.s32 t1, t1, u0.w1
*NOP t0
+STORE.i32 t1, r0, t1, @r1
}
e20eea22 compute_sp_v16_int 16.765 GFLOPs 16.012ms
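
For reference, a throughput figure like the "16.765 GFLOPs 16.012ms" on the last line is normally just total multiply-add operations (counted as 2 ops each) divided by elapsed time. Below is a minimal sanity-check sketch of that arithmetic, not the tool's own accounting: it assumes a 256x256 global invocation count and the 128 iterations of MAD_16 per invocation seen in the compute_sp_v1 source above (the run labelled compute_sp_v16_int is presumably the integer variant of the same MAD loop, so the real counts may differ).

# Rough sanity check of the reported throughput (a sketch under the
# assumptions stated above, not the benchmark's own accounting).
invocations = 256 * 256          # assumed global work size
mads_per_invocation = 128 * 16   # loop trip count * MADs in MAD_16
ops = invocations * mads_per_invocation * 2   # 2 ops per multiply-add

elapsed_s = 16.012e-3            # reported runtime
gflops = ops / elapsed_s / 1e9
print(f"{gflops:.3f} GFLOPs")    # ~16.765, consistent with the last line

Under those assumptions the arithmetic reproduces the reported figure to within rounding, which suggests the counting convention above is the one the tool uses.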