Skip to content

Instantly share code, notes, and snippets.

@TheBadGod

TheBadGod/ml.c Secret

Created August 1, 2022 13:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TheBadGod/2ac4f082eb4086d1f27b6085bd5d2109 to your computer and use it in GitHub Desktop.
Save TheBadGod/2ac4f082eb4086d1f27b6085bd5d2109 to your computer and use it in GitHub Desktop.
// main_graph -- annotated decompiler listing of a compiled inference routine.
//
// This is pseudo-code, not compilable C: the local declarations are collapsed,
// two regions are marked as omitted, LODWORD / COERCE_UNSIGNED_INT / ALIGN are
// decompiler or author shorthands, and the final return statement is only a
// sketch of the returned aggregate.
//
// Stages visible in this listing, in order:
//   1. element-wise tanh over a 64 x 16 x 10 x 10 block (Tanh_16 -> Pad_17)
//   2. normalisation of a two-element reshape target (first_dim / second_dim)
//   3. dense layer: first_dim inputs -> 84 outputs per row, plus bias
//   4. custom op 1: cos(x) + sin(x)
//   5. custom op 2: x - tanh(x)
//   6. custom op 3: negative values scaled by 0.0099999998, others kept
//   7. dense layer: 84 inputs -> 10 outputs per row, plus bias
//
// Parameters a1, a2 and a3 are not referenced anywhere in the visible portion.
// None of the malloc results in the visible portion is checked for NULL or
// freed; unaligned_output is handed back to the caller in the return value.
void *main_graph(float **a1, __int64 a2, float *a3)
{
// [COLLAPSED LOCAL DECLARATIONS. PRESS KEYPAD CTRL-"+" TO EXPAND]
// PART IS OMITTED
// Tanh node
// 0x64010 bytes = 64 * 16 * 10 * 10 floats (0x64000 bytes) plus 16 spare bytes.
// NOTE(review): v296 is not referenced again in the visible listing; it is
// presumably the buffer that appears below as Pad_17 -- confirm.
v296 = ALIGN(malloc(0x64010uLL));
// The flat index 1600*i25 + 100*i26 + 10*i27 + i28 walks a 64 x 16 x 10 x 10 block.
for ( i25 = 0LL; i25 < 64; ++i25 )
{
for ( i26 = 0LL; i26 < 16; ++i26 )
{
for ( i27 = 0LL; i27 < 10; ++i27 )
{
// v53 is a copy of i28 taken inside the body, so the step is simply i28 + 1.
for ( i28 = 0LL; i28 < 10; i28 = v53 + 1 )
{
// x: raw 32-bit load of the input element into the low lane.
v46 = (__m128)LODWORD(Tanh_16[1600 * i25 + 100 * i26 + 10 * i27 + i28]);
// Clamp x to [-7.9988117, 7.9988117]; 0x40FFF644 / 0xC0FFF644 are the float
// bit patterns of the two bounds. Each step builds a compare mask and then
// selects with and / andnot / or: first min(x, +bound), then max(.., -bound).
v47 = _mm_cmpgt_ss((__m128)0x40FFF644u, v46);
v48 = _mm_or_ps(_mm_andnot_ps(v47, (__m128)0x40FFF644u), _mm_and_ps(v47, v46));
v49 = _mm_cmpgt_ss(v48, (__m128)0xC0FFF644);
v50 = _mm_or_ps(_mm_andnot_ps(v49, (__m128)0xC0FFF644), _mm_and_ps(v49, v48));
// v287 keeps the clamped x for later use.
v287 = v50;
// Unclamped x ANDed with xmmword_55007040.
// NOTE(review): presumably the sign-clearing mask, making v288 = |x| -- the
// constant's value is not shown in this listing.
v288 = _mm_and_si128((__m128i)v46, (__m128i)xmmword_55007040).m128i_u32[0];
// x2 = clamped * clamped, kept in v285[0] for the polynomial evaluation.
v50.m128_f32[0] = v50.m128_f32[0] * v50.m128_f32[0];
*(__m128 *)v285 = v50;
// Numerator polynomial p(x2), evaluated as nested fmaf (Horner form).
// The .m128_i32 targets on the first three steps look like a decompiler typing
// artefact: the stored value is read back as .m128_f32 by the following step.
v46.m128_i32[0] = fmaf(v50.m128_f32[0], -2.7607684e-16, 2.0001879e-13);
v46.m128_i32[0] = fmaf(v285[0], v46.m128_f32[0], -8.6046718e-11);
v46.m128_i32[0] = fmaf(v285[0], v46.m128_f32[0], 0.000000051222973);
v50.m128_f32[0] = fmaf(v285[0], v46.m128_f32[0], 0.000014857224);
v50.m128_f32[0] = fmaf(v285[0], v50.m128_f32[0], 0.00063726195);
v50.m128_f32[0] = fmaf(v285[0], v50.m128_f32[0], 0.0048935246);
// numerator = clamped x * p(x2), kept in v286.
v51 = v287;
v51.m128_f32[0] = v287.m128_f32[0] * v50.m128_f32[0];
v286 = v51;
// Denominator polynomial q(x2), same nested-fmaf scheme.
v50.m128_f32[0] = fmaf(v285[0], 0.0000011982584, 0.00011853471);
v50.m128_f32[0] = fmaf(v285[0], v50.m128_f32[0], 0.0022684347);
v50.m128_f32[0] = fmaf(v285[0], v50.m128_f32[0], 0.004893525);
v52 = v286;
v53 = i28;
// ratio = numerator / denominator.
v52.m128_f32[0] = v286.m128_f32[0] / v50.m128_f32[0];
// 970045207 == 0x39D1B717, the bit pattern of 0.0004f; v54 is the mask for
// v288 < 0.0004.
v51.m128_i32[0] = 970045207;
v54 = _mm_cmplt_ss((__m128)v288, v51);
// Bitwise select: where the mask is set store the clamped x unchanged,
// otherwise store the ratio.
LODWORD(Pad_17[1600 * i25 + 100 * i26 + 10 * i27 + i28]) = _mm_andnot_ps(v54, v52).m128_u32[0] | v54.m128_i32[0] & v287.m128_i32[0];
}
}
}
}
// PART IS OMITTED
// Reshape target: two requested dimensions taken from constant_8[0] and
// constant_9[0] (annotated by the author as 0x40 and -1).
float *v208 = ALIGN(malloc(0x20uLL));
v208[0] = constant_8[0]; // 0x40
v208[1] = constant_9[0]; // -1
dim0 = v208[0];
dim1 = v208[1];
if ( !*v208 ) // if(dim0 == 0) dim0 = 64;
dim0 = 64LL;
if ( dim0 == -1 ) // if(dim0 == -1) dim0 = 1;
dim0 = 1LL;
if ( !dim1 ) // if(dim1 == 0) dim1 = 120;
dim1 = 120LL;
if ( dim1 == -1 ) // if(dim1 == -1) dim1 = 1; // we're here
dim1 = 1LL;
// Product of the two normalised dims; 0x1E00 == 7680 == 120 * 64 is divided by it.
num_elems = dim1 * dim0;
second_dim = dim0; // just set to the constant 64
remaining = 0x1E00 / (__int128)num_elems; // (120*64) / num_elements
first_dim = dim1;
// NOTE(review): as written, dim0 and dim1 were already rewritten from -1 to 1
// above, so these two tests cannot be true in this listing. The author's
// annotation (first_dim becomes 120) implies the binary tests the original,
// un-normalised values; the renaming appears to have merged distinct
// variables -- confirm against the disassembly.
if ( dim0 == -1 )
second_dim = remaining;
if ( dim1 == -1 )
first_dim = remaining; // is now 120 (as 120*64//64 == 120)
// first dim = 120, second dim = 64
// the operation right before the first custom operation
// => generates the input for custom operation in customop_1_29
// => gets input from the node before (which is the reshape node)
// Dense layer: for every row i57 and each of 84 outputs i58,
//   customop_1_29[i57][i58] = sum over i59 of
//       Reshape_22[i57][i59] * constant_10[i58][i59]  +  constant_11[i58]
// 336 bytes per row = 84 floats. The row stride of constant_10 is hard-coded
// to 120 while the loop bound is first_dim.
float *customop_1_29 = ALIGN(malloc(336 * second_dim + 128));
// Scalar accumulator; it is always accessed through tmp_float_ptr.
float tmp_float;
float *tmp_float_ptr = &tmp_float;
for ( i57 = 0LL; i57 < second_dim; ++i57 )
{
for ( i58 = 0LL; i58 < 84; ++i58 )
{
*tmp_float_ptr = 0.0;
for ( i59 = 0LL; i59 < first_dim; ++i59 )
*tmp_float_ptr = Reshape_22[i59 + first_dim * i57] * constant_10[120 * i58 + i59] + *tmp_float_ptr;
customop_1_29[84 * i57 + i58] = *tmp_float_ptr + constant_11[i58];
}
}
// begin of first custom operation
// Custom op 1: out = cos(x) + sin(x), computed as three element-wise passes
// over second_dim x 84 buffers (cos table, sin table, then their sum).
float *cos_table = ALIGN(malloc(336 * second_dim + 16));
for ( i60 = 0LL; i60 < second_dim; ++i60 )
{
for ( i61 = 0LL; i61 < 84; ++i61 )
{
cos_table[84 * i60 + i61] = cosf(customop_1_29[84 * i60 + i61]);
}
}
float *sin_table = ALIGN(malloc(336 * second_dim + 16));
for ( i62 = 0LL; i62 < second_dim; ++i62 )
{
for ( i63 = 0LL; i63 < 84; ++i63 )
{
sin_table[84 * i62 + i63] = sinf(customop_1_29[84 * i62 + i63]);
}
}
float *customop_2_30 = ALIGN(malloc(336 * second_dim + 16)); // input buffer for next custom operation
for ( i64 = 0LL; i64 < second_dim; ++i64 )
{
for ( i65 = 0LL; i65 < 84; ++i65 )
{
customop_2_30[84 * i64 + i65] = cos_table[84 * i64 + i65] + sin_table[84 * i64 + i65];
}
}
// begin second custom operation
// Custom op 2, part 1: tanh of customop_2_30, using the same clamp bounds,
// fmaf coefficients and 970045207 threshold as the Tanh node loop at the top
// of this function.
// NOTE(review): the author's renaming collapsed several temporaries into v112,
// so the statements marked below are not meaningful as literally written
// (a product that is immediately overwritten, v112 / v112, and a final select
// whose operands are both v112). Read them by analogy with the Tanh node loop:
// numerator = clamped x * p(x2), ratio = numerator / q(x2), then the select
// between clamped x and ratio.
float *customop_2_30_tanh = ALIGN(malloc(336 * second_dim + 16));
for ( i66 = 0LL; i66 < second_dim; ++i66 ) // kinda feels familiar :)
{
for ( i67 = 0LL; i67 < 84; ++i67 ) // variable renaming kinda broke this, but eh, structure is the same
{
// x, then clamp to [-7.9988117, 7.9988117] via compare masks and selects.
v108 = (__m128)LODWORD(customop_2_30[84 * i66 + i67]);
v109 = _mm_cmpgt_ss((__m128)0x40FFF644u, v108);
v110 = _mm_or_ps(_mm_andnot_ps(v109, (__m128)0x40FFF644u), _mm_and_ps(v109, v108));
v111 = _mm_cmpgt_ss(v110, (__m128)0xC0FFF644);
v112 = _mm_or_ps(_mm_andnot_ps(v111, (__m128)0xC0FFF644), _mm_and_ps(v111, v110));
// Unclamped x ANDed with xmmword_55007040 (presumably |x| -- constant not shown).
v165 = _mm_and_si128((__m128i)v108, (__m128i)xmmword_55007040).m128i_u32[0];
// x2 = clamped * clamped, kept in v162[0].
v112.m128_f32[0] = v112.m128_f32[0] * v112.m128_f32[0];
*(__m128 *)v162 = v112;
// Numerator polynomial p(x2) via nested fmaf.
v108.m128_i32[0] = fmaf(v112.m128_f32[0], -2.7607684e-16, 2.0001879e-13);
v108.m128_i32[0] = fmaf(v162[0], v108.m128_f32[0], -8.6046718e-11);
v108.m128_i32[0] = fmaf(v162[0], v108.m128_f32[0], 0.000000051222973);
v112.m128_f32[0] = fmaf(v162[0], v108.m128_f32[0], 0.000014857224);
v112.m128_f32[0] = fmaf(v162[0], v112.m128_f32[0], 0.00063726195);
v112.m128_f32[0] = fmaf(v162[0], v112.m128_f32[0], 0.0048935246);
// Renaming artefact: stands in for "numerator = clamped x * p(x2)".
v112.m128_f32[0] = v112.m128_f32[0] * v112.m128_f32[0];
// Denominator polynomial q(x2) via nested fmaf.
v112.m128_f32[0] = fmaf(v162[0], 0.0000011982584, 0.00011853471);
v112.m128_f32[0] = fmaf(v162[0], v112.m128_f32[0], 0.0022684347);
v112.m128_f32[0] = fmaf(v162[0], v112.m128_f32[0], 0.004893525);
// Renaming artefact: stands in for "ratio = numerator / denominator".
v112.m128_f32[0] = v112.m128_f32[0] / v112.m128_f32[0];
// 970045207 == 0x39D1B717, the bit pattern of 0.0004f; v116 is the mask for
// v165 < 0.0004.
v112.m128_i32[0] = 970045207;
v116 = _mm_cmplt_ss((__m128)v165, v112);
// Renaming artefact: stands in for the select between clamped x and ratio.
customop_2_30_tanh[84 * i66 + i67] =
_mm_andnot_ps(v116, v112).m128_u32[0] | v116.m128_i32[0] & v112.m128_i32[0];
}
}
// Custom op 2, part 2: customop_2_31 = x - tanh(x), element-wise.
float *customop_2_31 = ALIGN(malloc(336 * second_dim + 16)); // input buffer for next custom operation
for ( i68 = 0LL; i68 < second_dim; ++i68 )
{
for ( i69 = 0LL; i69 < 84; ++i69 )
{
customop_2_31[84 * i68 + i69] = customop_2_30[84 * i68 + i69] - customop_2_30_tanh[84 * i68 + i69];
}
}
// begin third custom operation
// Custom op 3: elements below zero are multiplied by 0.0099999998, all others
// pass through unchanged. The choice is made with a compare mask and bitwise
// and / andnot / or rather than a branch.
gemm_inp = ALIGN(malloc(336 * second_dim + 16)); // input for last operation (not custom)
for ( i70 = 0LL; i70 < second_dim; ++i70 )
{
for ( i71 = 0LL; i71 < 84; i71++ )
{
elem = customop_2_31[84 * i70 + i71];
mask = _mm_cmplt_ss(elem, 0); // elem < 0 ? 0xffffffff : 0
gemm_inp[84 * i70 + i71] =
_mm_andnot_ps(mask, elem).m128_u32[0] | // ~mask & elem
mask.m128_i32[0] & COERCE_UNSIGNED_INT(customop_2_31[84 * i70 + i71] * 0.0099999998); // mask & (elem * 0.0099999998), i.e. roughly elem / 100
}
}
// Final dense layer: for every row i72 and each of 10 outputs i73,
//   output[i72][i73] = sum over i74 of
//       gemm_inp[i72][i74] * constant_12[i73][i74]  +  constant_13[i73]
// 40 bytes per row = 10 floats. The raw malloc pointer is kept separately from
// the ALIGN-ed one because both are returned.
float *unaligned_output = malloc(40 * second_dim + 128);
float *output = ALIGN(unaligned_output);
for ( i72 = 0LL; i72 < second_dim; ++i72 ) // GEMM
{
for ( i73 = 0LL; i73 < 10; ++i73 )
{
*tmp_float_ptr = 0.0;
for ( i74 = 0LL; i74 < 84; ++i74 )
*tmp_float_ptr = (gemm_inp[84 * i72 + i74] * constant_12[84 * i73 + i74]) + *tmp_float_ptr;
output[10 * i72 + i73] = *tmp_float_ptr + constant_13[i73];
}
}
// returns some kind of struct
// Sketch of the returned aggregate (not valid C): raw pointer, aligned pointer,
// 0, second_dim, 10, 10, 1.
// NOTE(review): the field order resembles {allocated ptr, aligned ptr, offset,
// sizes[2], strides[2]} for a second_dim x 10 row-major buffer -- TODO confirm.
return struct data {
unaligned_output,
output,
0,
second_dim,
10,
10,
1
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment