Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created August 11, 2022 19:43
Show Gist options
  • Save rygorous/ba9a5bfbfc3555d7242b8a47c522b093 to your computer and use it in GitHub Desktop.
Save rygorous/ba9a5bfbfc3555d7242b8a47c522b093 to your computer and use it in GitHub Desktop.
Intel RSQRTSS logic
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <emmintrin.h>
static inline uint32_t test_func(uint32_t x)
{
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x));
__m128 value = _mm_rsqrt_ss(xs);
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value));
return result;
}
//static uint16_t intel_rsqrt_tab[2048];
static const uint16_t intel_rsqrt_tab[2048] =
{
1695,1692,1690,1687,1684,1681,1678,1676,1673,1670,1667,1664,1662,1659,1656,1653,
1651,1648,1645,1642,1639,1637,1634,1631,1629,1626,1623,1620,1618,1615,1612,1610,
1607,1604,1601,1599,1596,1593,1591,1588,1585,1583,1580,1577,1575,1572,1569,1567,
1564,1561,1559,1556,1554,1551,1548,1546,1543,1541,1538,1535,1533,1530,1528,1525,
1522,1520,1517,1515,1512,1510,1507,1504,1502,1499,1497,1494,1492,1489,1487,1484,
1482,1479,1476,1474,1471,1469,1466,1464,1461,1459,1456,1454,1451,1449,1447,1444,
1442,1439,1437,1434,1432,1429,1427,1424,1422,1419,1417,1415,1412,1410,1407,1405,
1402,1400,1398,1395,1393,1390,1388,1386,1383,1381,1378,1376,1374,1371,1369,1367,
1364,1362,1359,1357,1355,1352,1350,1348,1345,1343,1341,1338,1336,1334,1331,1329,
1327,1324,1322,1320,1317,1315,1313,1310,1308,1306,1304,1301,1299,1297,1294,1292,
1290,1288,1285,1283,1281,1279,1276,1274,1272,1270,1267,1265,1263,1261,1258,1256,
1254,1252,1249,1247,1245,1243,1241,1238,1236,1234,1232,1230,1227,1225,1223,1221,
1219,1216,1214,1212,1210,1208,1206,1203,1201,1199,1197,1195,1193,1190,1188,1186,
1184,1182,1180,1178,1175,1173,1171,1169,1167,1165,1163,1161,1158,1156,1154,1152,
1150,1148,1146,1144,1142,1140,1137,1135,1133,1131,1129,1127,1125,1123,1121,1119,
1117,1115,1113,1111,1109,1106,1104,1102,1100,1098,1096,1094,1092,1090,1088,1086,
1084,1082,1080,1078,1076,1074,1072,1070,1068,1066,1064,1062,1060,1058,1056,1054,
1052,1050,1048,1046,1044,1042,1040,1038,1036,1034,1032,1030,1028,1026,1024,1022,
1021,1019,1017,1015,1013,1011,1009,1007,1005,1003,1001, 999, 997, 995, 993, 992,
990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 971, 969, 967, 965, 963, 961,
959, 957, 956, 954, 952, 950, 948, 946, 944, 942, 941, 939, 937, 935, 933, 931,
929, 928, 926, 924, 922, 920, 918, 917, 915, 913, 911, 909, 907, 906, 904, 902,
900, 898, 897, 895, 893, 891, 889, 888, 886, 884, 882, 880, 879, 877, 875, 873,
871, 870, 868, 866, 864, 862, 861, 859, 857, 855, 854, 852, 850, 848, 847, 845,
843, 841, 840, 838, 836, 834, 833, 831, 829, 827, 826, 824, 822, 820, 819, 817,
815, 814, 812, 810, 808, 807, 805, 803, 802, 800, 798, 796, 795, 793, 791, 790,
788, 786, 785, 783, 781, 779, 778, 776, 774, 773, 771, 769, 768, 766, 764, 763,
761, 759, 758, 756, 754, 753, 751, 749, 748, 746, 744, 743, 741, 739, 738, 736,
735, 733, 731, 730, 728, 726, 725, 723, 721, 720, 718, 717, 715, 713, 712, 710,
709, 707, 705, 704, 702, 700, 699, 697, 696, 694, 692, 691, 689, 688, 686, 684,
683, 681, 680, 678, 677, 675, 673, 672, 670, 669, 667, 666, 664, 662, 661, 659,
658, 656, 655, 653, 651, 650, 648, 647, 645, 644, 642, 641, 639, 638, 636, 634,
633, 631, 630, 628, 627, 625, 624, 622, 621, 619, 618, 616, 615, 613, 611, 610,
608, 607, 605, 604, 602, 601, 599, 598, 596, 595, 593, 592, 590, 589, 587, 586,
584, 583, 581, 580, 578, 577, 575, 574, 573, 571, 570, 568, 567, 565, 564, 562,
561, 559, 558, 556, 555, 553, 552, 550, 549, 548, 546, 545, 543, 542, 540, 539,
537, 536, 534, 533, 532, 530, 529, 527, 526, 524, 523, 522, 520, 519, 517, 516,
514, 513, 512, 510, 509, 507, 506, 504, 503, 502, 500, 499, 497, 496, 495, 493,
492, 490, 489, 488, 486, 485, 483, 482, 481, 479, 478, 476, 475, 474, 472, 471,
469, 468, 467, 465, 464, 463, 461, 460, 458, 457, 456, 454, 453, 452, 450, 449,
447, 446, 445, 443, 442, 441, 439, 438, 437, 435, 434, 432, 431, 430, 428, 427,
426, 424, 423, 422, 420, 419, 418, 416, 415, 414, 412, 411, 410, 408, 407, 406,
404, 403, 402, 400, 399, 398, 396, 395, 394, 392, 391, 390, 389, 387, 386, 385,
383, 382, 381, 379, 378, 377, 375, 374, 373, 372, 370, 369, 368, 366, 365, 364,
363, 361, 360, 359, 357, 356, 355, 354, 352, 351, 350, 348, 347, 346, 345, 343,
342, 341, 339, 338, 337, 336, 334, 333, 332, 331, 329, 328, 327, 326, 324, 323,
322, 321, 319, 318, 317, 316, 314, 313, 312, 311, 309, 308, 307, 306, 304, 303,
302, 301, 299, 298, 297, 296, 294, 293, 292, 291, 290, 288, 287, 286, 285, 283,
282, 281, 280, 279, 277, 276, 275, 274, 272, 271, 270, 269, 268, 266, 265, 264,
263, 262, 260, 259, 258, 257, 256, 254, 253, 252, 251, 250, 248, 247, 246, 245,
244, 242, 241, 240, 239, 238, 237, 235, 234, 233, 232, 231, 229, 228, 227, 226,
225, 224, 222, 221, 220, 219, 218, 217, 215, 214, 213, 212, 211, 210, 208, 207,
206, 205, 204, 203, 201, 200, 199, 198, 197, 196, 195, 193, 192, 191, 190, 189,
188, 187, 185, 184, 183, 182, 181, 180, 179, 177, 176, 175, 174, 173, 172, 171,
169, 168, 167, 166, 165, 164, 163, 162, 160, 159, 158, 157, 156, 155, 154, 153,
152, 150, 149, 148, 147, 146, 145, 144, 143, 142, 140, 139, 138, 137, 136, 135,
134, 133, 132, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 118, 117,
116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 105, 104, 103, 102, 101, 100,
99, 98, 97, 96, 95, 94, 93, 91, 90, 89, 88, 87, 86, 85, 84, 83,
82, 81, 80, 79, 78, 77, 76, 74, 73, 72, 71, 70, 69, 68, 67, 66,
65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
4094,4090,4086,4082,4078,4074,4070,4066,4062,4058,4054,4050,4046,4043,4039,4035,
4031,4027,4023,4019,4015,4011,4007,4004,4000,3996,3992,3988,3984,3980,3977,3973,
3969,3965,3961,3958,3954,3950,3946,3942,3939,3935,3931,3927,3924,3920,3916,3912,
3909,3905,3901,3897,3894,3890,3886,3883,3879,3875,3872,3868,3864,3861,3857,3853,
3850,3846,3842,3839,3835,3831,3828,3824,3821,3817,3813,3810,3806,3803,3799,3795,
3792,3788,3785,3781,3778,3774,3770,3767,3763,3760,3756,3753,3749,3746,3742,3739,
3735,3732,3728,3725,3721,3718,3714,3711,3707,3704,3701,3697,3694,3690,3687,3683,
3680,3677,3673,3670,3666,3663,3660,3656,3653,3649,3646,3643,3639,3636,3633,3629,
3626,3622,3619,3616,3612,3609,3606,3602,3599,3596,3593,3589,3586,3583,3579,3576,
3573,3569,3566,3563,3560,3556,3553,3550,3547,3543,3540,3537,3534,3530,3527,3524,
3521,3518,3514,3511,3508,3505,3502,3498,3495,3492,3489,3486,3483,3479,3476,3473,
3470,3467,3464,3460,3457,3454,3451,3448,3445,3442,3439,3435,3432,3429,3426,3423,
3420,3417,3414,3411,3408,3405,3401,3398,3395,3392,3389,3386,3383,3380,3377,3374,
3371,3368,3365,3362,3359,3356,3353,3350,3347,3344,3341,3338,3335,3332,3329,3326,
3323,3320,3317,3314,3311,3308,3305,3302,3299,3296,3293,3291,3288,3285,3282,3279,
3276,3273,3270,3267,3264,3261,3258,3256,3253,3250,3247,3244,3241,3238,3235,3233,
3230,3227,3224,3221,3218,3215,3213,3210,3207,3204,3201,3198,3196,3193,3190,3187,
3184,3182,3179,3176,3173,3170,3168,3165,3162,3159,3156,3154,3151,3148,3145,3143,
3140,3137,3134,3132,3129,3126,3123,3121,3118,3115,3112,3110,3107,3104,3102,3099,
3096,3093,3091,3088,3085,3083,3080,3077,3075,3072,3069,3067,3064,3061,3059,3056,
3053,3051,3048,3045,3043,3040,3037,3035,3032,3029,3027,3024,3022,3019,3016,3014,
3011,3008,3006,3003,3001,2998,2995,2993,2990,2988,2985,2983,2980,2977,2975,2972,
2970,2967,2965,2962,2959,2957,2954,2952,2949,2947,2944,2942,2939,2937,2934,2931,
2929,2926,2924,2921,2919,2916,2914,2911,2909,2906,2904,2901,2899,2896,2894,2891,
2889,2886,2884,2881,2879,2877,2874,2872,2869,2867,2864,2862,2859,2857,2854,2852,
2850,2847,2845,2842,2840,2837,2835,2833,2830,2828,2825,2823,2821,2818,2816,2813,
2811,2809,2806,2804,2801,2799,2797,2794,2792,2789,2787,2785,2782,2780,2778,2775,
2773,2771,2768,2766,2763,2761,2759,2756,2754,2752,2749,2747,2745,2742,2740,2738,
2735,2733,2731,2728,2726,2724,2722,2719,2717,2715,2712,2710,2708,2705,2703,2701,
2699,2696,2694,2692,2690,2687,2685,2683,2680,2678,2676,2674,2671,2669,2667,2665,
2662,2660,2658,2656,2653,2651,2649,2647,2645,2642,2640,2638,2636,2633,2631,2629,
2627,2625,2622,2620,2618,2616,2614,2611,2609,2607,2605,2603,2600,2598,2596,2594,
2592,2589,2587,2585,2583,2581,2579,2576,2574,2572,2570,2568,2566,2564,2561,2559,
2557,2555,2553,2551,2549,2546,2544,2542,2540,2538,2536,2534,2532,2529,2527,2525,
2523,2521,2519,2517,2515,2513,2510,2508,2506,2504,2502,2500,2498,2496,2494,2492,
2490,2487,2485,2483,2481,2479,2477,2475,2473,2471,2469,2467,2465,2463,2461,2459,
2457,2455,2452,2450,2448,2446,2444,2442,2440,2438,2436,2434,2432,2430,2428,2426,
2424,2422,2420,2418,2416,2414,2412,2410,2408,2406,2404,2402,2400,2398,2396,2394,
2392,2390,2388,2386,2384,2382,2380,2378,2376,2374,2372,2370,2368,2366,2364,2362,
2360,2359,2357,2355,2353,2351,2349,2347,2345,2343,2341,2339,2337,2335,2333,2331,
2329,2327,2326,2324,2322,2320,2318,2316,2314,2312,2310,2308,2306,2304,2303,2301,
2299,2297,2295,2293,2291,2289,2287,2285,2284,2282,2280,2278,2276,2274,2272,2270,
2268,2267,2265,2263,2261,2259,2257,2255,2254,2252,2250,2248,2246,2244,2242,2241,
2239,2237,2235,2233,2231,2229,2228,2226,2224,2222,2220,2218,2217,2215,2213,2211,
2209,2207,2206,2204,2202,2200,2198,2197,2195,2193,2191,2189,2188,2186,2184,2182,
2180,2179,2177,2175,2173,2171,2170,2168,2166,2164,2162,2161,2159,2157,2155,2154,
2152,2150,2148,2146,2145,2143,2141,2139,2138,2136,2134,2132,2131,2129,2127,2125,
2124,2122,2120,2118,2117,2115,2113,2111,2110,2108,2106,2104,2103,2101,2099,2097,
2096,2094,2092,2091,2089,2087,2085,2084,2082,2080,2079,2077,2075,2073,2072,2070,
2068,2067,2065,2063,2061,2060,2058,2056,2055,2053,2051,2050,2048,2046,2045,2043,
2041,2039,2038,2036,2034,2033,2031,2029,2028,2026,2024,2023,2021,2019,2018,2016,
2014,2013,2011,2009,2008,2006,2004,2003,2001,2000,1998,1996,1995,1993,1991,1990,
1988,1986,1985,1983,1982,1980,1978,1977,1975,1973,1972,1970,1968,1967,1965,1964,
1962,1960,1959,1957,1956,1954,1952,1951,1949,1947,1946,1944,1943,1941,1939,1938,
1936,1935,1933,1931,1930,1928,1927,1925,1924,1922,1920,1919,1917,1916,1914,1912,
1911,1909,1908,1906,1905,1903,1901,1900,1898,1897,1895,1894,1892,1890,1889,1887,
1886,1884,1883,1881,1880,1878,1876,1875,1873,1872,1870,1869,1867,1866,1864,1863,
1861,1860,1858,1856,1855,1853,1852,1850,1849,1847,1846,1844,1843,1841,1840,1838,
1837,1835,1834,1832,1831,1829,1827,1826,1824,1823,1821,1820,1818,1817,1815,1814,
1812,1811,1809,1808,1806,1805,1803,1802,1800,1799,1797,1796,1795,1793,1792,1790,
1789,1787,1786,1784,1783,1781,1780,1778,1777,1775,1774,1772,1771,1769,1768,1766,
1765,1764,1762,1761,1759,1758,1756,1755,1753,1752,1750,1749,1747,1746,1745,1743,
1742,1740,1739,1737,1736,1734,1733,1732,1730,1729,1727,1726,1724,1723,1722,1720,
1719,1717,1716,1714,1713,1712,1710,1709,1707,1706,1704,1703,1702,1700,1699,1697,
};
// x is a 32-bit float bit pattern
static uint32_t intel_rsqrt(uint32_t x)
{
const uint32_t exp_mask = 0x7f800000u;
if (x >= exp_mask)
{
// Inf, NaN, or negative
if (x == exp_mask) // +Inf -> +0
return 0;
else if ((x & exp_mask) == exp_mask) // NaN?
return x | 0x00400000u; // set quiet flag
else // sign bit set
{
if (x < 0x80800000u) // -0 or -subnorm -> -Inf (!)
return 0xff800000u;
else // other negatives -> qNaN with sign bit set
return 0xffc00000u;
}
}
else if (x < 0x00800000u) // 0 or subnormal -> Inf
return exp_mask;
else
{
// We now know that x is positive and normal
// Exponent computed directly
//
// x in [1,2) -> sqrt(x) in [1,sqrt(2)) -> rsqrt(x) in (1/sqrt(2),1]
// x in [2,4) -> sqrt(x) in [sqrt(2),2) -> rsqrt(x) in (1/2,1/sqrt(2)]
//
// Intel rsqrt decides that rsqrt(1) < 1 so the exponents are always the same
// for the entire bucket.
//
// So exponent range [0,1] maps to exponent -1.
uint32_t old_biased_exp = x >> 23;
uint32_t new_biased_exp = (380 - old_biased_exp) >> 1;
uint32_t new_exp = new_biased_exp << 23;
// Table lookup from high mantissa bits and exponent LSB
// Low 11 bits of table entry are always 0 -> 12-bit entries
uint32_t result = new_exp | (intel_rsqrt_tab[(x >> 13) & 0x7ff] << 11);
return result;
}
}
int main()
{
#if 0
// initialize table
printf("static const uint16_t intel_rsqrt_tab[2048] =\n{\n");
for (uint32_t i = 0; i < 2048; ++i)
{
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(0x3f000000 + (i<<13)));
__m128 rsqrt = _mm_rsqrt_ss(xs);
uint32_t bits = _mm_cvtsi128_si32(_mm_castps_si128(rsqrt));
uint16_t tabval = (bits >> 11) & 0xfff;
//intel_rsqrt_tab[i] = tabval;
printf("%s%4d,%s", ((i & 15) == 0) ? "\t" : "", tabval, ((i & 15) == 15) ? "\n" : "");
}
printf("};\n");
#endif
uint32_t x = 0;
do
{
uint32_t ref_result = test_func(x);
uint32_t test_result = intel_rsqrt(x);
if (ref_result != test_result)
{
printf("! x=0x%08x ref=0x%08x test=0x%08x\n", x, ref_result, test_result);
return 1;
}
++x;
} while (x != 0);
printf("all ok!\n");
#if 0
// for NaN buckets, verify expected results
for (uint32_t sign = 0; sign < 2; ++sign)
{
for (uint32_t mant = 0; mant < (1u << 23); ++mant)
{
uint32_t x = (sign << 31) | 0x7f800000u | mant;
uint32_t result = test_func(x);
uint32_t expected;
if (sign == 0 && mant == 0) // rsqrt of +Inf is 0
expected = 0;
else
expected = x | 0x00400000u; // quiet the NaN
if (result != expected)
{
printf("! x=0x%08x result=0x%08x expected=0x%08x\n", x, result, expected);
return 1;
}
}
}
// bucket = sign + exp
for (uint32_t bucket = 0; bucket < 512; ++bucket)
{
uint32_t prev_result = 0;
// skip NaN/Inf buckets
if ((bucket & 255) == 255)
continue;
for (uint32_t mant = 0; mant < (1u << 23); ++mant)
{
uint32_t x = (bucket << 23) | mant;
uint32_t result = test_func(x);
if (mant == 0 || result != prev_result)
printf("x=0x%08x result=0x%08x\n", x, result);
prev_result = result;
}
}
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment