Created
August 11, 2022 19:43
-
-
Save rygorous/ba9a5bfbfc3555d7242b8a47c522b093 to your computer and use it in GitHub Desktop.
Intel RSQRTSS logic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#include <stdint.h> | |
#include <emmintrin.h> | |
static inline uint32_t test_func(uint32_t x) | |
{ | |
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(x)); | |
__m128 value = _mm_rsqrt_ss(xs); | |
uint32_t result = _mm_cvtsi128_si32(_mm_castps_si128(value)); | |
return result; | |
} | |
//static uint16_t intel_rsqrt_tab[2048]; | |
static const uint16_t intel_rsqrt_tab[2048] = | |
{ | |
1695,1692,1690,1687,1684,1681,1678,1676,1673,1670,1667,1664,1662,1659,1656,1653, | |
1651,1648,1645,1642,1639,1637,1634,1631,1629,1626,1623,1620,1618,1615,1612,1610, | |
1607,1604,1601,1599,1596,1593,1591,1588,1585,1583,1580,1577,1575,1572,1569,1567, | |
1564,1561,1559,1556,1554,1551,1548,1546,1543,1541,1538,1535,1533,1530,1528,1525, | |
1522,1520,1517,1515,1512,1510,1507,1504,1502,1499,1497,1494,1492,1489,1487,1484, | |
1482,1479,1476,1474,1471,1469,1466,1464,1461,1459,1456,1454,1451,1449,1447,1444, | |
1442,1439,1437,1434,1432,1429,1427,1424,1422,1419,1417,1415,1412,1410,1407,1405, | |
1402,1400,1398,1395,1393,1390,1388,1386,1383,1381,1378,1376,1374,1371,1369,1367, | |
1364,1362,1359,1357,1355,1352,1350,1348,1345,1343,1341,1338,1336,1334,1331,1329, | |
1327,1324,1322,1320,1317,1315,1313,1310,1308,1306,1304,1301,1299,1297,1294,1292, | |
1290,1288,1285,1283,1281,1279,1276,1274,1272,1270,1267,1265,1263,1261,1258,1256, | |
1254,1252,1249,1247,1245,1243,1241,1238,1236,1234,1232,1230,1227,1225,1223,1221, | |
1219,1216,1214,1212,1210,1208,1206,1203,1201,1199,1197,1195,1193,1190,1188,1186, | |
1184,1182,1180,1178,1175,1173,1171,1169,1167,1165,1163,1161,1158,1156,1154,1152, | |
1150,1148,1146,1144,1142,1140,1137,1135,1133,1131,1129,1127,1125,1123,1121,1119, | |
1117,1115,1113,1111,1109,1106,1104,1102,1100,1098,1096,1094,1092,1090,1088,1086, | |
1084,1082,1080,1078,1076,1074,1072,1070,1068,1066,1064,1062,1060,1058,1056,1054, | |
1052,1050,1048,1046,1044,1042,1040,1038,1036,1034,1032,1030,1028,1026,1024,1022, | |
1021,1019,1017,1015,1013,1011,1009,1007,1005,1003,1001, 999, 997, 995, 993, 992, | |
990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 971, 969, 967, 965, 963, 961, | |
959, 957, 956, 954, 952, 950, 948, 946, 944, 942, 941, 939, 937, 935, 933, 931, | |
929, 928, 926, 924, 922, 920, 918, 917, 915, 913, 911, 909, 907, 906, 904, 902, | |
900, 898, 897, 895, 893, 891, 889, 888, 886, 884, 882, 880, 879, 877, 875, 873, | |
871, 870, 868, 866, 864, 862, 861, 859, 857, 855, 854, 852, 850, 848, 847, 845, | |
843, 841, 840, 838, 836, 834, 833, 831, 829, 827, 826, 824, 822, 820, 819, 817, | |
815, 814, 812, 810, 808, 807, 805, 803, 802, 800, 798, 796, 795, 793, 791, 790, | |
788, 786, 785, 783, 781, 779, 778, 776, 774, 773, 771, 769, 768, 766, 764, 763, | |
761, 759, 758, 756, 754, 753, 751, 749, 748, 746, 744, 743, 741, 739, 738, 736, | |
735, 733, 731, 730, 728, 726, 725, 723, 721, 720, 718, 717, 715, 713, 712, 710, | |
709, 707, 705, 704, 702, 700, 699, 697, 696, 694, 692, 691, 689, 688, 686, 684, | |
683, 681, 680, 678, 677, 675, 673, 672, 670, 669, 667, 666, 664, 662, 661, 659, | |
658, 656, 655, 653, 651, 650, 648, 647, 645, 644, 642, 641, 639, 638, 636, 634, | |
633, 631, 630, 628, 627, 625, 624, 622, 621, 619, 618, 616, 615, 613, 611, 610, | |
608, 607, 605, 604, 602, 601, 599, 598, 596, 595, 593, 592, 590, 589, 587, 586, | |
584, 583, 581, 580, 578, 577, 575, 574, 573, 571, 570, 568, 567, 565, 564, 562, | |
561, 559, 558, 556, 555, 553, 552, 550, 549, 548, 546, 545, 543, 542, 540, 539, | |
537, 536, 534, 533, 532, 530, 529, 527, 526, 524, 523, 522, 520, 519, 517, 516, | |
514, 513, 512, 510, 509, 507, 506, 504, 503, 502, 500, 499, 497, 496, 495, 493, | |
492, 490, 489, 488, 486, 485, 483, 482, 481, 479, 478, 476, 475, 474, 472, 471, | |
469, 468, 467, 465, 464, 463, 461, 460, 458, 457, 456, 454, 453, 452, 450, 449, | |
447, 446, 445, 443, 442, 441, 439, 438, 437, 435, 434, 432, 431, 430, 428, 427, | |
426, 424, 423, 422, 420, 419, 418, 416, 415, 414, 412, 411, 410, 408, 407, 406, | |
404, 403, 402, 400, 399, 398, 396, 395, 394, 392, 391, 390, 389, 387, 386, 385, | |
383, 382, 381, 379, 378, 377, 375, 374, 373, 372, 370, 369, 368, 366, 365, 364, | |
363, 361, 360, 359, 357, 356, 355, 354, 352, 351, 350, 348, 347, 346, 345, 343, | |
342, 341, 339, 338, 337, 336, 334, 333, 332, 331, 329, 328, 327, 326, 324, 323, | |
322, 321, 319, 318, 317, 316, 314, 313, 312, 311, 309, 308, 307, 306, 304, 303, | |
302, 301, 299, 298, 297, 296, 294, 293, 292, 291, 290, 288, 287, 286, 285, 283, | |
282, 281, 280, 279, 277, 276, 275, 274, 272, 271, 270, 269, 268, 266, 265, 264, | |
263, 262, 260, 259, 258, 257, 256, 254, 253, 252, 251, 250, 248, 247, 246, 245, | |
244, 242, 241, 240, 239, 238, 237, 235, 234, 233, 232, 231, 229, 228, 227, 226, | |
225, 224, 222, 221, 220, 219, 218, 217, 215, 214, 213, 212, 211, 210, 208, 207, | |
206, 205, 204, 203, 201, 200, 199, 198, 197, 196, 195, 193, 192, 191, 190, 189, | |
188, 187, 185, 184, 183, 182, 181, 180, 179, 177, 176, 175, 174, 173, 172, 171, | |
169, 168, 167, 166, 165, 164, 163, 162, 160, 159, 158, 157, 156, 155, 154, 153, | |
152, 150, 149, 148, 147, 146, 145, 144, 143, 142, 140, 139, 138, 137, 136, 135, | |
134, 133, 132, 130, 129, 128, 127, 126, 125, 124, 123, 122, 121, 120, 118, 117, | |
116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 105, 104, 103, 102, 101, 100, | |
99, 98, 97, 96, 95, 94, 93, 91, 90, 89, 88, 87, 86, 85, 84, 83, | |
82, 81, 80, 79, 78, 77, 76, 74, 73, 72, 71, 70, 69, 68, 67, 66, | |
65, 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 52, 51, 50, 49, | |
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, | |
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, | |
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, | |
4094,4090,4086,4082,4078,4074,4070,4066,4062,4058,4054,4050,4046,4043,4039,4035, | |
4031,4027,4023,4019,4015,4011,4007,4004,4000,3996,3992,3988,3984,3980,3977,3973, | |
3969,3965,3961,3958,3954,3950,3946,3942,3939,3935,3931,3927,3924,3920,3916,3912, | |
3909,3905,3901,3897,3894,3890,3886,3883,3879,3875,3872,3868,3864,3861,3857,3853, | |
3850,3846,3842,3839,3835,3831,3828,3824,3821,3817,3813,3810,3806,3803,3799,3795, | |
3792,3788,3785,3781,3778,3774,3770,3767,3763,3760,3756,3753,3749,3746,3742,3739, | |
3735,3732,3728,3725,3721,3718,3714,3711,3707,3704,3701,3697,3694,3690,3687,3683, | |
3680,3677,3673,3670,3666,3663,3660,3656,3653,3649,3646,3643,3639,3636,3633,3629, | |
3626,3622,3619,3616,3612,3609,3606,3602,3599,3596,3593,3589,3586,3583,3579,3576, | |
3573,3569,3566,3563,3560,3556,3553,3550,3547,3543,3540,3537,3534,3530,3527,3524, | |
3521,3518,3514,3511,3508,3505,3502,3498,3495,3492,3489,3486,3483,3479,3476,3473, | |
3470,3467,3464,3460,3457,3454,3451,3448,3445,3442,3439,3435,3432,3429,3426,3423, | |
3420,3417,3414,3411,3408,3405,3401,3398,3395,3392,3389,3386,3383,3380,3377,3374, | |
3371,3368,3365,3362,3359,3356,3353,3350,3347,3344,3341,3338,3335,3332,3329,3326, | |
3323,3320,3317,3314,3311,3308,3305,3302,3299,3296,3293,3291,3288,3285,3282,3279, | |
3276,3273,3270,3267,3264,3261,3258,3256,3253,3250,3247,3244,3241,3238,3235,3233, | |
3230,3227,3224,3221,3218,3215,3213,3210,3207,3204,3201,3198,3196,3193,3190,3187, | |
3184,3182,3179,3176,3173,3170,3168,3165,3162,3159,3156,3154,3151,3148,3145,3143, | |
3140,3137,3134,3132,3129,3126,3123,3121,3118,3115,3112,3110,3107,3104,3102,3099, | |
3096,3093,3091,3088,3085,3083,3080,3077,3075,3072,3069,3067,3064,3061,3059,3056, | |
3053,3051,3048,3045,3043,3040,3037,3035,3032,3029,3027,3024,3022,3019,3016,3014, | |
3011,3008,3006,3003,3001,2998,2995,2993,2990,2988,2985,2983,2980,2977,2975,2972, | |
2970,2967,2965,2962,2959,2957,2954,2952,2949,2947,2944,2942,2939,2937,2934,2931, | |
2929,2926,2924,2921,2919,2916,2914,2911,2909,2906,2904,2901,2899,2896,2894,2891, | |
2889,2886,2884,2881,2879,2877,2874,2872,2869,2867,2864,2862,2859,2857,2854,2852, | |
2850,2847,2845,2842,2840,2837,2835,2833,2830,2828,2825,2823,2821,2818,2816,2813, | |
2811,2809,2806,2804,2801,2799,2797,2794,2792,2789,2787,2785,2782,2780,2778,2775, | |
2773,2771,2768,2766,2763,2761,2759,2756,2754,2752,2749,2747,2745,2742,2740,2738, | |
2735,2733,2731,2728,2726,2724,2722,2719,2717,2715,2712,2710,2708,2705,2703,2701, | |
2699,2696,2694,2692,2690,2687,2685,2683,2680,2678,2676,2674,2671,2669,2667,2665, | |
2662,2660,2658,2656,2653,2651,2649,2647,2645,2642,2640,2638,2636,2633,2631,2629, | |
2627,2625,2622,2620,2618,2616,2614,2611,2609,2607,2605,2603,2600,2598,2596,2594, | |
2592,2589,2587,2585,2583,2581,2579,2576,2574,2572,2570,2568,2566,2564,2561,2559, | |
2557,2555,2553,2551,2549,2546,2544,2542,2540,2538,2536,2534,2532,2529,2527,2525, | |
2523,2521,2519,2517,2515,2513,2510,2508,2506,2504,2502,2500,2498,2496,2494,2492, | |
2490,2487,2485,2483,2481,2479,2477,2475,2473,2471,2469,2467,2465,2463,2461,2459, | |
2457,2455,2452,2450,2448,2446,2444,2442,2440,2438,2436,2434,2432,2430,2428,2426, | |
2424,2422,2420,2418,2416,2414,2412,2410,2408,2406,2404,2402,2400,2398,2396,2394, | |
2392,2390,2388,2386,2384,2382,2380,2378,2376,2374,2372,2370,2368,2366,2364,2362, | |
2360,2359,2357,2355,2353,2351,2349,2347,2345,2343,2341,2339,2337,2335,2333,2331, | |
2329,2327,2326,2324,2322,2320,2318,2316,2314,2312,2310,2308,2306,2304,2303,2301, | |
2299,2297,2295,2293,2291,2289,2287,2285,2284,2282,2280,2278,2276,2274,2272,2270, | |
2268,2267,2265,2263,2261,2259,2257,2255,2254,2252,2250,2248,2246,2244,2242,2241, | |
2239,2237,2235,2233,2231,2229,2228,2226,2224,2222,2220,2218,2217,2215,2213,2211, | |
2209,2207,2206,2204,2202,2200,2198,2197,2195,2193,2191,2189,2188,2186,2184,2182, | |
2180,2179,2177,2175,2173,2171,2170,2168,2166,2164,2162,2161,2159,2157,2155,2154, | |
2152,2150,2148,2146,2145,2143,2141,2139,2138,2136,2134,2132,2131,2129,2127,2125, | |
2124,2122,2120,2118,2117,2115,2113,2111,2110,2108,2106,2104,2103,2101,2099,2097, | |
2096,2094,2092,2091,2089,2087,2085,2084,2082,2080,2079,2077,2075,2073,2072,2070, | |
2068,2067,2065,2063,2061,2060,2058,2056,2055,2053,2051,2050,2048,2046,2045,2043, | |
2041,2039,2038,2036,2034,2033,2031,2029,2028,2026,2024,2023,2021,2019,2018,2016, | |
2014,2013,2011,2009,2008,2006,2004,2003,2001,2000,1998,1996,1995,1993,1991,1990, | |
1988,1986,1985,1983,1982,1980,1978,1977,1975,1973,1972,1970,1968,1967,1965,1964, | |
1962,1960,1959,1957,1956,1954,1952,1951,1949,1947,1946,1944,1943,1941,1939,1938, | |
1936,1935,1933,1931,1930,1928,1927,1925,1924,1922,1920,1919,1917,1916,1914,1912, | |
1911,1909,1908,1906,1905,1903,1901,1900,1898,1897,1895,1894,1892,1890,1889,1887, | |
1886,1884,1883,1881,1880,1878,1876,1875,1873,1872,1870,1869,1867,1866,1864,1863, | |
1861,1860,1858,1856,1855,1853,1852,1850,1849,1847,1846,1844,1843,1841,1840,1838, | |
1837,1835,1834,1832,1831,1829,1827,1826,1824,1823,1821,1820,1818,1817,1815,1814, | |
1812,1811,1809,1808,1806,1805,1803,1802,1800,1799,1797,1796,1795,1793,1792,1790, | |
1789,1787,1786,1784,1783,1781,1780,1778,1777,1775,1774,1772,1771,1769,1768,1766, | |
1765,1764,1762,1761,1759,1758,1756,1755,1753,1752,1750,1749,1747,1746,1745,1743, | |
1742,1740,1739,1737,1736,1734,1733,1732,1730,1729,1727,1726,1724,1723,1722,1720, | |
1719,1717,1716,1714,1713,1712,1710,1709,1707,1706,1704,1703,1702,1700,1699,1697, | |
}; | |
// x is a 32-bit float bit pattern | |
static uint32_t intel_rsqrt(uint32_t x) | |
{ | |
const uint32_t exp_mask = 0x7f800000u; | |
if (x >= exp_mask) | |
{ | |
// Inf, NaN, or negative | |
if (x == exp_mask) // +Inf -> +0 | |
return 0; | |
else if ((x & exp_mask) == exp_mask) // NaN? | |
return x | 0x00400000u; // set quiet flag | |
else // sign bit set | |
{ | |
if (x < 0x80800000u) // -0 or -subnorm -> -Inf (!) | |
return 0xff800000u; | |
else // other negatives -> qNaN with sign bit set | |
return 0xffc00000u; | |
} | |
} | |
else if (x < 0x00800000u) // 0 or subnormal -> Inf | |
return exp_mask; | |
else | |
{ | |
// We now know that x is positive and normal | |
// Exponent computed directly | |
// | |
// x in [1,2) -> sqrt(x) in [1,sqrt(2)) -> rsqrt(x) in (1/sqrt(2),1] | |
// x in [2,4) -> sqrt(x) in [sqrt(2),2) -> rsqrt(x) in (1/2,1/sqrt(2)] | |
// | |
// Intel rsqrt decides that rsqrt(1) < 1 so the exponents are always the same | |
// for the entire bucket. | |
// | |
// So exponent range [0,1] maps to exponent -1. | |
uint32_t old_biased_exp = x >> 23; | |
uint32_t new_biased_exp = (380 - old_biased_exp) >> 1; | |
uint32_t new_exp = new_biased_exp << 23; | |
// Table lookup from high mantissa bits and exponent LSB | |
// Low 11 bits of table entry are always 0 -> 12-bit entries | |
uint32_t result = new_exp | (intel_rsqrt_tab[(x >> 13) & 0x7ff] << 11); | |
return result; | |
} | |
} | |
int main() | |
{ | |
#if 0 | |
// initialize table | |
printf("static const uint16_t intel_rsqrt_tab[2048] =\n{\n"); | |
for (uint32_t i = 0; i < 2048; ++i) | |
{ | |
__m128 xs = _mm_castsi128_ps(_mm_cvtsi32_si128(0x3f000000 + (i<<13))); | |
__m128 rsqrt = _mm_rsqrt_ss(xs); | |
uint32_t bits = _mm_cvtsi128_si32(_mm_castps_si128(rsqrt)); | |
uint16_t tabval = (bits >> 11) & 0xfff; | |
//intel_rsqrt_tab[i] = tabval; | |
printf("%s%4d,%s", ((i & 15) == 0) ? "\t" : "", tabval, ((i & 15) == 15) ? "\n" : ""); | |
} | |
printf("};\n"); | |
#endif | |
uint32_t x = 0; | |
do | |
{ | |
uint32_t ref_result = test_func(x); | |
uint32_t test_result = intel_rsqrt(x); | |
if (ref_result != test_result) | |
{ | |
printf("! x=0x%08x ref=0x%08x test=0x%08x\n", x, ref_result, test_result); | |
return 1; | |
} | |
++x; | |
} while (x != 0); | |
printf("all ok!\n"); | |
#if 0 | |
// for NaN buckets, verify expected results | |
for (uint32_t sign = 0; sign < 2; ++sign) | |
{ | |
for (uint32_t mant = 0; mant < (1u << 23); ++mant) | |
{ | |
uint32_t x = (sign << 31) | 0x7f800000u | mant; | |
uint32_t result = test_func(x); | |
uint32_t expected; | |
if (sign == 0 && mant == 0) // rsqrt of +Inf is 0 | |
expected = 0; | |
else | |
expected = x | 0x00400000u; // quiet the NaN | |
if (result != expected) | |
{ | |
printf("! x=0x%08x result=0x%08x expected=0x%08x\n", x, result, expected); | |
return 1; | |
} | |
} | |
} | |
// bucket = sign + exp | |
for (uint32_t bucket = 0; bucket < 512; ++bucket) | |
{ | |
uint32_t prev_result = 0; | |
// skip NaN/Inf buckets | |
if ((bucket & 255) == 255) | |
continue; | |
for (uint32_t mant = 0; mant < (1u << 23); ++mant) | |
{ | |
uint32_t x = (bucket << 23) | mant; | |
uint32_t result = test_func(x); | |
if (mant == 0 || result != prev_result) | |
printf("x=0x%08x result=0x%08x\n", x, result); | |
prev_result = result; | |
} | |
} | |
#endif | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment