sh1boot/gist:5732738

## gistfile1.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define TABBITS 8

#if 0
extern uint32_t divfn(uint16_t i, uint16_t j);
#else

#if TABBITS != 8
uint32_t table2[(1 << TABBITS) + 1];
#else/*{{{*/
static const uint32_t table2[256 + 1] __attribute__((progmem)) = {
0xffffffff, 0x00000000, 0x80000000, 0x55555555, 0x40000000, 0x33333333, 0x2aaaaaab, 0x24924925, 0x20000000, 0x1c71c71c, 0x1999999a, 0x1745d174, 0x15555555, 0x13b13b14, 0x12492492, 0x11111111,
0x10000000, 0x0f0f0f0f, 0x0e38e38e, 0x0d79435e, 0x0ccccccd, 0x0c30c30c, 0x0ba2e8ba, 0x0b21642d, 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, 0x09249249, 0x08d3dcb1, 0x08888889, 0x08421084,
0x08000000, 0x07c1f07c, 0x07878788, 0x07507507, 0x071c71c7, 0x06eb3e45, 0x06bca1af, 0x06906907, 0x06666666, 0x063e7064, 0x06186186, 0x05f417d0, 0x05d1745d, 0x05b05b06, 0x0590b216, 0x0572620b,
0x05555555, 0x0539782a, 0x051eb852, 0x05050505, 0x04ec4ec5, 0x04d4873f, 0x04bda12f, 0x04a7904a, 0x04924925, 0x047dc11f, 0x0469ee58, 0x0456c798, 0x04444444, 0x04325c54, 0x04210842, 0x04104104,
0x04000000, 0x03f03f04, 0x03e0f83e, 0x03d22635, 0x03c3c3c4, 0x03b5cc0f, 0x03a83a84, 0x039b0ad1, 0x038e38e4, 0x0381c0e0, 0x03759f23, 0x0369d037, 0x035e50d8, 0x03531dec, 0x03483483, 0x033d91d3,
0x03333333, 0x03291620, 0x031f3832, 0x03159722, 0x030c30c3, 0x03030303, 0x02fa0be8, 0x02f14990, 0x02e8ba2f, 0x02e05c0c, 0x02d82d83, 0x02d02d03, 0x02c8590b, 0x02c0b02c, 0x02b93105, 0x02b1da46,
0x02aaaaab, 0x02a3a0fd, 0x029cbc15, 0x0295fad4, 0x028f5c29, 0x0288df0d, 0x02828283, 0x027c4598, 0x02762762, 0x02702702, 0x026a439f, 0x02647c69, 0x025ed098, 0x02593f6a, 0x0253c825, 0x024e6a17,
0x02492492, 0x0243f6f0, 0x023ee090, 0x0239e0d6, 0x0234f72c, 0x02302302, 0x022b63cc, 0x0226b902, 0x02222222, 0x021d9ead, 0x02192e2a, 0x0214d021, 0x02108421, 0x020c49ba, 0x02082082, 0x02040810,
0x02000000, 0x01fc07f0, 0x01f81f82, 0x01f4465a, 0x01f07c1f, 0x01ecc07b, 0x01e9131b, 0x01e573ad, 0x01e1e1e2, 0x01de5d6e, 0x01dae607, 0x01d77b65, 0x01d41d42, 0x01d0cb59, 0x01cd8569, 0x01ca4b30,
0x01c71c72, 0x01c3f8f0, 0x01c0e070, 0x01bdd2b9, 0x01bacf91, 0x01b7d6c4, 0x01b4e81b, 0x01b20364, 0x01af286c, 0x01ac5702, 0x01a98ef6, 0x01a6d01a, 0x01a41a42, 0x01a16d40, 0x019ec8e9, 0x019c2d15,
0x0199999a, 0x01970e50, 0x01948b10, 0x01920fb5, 0x018f9c19, 0x018d3019, 0x018acb91, 0x01886e5f, 0x01861862, 0x0183c978, 0x01818182, 0x017f4060, 0x017d05f4, 0x017ad221, 0x0178a4c8, 0x01767dce,
0x01745d17, 0x01724288, 0x01702e06, 0x016e1f77, 0x016c16c1, 0x016a13cd, 0x01681681, 0x01661ec7, 0x01642c86, 0x01623fa7, 0x01605816, 0x015e75bc, 0x015c9883, 0x015ac057, 0x0158ed23, 0x01571ed4,
0x01555555, 0x01539095, 0x0151d07f, 0x01501501, 0x014e5e0a, 0x014cab88, 0x014afd6a, 0x0149539e, 0x0147ae14, 0x01460cbc, 0x01446f86, 0x0142d662, 0x01414141, 0x013fb014, 0x013e22cc, 0x013c995a,
0x013b13b1, 0x013991c3, 0x01381381, 0x013698df, 0x013521d0, 0x0133ae46, 0x01323e35, 0x0130d190, 0x012f684c, 0x012e025c, 0x012c9fb5, 0x012b404b, 0x0129e413, 0x01288b01, 0x0127350c, 0x0125e227,
0x01249249, 0x01234568, 0x0121fb78, 0x0120b471, 0x011f7048, 0x011e2ef4, 0x011cf06b, 0x011bb4a4, 0x011a7b96, 0x01194538, 0x01181181, 0x0116e069, 0x0115b1e6, 0x011485f1, 0x01135c81, 0x0112358e,
0x01111111, 0x010fef01, 0x010ecf57, 0x010db20b, 0x010c9715, 0x010b7e6f, 0x010a6811, 0x010953f4, 0x01084211, 0x01073261, 0x010624dd, 0x0105197f, 0x01041041, 0x0103091b, 0x01020408, 0x01010101,
0x01000000,
};
#endif/*}}}*/

uint32_t divfn(uint16_t i, uint16_t j)
{
    uint32_t t;
    uint8_t shift = 0;
    uint8_t k;

    if (j <= 1)
        return (int32_t)i << 16;

    /* if j is too large to index the table, then we just shift it right until
     * it's within bounds.  In this case, we'll use the fractional part (k) for
     * linear interpolation, and the integer part for the index.
     */
    t = (uint32_t)j << 8;
    while (t >= ((uint32_t)1 << (TABBITS + 8)))
    {
        t >>= 1;
        shift++;
    }
    j = t >> 8;
    k = (uint8_t)t;

    /* look up (0x80000000 / j) */
    t = table2[j];
    if (k)
    {
        /* Linearly interpolate between adjacent table entries.  Note that this
         * works well because the set of values which are large and need to be
         * shifted down and interpolated represent an especially linear part of
         * the table.
         */
        uint16_t tmp;
        uint32_t n = table2[j + 1] - t;
        uint32_t m;
        tmp = (uint16_t)((uint8_t)(n >> 0)) * k;
        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 8)) * k;
        m = (uint8_t)tmp;
        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 16)) * k;
        m |= (uint32_t)((uint8_t)tmp) << 8;
        tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 24)) * k;
        m |= (uint32_t)((uint8_t)tmp) << 16;
        t += m;
    }

    /* compute i * (0x100000000/j) >> 16 for a 16.16 fixed-point value i/j.
     * If we had to shift j right, earlier, then we'll shift the result right
     * as well, because larger should give us a smaller result.
     */
    {
        uint16_t tmp;
        uint8_t b[4];
        uint16_t hi;
#if 0 /* correct, but ineffective */
        tmp = ((uint16_t)((uint8_t)i) * (uint8_t)t) >> 8;
#else
        tmp = 0x80;
#endif
        tmp += (uint16_t)((uint8_t)i) * (uint8_t)(t >> 8);
        hi = (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)t;
        if (shift < 1)
            hi += 0x80;
        tmp += (uint8_t)hi;
        tmp >>= 8;
        tmp += hi >> 8;

        tmp += ((uint8_t)i) * (uint8_t)(t >> 16);
        b[0] = (uint8_t)tmp;
        tmp >>= 8;
        tmp += ((uint8_t)i) * (uint8_t)(t >> 24);
        b[1] = (uint8_t)tmp;
        b[2] = (uint8_t)(tmp >> 8);

        tmp = b[0] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 8);
        b[0] = (uint8_t)tmp;
        tmp >>= 8;
        tmp += b[1] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 16);
        b[1] = (uint8_t)tmp;
        tmp >>= 8;
        tmp += b[2] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 24);
        b[2] = (uint8_t)tmp;
        b[3] = (uint8_t)(tmp >> 8);

        t = b[0] | (uint32_t)b[1] << 8 | (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
    }
    if (shift == 0)
        return t;

    while (shift-- > 1)
        t >>= 1;
    t++;
    return t >> 1;
}
#endif

int main(void)
{
    unsigned long i, j;
    uint64_t err = 0;

#if TABBITS != 8
    printf("static const uint32_t table2[%d + 1] __attribute__((progmem)) = {\n0xffffffff, ", 1 << TABBITS);
    for (j = 1; j < (1 << TABBITS) + 1; j++)
    {
        table2[j] = (0x100000000u + (j / 2)) / j;
        printf("0x%08x,%c", table2[j], ((j & 15) == 15) ? '\n' : ' ');
    }
    printf("\n}\n");
#endif

    for (j = 1; j < 65536; j += 3)
        for (i = 0; i < 65536; i += 3)
        {
            uint32_t ref = (uint32_t)(((uint32_t)i << 16) + (j / 2)) / j;
#if 0
            uint32_t tabent = (0x80000000 + (j / 2)) / j;
            uint32_t test = ((uint64_t)i * tabent + 0x4000) >> 15;
#else
            uint32_t test = divfn(i, j);
#endif
            int32_t e = (int32_t)(ref - test);
            if (e > 1)
                printf("%u/%u==%lu (0x%lx) (not %lu (0x%lx)) error: %ld\n", (unsigned)i, (unsigned)j, ref, ref, test, test, e);
            err += abs(e);
        }
    return 0;
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>

	#define TABBITS 8

	#if 0
	extern uint32_t divfn(uint16_t i, uint16_t j);
	#else

	#if TABBITS != 8
	uint32_t table2[(1 << TABBITS) + 1];
	#else/{{{/
	static const uint32_t table2[256 + 1] __attribute__((progmem)) = {
	0xffffffff, 0x00000000, 0x80000000, 0x55555555, 0x40000000, 0x33333333, 0x2aaaaaab, 0x24924925, 0x20000000, 0x1c71c71c, 0x1999999a, 0x1745d174, 0x15555555, 0x13b13b14, 0x12492492, 0x11111111,
	0x10000000, 0x0f0f0f0f, 0x0e38e38e, 0x0d79435e, 0x0ccccccd, 0x0c30c30c, 0x0ba2e8ba, 0x0b21642d, 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, 0x09249249, 0x08d3dcb1, 0x08888889, 0x08421084,
	0x08000000, 0x07c1f07c, 0x07878788, 0x07507507, 0x071c71c7, 0x06eb3e45, 0x06bca1af, 0x06906907, 0x06666666, 0x063e7064, 0x06186186, 0x05f417d0, 0x05d1745d, 0x05b05b06, 0x0590b216, 0x0572620b,
	0x05555555, 0x0539782a, 0x051eb852, 0x05050505, 0x04ec4ec5, 0x04d4873f, 0x04bda12f, 0x04a7904a, 0x04924925, 0x047dc11f, 0x0469ee58, 0x0456c798, 0x04444444, 0x04325c54, 0x04210842, 0x04104104,
	0x04000000, 0x03f03f04, 0x03e0f83e, 0x03d22635, 0x03c3c3c4, 0x03b5cc0f, 0x03a83a84, 0x039b0ad1, 0x038e38e4, 0x0381c0e0, 0x03759f23, 0x0369d037, 0x035e50d8, 0x03531dec, 0x03483483, 0x033d91d3,
	0x03333333, 0x03291620, 0x031f3832, 0x03159722, 0x030c30c3, 0x03030303, 0x02fa0be8, 0x02f14990, 0x02e8ba2f, 0x02e05c0c, 0x02d82d83, 0x02d02d03, 0x02c8590b, 0x02c0b02c, 0x02b93105, 0x02b1da46,
	0x02aaaaab, 0x02a3a0fd, 0x029cbc15, 0x0295fad4, 0x028f5c29, 0x0288df0d, 0x02828283, 0x027c4598, 0x02762762, 0x02702702, 0x026a439f, 0x02647c69, 0x025ed098, 0x02593f6a, 0x0253c825, 0x024e6a17,
	0x02492492, 0x0243f6f0, 0x023ee090, 0x0239e0d6, 0x0234f72c, 0x02302302, 0x022b63cc, 0x0226b902, 0x02222222, 0x021d9ead, 0x02192e2a, 0x0214d021, 0x02108421, 0x020c49ba, 0x02082082, 0x02040810,
	0x02000000, 0x01fc07f0, 0x01f81f82, 0x01f4465a, 0x01f07c1f, 0x01ecc07b, 0x01e9131b, 0x01e573ad, 0x01e1e1e2, 0x01de5d6e, 0x01dae607, 0x01d77b65, 0x01d41d42, 0x01d0cb59, 0x01cd8569, 0x01ca4b30,
	0x01c71c72, 0x01c3f8f0, 0x01c0e070, 0x01bdd2b9, 0x01bacf91, 0x01b7d6c4, 0x01b4e81b, 0x01b20364, 0x01af286c, 0x01ac5702, 0x01a98ef6, 0x01a6d01a, 0x01a41a42, 0x01a16d40, 0x019ec8e9, 0x019c2d15,
	0x0199999a, 0x01970e50, 0x01948b10, 0x01920fb5, 0x018f9c19, 0x018d3019, 0x018acb91, 0x01886e5f, 0x01861862, 0x0183c978, 0x01818182, 0x017f4060, 0x017d05f4, 0x017ad221, 0x0178a4c8, 0x01767dce,
	0x01745d17, 0x01724288, 0x01702e06, 0x016e1f77, 0x016c16c1, 0x016a13cd, 0x01681681, 0x01661ec7, 0x01642c86, 0x01623fa7, 0x01605816, 0x015e75bc, 0x015c9883, 0x015ac057, 0x0158ed23, 0x01571ed4,
	0x01555555, 0x01539095, 0x0151d07f, 0x01501501, 0x014e5e0a, 0x014cab88, 0x014afd6a, 0x0149539e, 0x0147ae14, 0x01460cbc, 0x01446f86, 0x0142d662, 0x01414141, 0x013fb014, 0x013e22cc, 0x013c995a,
	0x013b13b1, 0x013991c3, 0x01381381, 0x013698df, 0x013521d0, 0x0133ae46, 0x01323e35, 0x0130d190, 0x012f684c, 0x012e025c, 0x012c9fb5, 0x012b404b, 0x0129e413, 0x01288b01, 0x0127350c, 0x0125e227,
	0x01249249, 0x01234568, 0x0121fb78, 0x0120b471, 0x011f7048, 0x011e2ef4, 0x011cf06b, 0x011bb4a4, 0x011a7b96, 0x01194538, 0x01181181, 0x0116e069, 0x0115b1e6, 0x011485f1, 0x01135c81, 0x0112358e,
	0x01111111, 0x010fef01, 0x010ecf57, 0x010db20b, 0x010c9715, 0x010b7e6f, 0x010a6811, 0x010953f4, 0x01084211, 0x01073261, 0x010624dd, 0x0105197f, 0x01041041, 0x0103091b, 0x01020408, 0x01010101,
	0x01000000,
	};
	#endif/}}}/

	uint32_t divfn(uint16_t i, uint16_t j)
	{
	uint32_t t;
	uint8_t shift = 0;
	uint8_t k;

	if (j <= 1)
	return (int32_t)i << 16;

	/* if j is too large to index the table, then we just shift it right until
	* it's within bounds. In this case, we'll use the fractional part (k) for
	* linear interpolation, and the integer part for the index.
	*/
	t = (uint32_t)j << 8;
	while (t >= ((uint32_t)1 << (TABBITS + 8)))
	{
	t >>= 1;
	shift++;
	}
	j = t >> 8;
	k = (uint8_t)t;

	/* look up (0x80000000 / j) */
	t = table2[j];
	if (k)
	{
	/* Linearly interpolate between adjacent table entries. Note that this
	* works well because the set of values which are large and need to be
	* shifted down and interpolated represent an especially linear part of
	* the table.
	*/
	uint16_t tmp;
	uint32_t n = table2[j + 1] - t;
	uint32_t m;
	tmp = (uint16_t)((uint8_t)(n >> 0)) * k;
	tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 8)) * k;
	m = (uint8_t)tmp;
	tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 16)) * k;
	m \|= (uint32_t)((uint8_t)tmp) << 8;
	tmp = (tmp >> 8) + (uint16_t)((uint8_t)(n >> 24)) * k;
	m \|= (uint32_t)((uint8_t)tmp) << 16;
	t += m;
	}

	/* compute i * (0x100000000/j) >> 16 for a 16.16 fixed-point value i/j.
	* If we had to shift j right, earlier, then we'll shift the result right
	* as well, because larger should give us a smaller result.
	*/
	{
	uint16_t tmp;
	uint8_t b[4];
	uint16_t hi;
	#if 0 /* correct, but ineffective */
	tmp = ((uint16_t)((uint8_t)i) * (uint8_t)t) >> 8;
	#else
	tmp = 0x80;
	#endif
	tmp += (uint16_t)((uint8_t)i) * (uint8_t)(t >> 8);
	hi = (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)t;
	if (shift < 1)
	hi += 0x80;
	tmp += (uint8_t)hi;
	tmp >>= 8;
	tmp += hi >> 8;

	tmp += ((uint8_t)i) * (uint8_t)(t >> 16);
	b[0] = (uint8_t)tmp;
	tmp >>= 8;
	tmp += ((uint8_t)i) * (uint8_t)(t >> 24);
	b[1] = (uint8_t)tmp;
	b[2] = (uint8_t)(tmp >> 8);

	tmp = b[0] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 8);
	b[0] = (uint8_t)tmp;
	tmp >>= 8;
	tmp += b[1] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 16);
	b[1] = (uint8_t)tmp;
	tmp >>= 8;
	tmp += b[2] + (uint16_t)((uint8_t)(i >> 8)) * (uint8_t)(t >> 24);
	b[2] = (uint8_t)tmp;
	b[3] = (uint8_t)(tmp >> 8);

	t = b[0] \| (uint32_t)b[1] << 8 \| (uint32_t)b[2] << 16 \| (uint32_t)b[3] << 24;
	}
	if (shift == 0)
	return t;

	while (shift-- > 1)
	t >>= 1;
	t++;
	return t >> 1;
	}
	#endif

	int main(void)
	{
	unsigned long i, j;
	uint64_t err = 0;

	#if TABBITS != 8
	printf("static const uint32_t table2[%d + 1] __attribute__((progmem)) = {\n0xffffffff, ", 1 << TABBITS);
	for (j = 1; j < (1 << TABBITS) + 1; j++)
	{
	table2[j] = (0x100000000u + (j / 2)) / j;
	printf("0x%08x,%c", table2[j], ((j & 15) == 15) ? '\n' : ' ');
	}
	printf("\n}\n");
	#endif

	for (j = 1; j < 65536; j += 3)
	for (i = 0; i < 65536; i += 3)
	{
	uint32_t ref = (uint32_t)(((uint32_t)i << 16) + (j / 2)) / j;
	#if 0
	uint32_t tabent = (0x80000000 + (j / 2)) / j;
	uint32_t test = ((uint64_t)i * tabent + 0x4000) >> 15;
	#else
	uint32_t test = divfn(i, j);
	#endif
	int32_t e = (int32_t)(ref - test);
	if (e > 1)
	printf("%u/%u==%lu (0x%lx) (not %lu (0x%lx)) error: %ld\n", (unsigned)i, (unsigned)j, ref, ref, test, test, e);
	err += abs(e);
	}
	return 0;
	}