chriso/prefix_varint.h

## prefix_varint.h
/**
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * For more information, please refer to <http://unlicense.org/>
 */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#ifdef VARINT_SMALL_BIAS
# define LIKELY(e) __builtin_expect((e), 1)
#endif

const size_t varint_max_len = 9;

size_t varint_write_len(uint64_t n) {
    int bits = 64 - __builtin_clzll(n | 1);
    return bits > 56 ? varint_max_len : (size_t)(1 + (bits - 1) / 7);
}

size_t varint_read_len(const uint8_t *buf) {
    return 1 + (size_t)__builtin_ctz((uint32_t)*buf | 0x100);
}

uint8_t *varint_encode(uint8_t *buf, size_t limit, uint64_t n) {
#ifdef VARINT_SMALL_BIAS
    if (LIKELY(limit && n < 0x80)) {
        *buf = (uint8_t)(2 * n + 1);
        return buf + 1;
    }
#endif
    size_t len = varint_write_len(n);
    if (len > limit)
        return NULL;
    if (len == varint_max_len) {
        *buf++ = 0;
        len = 8;
    } else {
        n = (2 * n + 1) << (len - 1);
    }
    // when encoding a sequence of varints, the branch predictor
    // will see/predict limit>=8 until the end of the sequence
    memcpy(buf, &n, limit >= 8 ? 8 : len);
    return buf + len;
}

const uint8_t *varint_decode(const uint8_t *buf, size_t limit, uint64_t *n) {
#ifdef VARINT_SMALL_BIAS
    if (LIKELY(limit && (*buf & 1))) {
        *n = *buf >> 1;
        return buf + 1;
    }
#endif
    size_t len = varint_read_len(buf);
    if (len > limit)
        return NULL;
    if (len == varint_max_len) {
        memcpy(n, buf + 1, 8);
    } else {
        // when decoding a sequence of varints, the branch predictor
        // will see/predict limit>=8 until the end of the sequence
        memcpy(n, buf, limit >= 8 ? 8 : len);
        *n &= ~0ULL >> (8 * (8 - len));
        *n >>= len;
    }
    return buf + len;
}

size_t varint_array_max_size(size_t len) {
    return len * varint_max_len;
}

uint8_t *varint_array_encode(const uint64_t *in, size_t len, uint8_t *out,
                             size_t out_len) {
    const uint8_t *end = out + out_len;
    for (size_t i = 0; out && i < len; i++)
        out = varint_encode(out, (size_t)(end - out), in[i]);
    return out;
}

const uint8_t *varint_array_decode(const uint8_t *in, size_t len, uint64_t *out,
                                   size_t out_len, size_t *written) {
    const uint8_t *end = in + len;
    size_t i = 0;
    for (; in && in < end && i < out_len; i++)
        in = varint_decode(in, (size_t)(end - in), &out[i]);
    *written = i;
    return in;
}

int64_t varint_array_count(const uint8_t *in, size_t len) {
    int64_t count = 0;
    const uint8_t *end = in + len;
    for (; in < end; in += varint_read_len(in), count++)
        ;
    return in == end ? count : -1;
}

## prefix_varint_test.c
#include <assert.h>
#include <stdlib.h>

#include "prefix_varint.h"

static struct {
    uint64_t in;
    uint64_t out1;
    uint64_t out2;
    size_t len;
} fixtures[] = {
    {0, 1, 0, 1},
    {1, 3, 0, 1},
    {2, 5, 0, 1},
    {0x7F, 0xFF, 0, 1},
    {0x80, 0x202, 0, 2},
    {0x81, 0x206, 0, 2},
    {0x3FFF, 0xFFFE, 0, 2},
    {0x4000, 0x20004, 0, 3},
    {0x4001, 0x2000C, 0, 3},
    {0x1FFFFF, 0xFFFFFC, 0, 3},
    {0x200000, 0x2000008, 0, 4},
    {0x200001, 0x2000018, 0, 4},
    {0xFFFFFFF, 0xFFFFFFF8, 0, 4},
    {0x10000000, 0x200000010, 0, 5},
    {0x10000001, 0x200000030, 0, 5},
    {0x7FFFFFFFF, 0xFFFFFFFFF0, 0, 5},
    {0x800000000, 0x20000000020, 0, 6},
    {0x800000001, 0x20000000060, 0, 6},
    {0x3FFFFFFFFFF, 0xFFFFFFFFFFE0, 0, 6},
    {0x40000000000, 0x2000000000040, 0, 7},
    {0x40000000001, 0x20000000000C0, 0, 7},
    {0x1FFFFFFFFFFFF, 0xFFFFFFFFFFFFC0, 0, 7},
    {0x2000000000000, 0x200000000000080, 0, 8},
    {0x2000000000001, 0x200000000000180, 0, 8},
    {0xFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFF80, 0, 8},
    {0x100000000000000, 0, 1, 9},
    {0x100000000000001, 0x100, 1, 9},
    {0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFF00, 0xFF, 9},
};

int main() {
    for (size_t i = 0; i < sizeof(fixtures) / sizeof(fixtures[0]); i++) {
        uint64_t n = 0, out[] = {0, 0};
        uint8_t *buf = (uint8_t *)out;

        // encode
        for (size_t j = 0; j <= 16; j++) {
            uint8_t *pos = varint_encode(buf, j, fixtures[i].in);
            assert((pos == NULL) == (j < fixtures[i].len));
            if (j >= fixtures[i].len) {
                assert(out[0] == fixtures[i].out1 &&
                       out[1] == fixtures[i].out2);
                assert((size_t)(pos - buf) == fixtures[i].len);
            }
        }

        // decode
        for (size_t j = 0; j <= 16; j++) {
            const uint8_t *pos = varint_decode(buf, j, &n);
            assert((pos == NULL) == (j < fixtures[i].len));
            if (j >= fixtures[i].len) {
                assert(n == fixtures[i].in);
                assert((size_t)(pos - buf) == fixtures[i].len);
            }
        }
    }

    size_t mb = 1;
    size_t size = mb * 1024 * 1024;
    size_t count = size / sizeof(uint64_t);
    uint64_t *input = calloc(1, size);
    uint64_t *copy = calloc(1, size);
    uint8_t *packed = calloc(count, varint_max_len);
    assert(input && copy && packed);

    size_t max_bytes = 8;
    size_t packed_len = 0;
    for (size_t i = 0; i < count; i++) {
        size_t bytes = (size_t)rand() % max_bytes;
        for (size_t j = 0; j < bytes; j++) {
            input[i] <<= 8;
            input[i] |= (uint64_t)rand() & 0xFF;
        }
        packed_len += varint_write_len(input[i]);
    }

    uint8_t *pos =
        varint_array_encode(input, count, packed, count * varint_max_len);
    assert(pos && pos == packed + packed_len);

    assert(varint_array_count(packed, packed_len) == (int64_t)count);

    size_t out_len = 0;
    const uint8_t *decode_pos =
        varint_array_decode(packed, packed_len, copy, count, &out_len);
    assert(decode_pos && decode_pos == packed + packed_len);
    assert(out_len == count);
    assert(memcmp(input, copy, size) == 0);
}
	/**
	* This is free and unencumbered software released into the public domain.
	*
	* Anyone is free to copy, modify, publish, use, compile, sell, or
	* distribute this software, either in source code form or as a compiled
	* binary, for any purpose, commercial or non-commercial, and by any
	* means.
	*
	* In jurisdictions that recognize copyright laws, the author or authors
	* of this software dedicate any and all copyright interest in the
	* software to the public domain. We make this dedication for the benefit
	* of the public at large and to the detriment of our heirs and
	* successors. We intend this dedication to be an overt act of
	* relinquishment in perpetuity of all present and future rights to this
	* software under copyright law.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	* OTHER DEALINGS IN THE SOFTWARE.
	*
	* For more information, please refer to <http://unlicense.org/>
	*/

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#ifdef VARINT_SMALL_BIAS
	# define LIKELY(e) __builtin_expect((e), 1)
	#endif

	const size_t varint_max_len = 9;

	size_t varint_write_len(uint64_t n) {
	int bits = 64 - __builtin_clzll(n \| 1);
	return bits > 56 ? varint_max_len : (size_t)(1 + (bits - 1) / 7);
	}

	size_t varint_read_len(const uint8_t *buf) {
	return 1 + (size_t)__builtin_ctz((uint32_t)*buf \| 0x100);
	}

	uint8_t varint_encode(uint8_t buf, size_t limit, uint64_t n) {
	#ifdef VARINT_SMALL_BIAS
	if (LIKELY(limit && n < 0x80)) {
	buf = (uint8_t)(2 n + 1);
	return buf + 1;
	}
	#endif
	size_t len = varint_write_len(n);
	if (len > limit)
	return NULL;
	if (len == varint_max_len) {
	*buf++ = 0;
	len = 8;
	} else {
	n = (2 * n + 1) << (len - 1);
	}
	// when encoding a sequence of varints, the branch predictor
	// will see/predict limit>=8 until the end of the sequence
	memcpy(buf, &n, limit >= 8 ? 8 : len);
	return buf + len;
	}

	const uint8_t varint_decode(const uint8_t buf, size_t limit, uint64_t *n) {
	#ifdef VARINT_SMALL_BIAS
	if (LIKELY(limit && (*buf & 1))) {
	n = buf >> 1;
	return buf + 1;
	}
	#endif
	size_t len = varint_read_len(buf);
	if (len > limit)
	return NULL;
	if (len == varint_max_len) {
	memcpy(n, buf + 1, 8);
	} else {
	// when decoding a sequence of varints, the branch predictor
	// will see/predict limit>=8 until the end of the sequence
	memcpy(n, buf, limit >= 8 ? 8 : len);
	n &= ~0ULL >> (8 (8 - len));
	*n >>= len;
	}
	return buf + len;
	}

	size_t varint_array_max_size(size_t len) {
	return len * varint_max_len;
	}

	uint8_t varint_array_encode(const uint64_t in, size_t len, uint8_t *out,
	size_t out_len) {
	const uint8_t *end = out + out_len;
	for (size_t i = 0; out && i < len; i++)
	out = varint_encode(out, (size_t)(end - out), in[i]);
	return out;
	}

	const uint8_t varint_array_decode(const uint8_t in, size_t len, uint64_t *out,
	size_t out_len, size_t *written) {
	const uint8_t *end = in + len;
	size_t i = 0;
	for (; in && in < end && i < out_len; i++)
	in = varint_decode(in, (size_t)(end - in), &out[i]);
	*written = i;
	return in;
	}

	int64_t varint_array_count(const uint8_t *in, size_t len) {
	int64_t count = 0;
	const uint8_t *end = in + len;
	for (; in < end; in += varint_read_len(in), count++)
	;
	return in == end ? count : -1;
	}
	#include <assert.h>
	#include <stdlib.h>

	#include "prefix_varint.h"

	static struct {
	uint64_t in;
	uint64_t out1;
	uint64_t out2;
	size_t len;
	} fixtures[] = {
	{0, 1, 0, 1},
	{1, 3, 0, 1},
	{2, 5, 0, 1},
	{0x7F, 0xFF, 0, 1},
	{0x80, 0x202, 0, 2},
	{0x81, 0x206, 0, 2},
	{0x3FFF, 0xFFFE, 0, 2},
	{0x4000, 0x20004, 0, 3},
	{0x4001, 0x2000C, 0, 3},
	{0x1FFFFF, 0xFFFFFC, 0, 3},
	{0x200000, 0x2000008, 0, 4},
	{0x200001, 0x2000018, 0, 4},
	{0xFFFFFFF, 0xFFFFFFF8, 0, 4},
	{0x10000000, 0x200000010, 0, 5},
	{0x10000001, 0x200000030, 0, 5},
	{0x7FFFFFFFF, 0xFFFFFFFFF0, 0, 5},
	{0x800000000, 0x20000000020, 0, 6},
	{0x800000001, 0x20000000060, 0, 6},
	{0x3FFFFFFFFFF, 0xFFFFFFFFFFE0, 0, 6},
	{0x40000000000, 0x2000000000040, 0, 7},
	{0x40000000001, 0x20000000000C0, 0, 7},
	{0x1FFFFFFFFFFFF, 0xFFFFFFFFFFFFC0, 0, 7},
	{0x2000000000000, 0x200000000000080, 0, 8},
	{0x2000000000001, 0x200000000000180, 0, 8},
	{0xFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFF80, 0, 8},
	{0x100000000000000, 0, 1, 9},
	{0x100000000000001, 0x100, 1, 9},
	{0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFF00, 0xFF, 9},
	};

	int main() {
	for (size_t i = 0; i < sizeof(fixtures) / sizeof(fixtures[0]); i++) {
	uint64_t n = 0, out[] = {0, 0};
	uint8_t buf = (uint8_t )out;

	// encode
	for (size_t j = 0; j <= 16; j++) {
	uint8_t *pos = varint_encode(buf, j, fixtures[i].in);
	assert((pos == NULL) == (j < fixtures[i].len));
	if (j >= fixtures[i].len) {
	assert(out[0] == fixtures[i].out1 &&
	out[1] == fixtures[i].out2);
	assert((size_t)(pos - buf) == fixtures[i].len);
	}
	}

	// decode
	for (size_t j = 0; j <= 16; j++) {
	const uint8_t *pos = varint_decode(buf, j, &n);
	assert((pos == NULL) == (j < fixtures[i].len));
	if (j >= fixtures[i].len) {
	assert(n == fixtures[i].in);
	assert((size_t)(pos - buf) == fixtures[i].len);
	}
	}
	}

	size_t mb = 1;
	size_t size = mb * 1024 * 1024;
	size_t count = size / sizeof(uint64_t);
	uint64_t *input = calloc(1, size);
	uint64_t *copy = calloc(1, size);
	uint8_t *packed = calloc(count, varint_max_len);
	assert(input && copy && packed);

	size_t max_bytes = 8;
	size_t packed_len = 0;
	for (size_t i = 0; i < count; i++) {
	size_t bytes = (size_t)rand() % max_bytes;
	for (size_t j = 0; j < bytes; j++) {
	input[i] <<= 8;
	input[i] \|= (uint64_t)rand() & 0xFF;
	}
	packed_len += varint_write_len(input[i]);
	}

	uint8_t *pos =
	varint_array_encode(input, count, packed, count * varint_max_len);
	assert(pos && pos == packed + packed_len);

	assert(varint_array_count(packed, packed_len) == (int64_t)count);

	size_t out_len = 0;
	const uint8_t *decode_pos =
	varint_array_decode(packed, packed_len, copy, count, &out_len);
	assert(decode_pos && decode_pos == packed + packed_len);
	assert(out_len == count);
	assert(memcmp(input, copy, size) == 0);
	}