Skip to content

Instantly share code, notes, and snippets.

@dpzmick
Created February 9, 2019 17:39
Show Gist options
  • Save dpzmick/c86e4a090b12028e24eeb84bb90dd585 to your computer and use it in GitHub Desktop.
Save dpzmick/c86e4a090b12028e24eeb84bb90dd585 to your computer and use it in GitHub Desktop.
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
/*
0 when: 0 <= x < rs
|-----------| 0 <= y < rs
| 0 | 1 | 1 when: rs <= x < 2*rs
|-----------| 0 <= y < rs
| 2 | 3 | 2 when: 0 <= x < rs
------------| rs <= y < rs
3 when: rs <= x < rs
rs <= y < rs
Notice that:
----------- first bit 0 when: 0 <= y < rs
| 00 | 01 | second bit 0 when: 0 <= x < rs
----------
| 10 | 11 |
------------
So all movement in the grid can be done branch free
*/
#define X_MASK ((0b01))
#define Y_MASK ((0b10))
#define LEAF_FLAG ((void*)-1) /* never a valid pointer on linux */
#define prod_assert(expr) ((expr) ? (void)(0) : abort())
#ifdef DEBUG_VERBOSE
static size_t depth = 0;
static char *
deep_string()
{
static char buffer[1024];
memset(buffer, 0, sizeof(buffer));
for (size_t i = 0; i < depth; ++i) buffer[i] = ' ';
return buffer;
}
#define DBG_PRINTF(fmt, ...) printf("%s" fmt "\n", deep_string(), __VA_ARGS__);
#define QT_ENTER do { depth += 1; } while (0)
#define QT_RETURN(...) do { depth -= 1; return __VA_ARGS__; } while (0)
#else
#define DBG_PRINTF(...)
#define QT_ENTER
#define QT_RETURN(...) do { return __VA_ARGS__; } while (0)
#endif
typedef struct coord coord_t;
typedef struct qt_set qt_set_t;
typedef struct qt_node qt_node_t;
static inline bool __attribute__((unused))
is_pow2(size_t n) { return __builtin_popcount(n) == 1; }
/* -------------------------------------------------------------------------- */
struct __attribute__((packed)) coord {
size_t x;
size_t y;
};
/* QuadTree set node. Always assumes anchored at (0, 0).
When a node is a leaf node (signaled by LEAF_FLAG in child[0]), child[1] and
child[2] indicate what value the node is holding. In this QuadTree, a leaf
holds either exactly one value, or more children
Region size does not need to be stored more than once. The exterior wrapper
object stores the max region size, and it is computed down the recursion.
Should consider allowing a node to hold a filled range, e.g. if (0,0) ->
(1,1) is fully populated, we should only need one QT node. This can be done
without changing the size of the struct, we just need to use another bit
somewhere for a flag.
In theory, node can be one of:
1) Normal node with 4 ptrs
2) Leaf node with 1 coordinate
3) Leaf node with region extending for some direction in x, some direction in
y (non-square), rooted at some root. This would need a coordinate (2
size_t), an anchor (2 size_t).
4) A variant of (3) that says "entire cell satisfied" but does not allow you
to specify a parietal region that is satisfied. i.e. the entire region
from (0,0)->region_size is satisfied.
Without mode (3), we need: 48*4 ("64 bit" ptrs) = 192 bits (but actually
using 256). That's two nodes per cache line, if the nodes are arranged
properly (see last paragraph).
We can support modes (2) and (3) by putting flag pointers in children[0],
since there's a large number of pointers that are unused on a 64 bit
platform.
There's also space to add in another pointer for an associative set (this
would require storing it in the unused parts of the 64 bit pointer, then
masking the pointers before use).
If we add mode (3), we need: at least 1 bit of flag somewhere, or another
flag ptr in children[0] + 2*64 (anchor point which might be non-zero) + 2*64
(length/width or region) = 256 + 1 extra bit of flagging. We might be able to
play a game where we allocate this type of node from a specific pool, then
use some "is this pointer in range" test for the flag, or some other game.
Probably don't need a full 64 bits for anchor or length/width ever, so there
might be some savings available that would let us fit all 4 types into a 32
byte node.
Making type (3) nodes associative would mean we'd need another void* for each
contiguous region of points, so one more void* per node. Would probably want
to enforce the policy that a node of type (3) is only created for a regions
where all points in the region are associated with the same void*.
If a region of contigous points is "added to the set of keys", but subregions
in that region are associated with different void*'s, the region should be
split into multiple leaves anyway for programmer sanity. Despite this
conceptual simplicifaction, nodes of type (3) would still require additional
bits from somewhere to store the void*.
Insert and delete should compress the QuadTree so that we maximize the amount
of (2), (3), and (4) nodes in the tree. This isn't yet implemented.
Finally, these nodes should be allocated in a cache efficient manner.
That probably means grabbing a slab of them all at once (even if you aren't
sure you need them), then allocating children of a node out of the same slab
as the node. If the nodes can be nicely aligned (say, they are all 256 bits),
then it's trivial to figure out when you've run out of slab, what node goes
where in the slab, etc... There's a memory tradeoff if you do this because
you might overallocate, but if the nodes are small enough, that's probably
okay in most cases. */
struct qt_node {
qt_node_t* children[4];
};
struct qt_set {
size_t region_size;
qt_node_t root[1];
};
static inline bool
_qts_node_empty(qt_node_t const* node) { QT_ENTER;
static_assert(sizeof(node->children) == sizeof(__m256i), "!"); // array sizeof ok
__m256i maybe_zero = _mm256_loadu_si256((__m256i*)node->children);
bool ret = 0 != _mm256_testz_si256(maybe_zero, maybe_zero);
/* _mm256_testz_si256:
if zf is set, returns non-zero.
zf will be set when the and of the arguments results in all zero bits */
if (ret) { // seems to be getting properly removed in prod build
assert(node->children[0] == NULL);
assert(node->children[1] == NULL);
assert(node->children[2] == NULL);
assert(node->children[3] == NULL);
}
QT_RETURN(ret);
}
static inline size_t
_qts_compute_child_idx(size_t region_size, coord_t const* coord) {
return ((uint8_t)(coord->y >= region_size/2) << 1)
| ((uint8_t)(coord->x >= region_size/2));
}
static inline void
_qts_apply_offset_coord(size_t idx, size_t region_size, coord_t* coord) {
/* ensure we won't underflow the coordinate */
assert((idx & X_MASK) ? (coord->x <= region_size) : true);
assert((idx & Y_MASK) ? (coord->y <= region_size) : true);
/* idx is hard to predict, should probably force this to be branch free,
despite overeager branching compilers. As written, this compiles to
branches. */
size_t x_off = (idx & X_MASK) ? region_size : 0;
size_t y_off = (idx & Y_MASK) ? region_size : 0;
coord->x -= x_off;
coord->y -= y_off;
}
static inline bool
_qts_has(qt_node_t const* root, size_t region_size, coord_t coord[1]) { QT_ENTER;
DBG_PRINTF("searching for x=%zu y=%zu", coord->x, coord->y);
/* Check a bunch of invariant in debug mode */
assert(coord->x < region_size || region_size == 1);
assert(coord->y < region_size || region_size == 1);
assert(is_pow2(region_size));
assert(region_size == 1 ? (root->children[0] == LEAF_FLAG) : true);
if (root->children[0] == LEAF_FLAG) {
DBG_PRINTF("searching for x=%zu y=%zu in leaf with x=%zu y=%zu",
coord->x, coord->y,
(size_t)root->children[1], (size_t)root->children[2]);
#if 0
QT_RETURN(root->children[1] == (void*)coord->x
&& root->children[2] == (void*)coord->y);
#endif
/* Equivalent:
- coord is packed, so its a 128 bit value.
- pointers are both 64 bit values, must be contiguous when in array. */
QT_RETURN(0 == memcmp(root->children + 1, coord, sizeof(*coord)));
}
size_t idx = _qts_compute_child_idx(region_size, coord);
if (!root->children[idx]) QT_RETURN(false);
_qts_apply_offset_coord(idx, region_size/2, coord);
bool ret = _qts_has(root->children[idx], region_size/2, coord);
DBG_PRINTF("done searching for x=%zu y=%zu, ret=%d", coord->x, coord->y, ret);
QT_RETURN(ret); /* Tail call with printing turned off */
}
static inline void
_qts_insert(qt_node_t* root, size_t region_size, coord_t coord[1]) { QT_ENTER;
DBG_PRINTF("inserting x=%zu y=%zu at %p", coord->x, coord->y, root);
/* Check all of the invariant */
assert(region_size >= 1);
assert(coord->x < region_size || region_size == 1);
assert(coord->y < region_size || region_size == 1);
assert(is_pow2(region_size));
/* If we have no children (all NULL), we can become a leaf node.
If we've hit the max-depth of the tree, we have a leaf node and can go no
further. Add the element here. It is guaranteed to fit because it was in
range on the outermost node in this quad tree. */
if (region_size == 1 || _qts_node_empty(root)) {
DBG_PRINTF("inserting complete, used leaf rs=%zu, empty=%d, %p",
region_size, _qts_node_empty(root), root);
root->children[0] = LEAF_FLAG;
root->children[1] = (void*)coord->x;
root->children[2] = (void*)coord->y;
QT_RETURN();
}
/* If we are inserting into a leaf, we're gonna have to put that somewhere */
if (root->children[0] == LEAF_FLAG) {
coord_t coord[1];
coord->x = (size_t)root->children[1];
coord->y = (size_t)root->children[2];
DBG_PRINTF("leaf promotion of %zu %zu", coord->x, coord->y);
/* Reset the data region */
memset(root->children, 0, sizeof(root->children));
size_t idx = _qts_compute_child_idx(region_size, coord);
root->children[idx] = calloc(1, sizeof(*root->children[idx]));
// FIXME check if allocation success
_qts_apply_offset_coord(idx, region_size/2, coord);
_qts_insert(root->children[idx], region_size, coord);
DBG_PRINTF("leaf promotion of %zu %zu complete", coord->x, coord->y);
}
size_t idx = _qts_compute_child_idx(region_size, coord);
if (!root->children[idx]) {
root->children[idx] = calloc(1, sizeof(*root->children[idx]));
// FIXME check if allocation success
}
_qts_apply_offset_coord(idx, region_size/2, coord);
_qts_insert(root->children[idx], region_size/2, coord);
DBG_PRINTF("insert of x=%zu y=%zu complete", coord->x, coord->y);
QT_RETURN();
// path compression w/ lookahead? I've just completely filled up a child, this
// node should be collapsed
}
/* -------------------------------------------------------------------------- */
bool
qts_insert(qt_set_t* set, coord_t coord[1]) {
if (coord->x > set->region_size) return false;
if (coord->y > set->region_size) return false;
_qts_insert(set->root, set->region_size, coord);
return true;
}
bool
qts_has(qt_set_t* set, coord_t coord[1]) {
return _qts_has(set->root, set->region_size, coord);
}
void
qts_remove(qt_set_t* root, coord_t coord[1]) {
(void)root;
(void)coord;
prod_assert(false); // reimplement this
// should do path compression and dallocation
}
int main()
{
qt_set_t* qt = calloc(1, sizeof(*qt)); // should probaly be aligned for the avx trick
qt->region_size = 2;
prod_assert(qts_insert(qt, &(coord_t){.x = 0, .y = 0}));
prod_assert(qts_insert(qt, &(coord_t){.x = 0, .y = 1}));
prod_assert(qts_insert(qt, &(coord_t){.x = 1, .y = 0}));
prod_assert(qts_insert(qt, &(coord_t){.x = 1, .y = 1}));
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0}));
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 1}));
prod_assert(qts_has(qt, &(coord_t){.x = 1, .y = 0}));
prod_assert(qts_has(qt, &(coord_t){.x = 1, .y = 1}));
qt = calloc(1, sizeof(*qt)); // should probaly be aligned for the avx trick
qt->region_size = 16;
printf("--- adding 0,0\n");
qts_insert(qt, &(coord_t){.x = 0, .y = 0});
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0}));
prod_assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1}));
printf("--- adding 0,1\n");
qts_insert(qt, &(coord_t){.x = 0, .y = 1});
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 1}));
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0}));
// printf("--- remove 0,1\n");
// qts_remove(qt, &(coord_t){.x = 0, .y = 1});
// assert(qts_has(qt, &(coord_t){.x = 0, .y = 0}));
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1}));
// printf("--- remove 0,0\n");
// qts_remove(qt, &(coord_t){.x = 0, .y = 0});
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0}));
// assert(!qts_has(qt, &(coord_t){.x = 1, .y = 1}));
// printf("--- adding 7,7\n");
// qts_insert(qt, &(coord_t){.x = 7, .y = 7});
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0}));
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1}));
// assert(qts_has(qt, &(coord_t){.x = 7, .y = 7}));
// printf("--- remove 7,7\n");
// qts_remove(qt, &(coord_t){.x = 7, .y = 7});
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0}));
// assert(!qts_has(qt, &(coord_t){.x = 1, .y = 1}));
// assert(!qts_has(qt, &(coord_t){.x = 7, .y = 7}));
free(qt);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment