Created
February 9, 2019 17:39
-
-
Save dpzmick/c86e4a090b12028e24eeb84bb90dd585 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <x86intrin.h> | |
/* | |
0 when: 0 <= x < rs | |
|-----------| 0 <= y < rs | |
| 0 | 1 | 1 when: rs <= x < 2*rs | |
|-----------| 0 <= y < rs | |
| 2 | 3 | 2 when: 0 <= x < rs | |
------------| rs <= y < rs | |
3 when: rs <= x < rs | |
rs <= y < rs | |
Notice that: | |
----------- first bit 0 when: 0 <= y < rs | |
| 00 | 01 | second bit 0 when: 0 <= x < rs | |
---------- | |
| 10 | 11 | | |
------------ | |
So all movement in the grid can be done branch free | |
*/ | |
#define X_MASK ((0b01)) | |
#define Y_MASK ((0b10)) | |
#define LEAF_FLAG ((void*)-1) /* never a valid pointer on linux */ | |
#define prod_assert(expr) ((expr) ? (void)(0) : abort()) | |
#ifdef DEBUG_VERBOSE | |
static size_t depth = 0; | |
static char * | |
deep_string() | |
{ | |
static char buffer[1024]; | |
memset(buffer, 0, sizeof(buffer)); | |
for (size_t i = 0; i < depth; ++i) buffer[i] = ' '; | |
return buffer; | |
} | |
#define DBG_PRINTF(fmt, ...) printf("%s" fmt "\n", deep_string(), __VA_ARGS__); | |
#define QT_ENTER do { depth += 1; } while (0) | |
#define QT_RETURN(...) do { depth -= 1; return __VA_ARGS__; } while (0) | |
#else | |
#define DBG_PRINTF(...) | |
#define QT_ENTER | |
#define QT_RETURN(...) do { return __VA_ARGS__; } while (0) | |
#endif | |
typedef struct coord coord_t; | |
typedef struct qt_set qt_set_t; | |
typedef struct qt_node qt_node_t; | |
static inline bool __attribute__((unused)) | |
is_pow2(size_t n) { return __builtin_popcount(n) == 1; } | |
/* -------------------------------------------------------------------------- */ | |
struct __attribute__((packed)) coord { | |
size_t x; | |
size_t y; | |
}; | |
/* QuadTree set node. Always assumes anchored at (0, 0). | |
When a node is a leaf node (signaled by LEAF_FLAG in child[0]), child[1] and | |
child[2] indicate what value the node is holding. In this QuadTree, a leaf | |
holds either exactly one value, or more children | |
Region size does not need to be stored more than once. The exterior wrapper | |
object stores the max region size, and it is computed down the recursion. | |
Should consider allowing a node to hold a filled range, e.g. if (0,0) -> | |
(1,1) is fully populated, we should only need one QT node. This can be done | |
without changing the size of the struct, we just need to use another bit | |
somewhere for a flag. | |
In theory, node can be one of: | |
1) Normal node with 4 ptrs | |
2) Leaf node with 1 coordinate | |
3) Leaf node with region extending for some direction in x, some direction in | |
y (non-square), rooted at some root. This would need a coordinate (2 | |
size_t), an anchor (2 size_t). | |
4) A variant of (3) that says "entire cell satisfied" but does not allow you | |
to specify a parietal region that is satisfied. i.e. the entire region | |
from (0,0)->region_size is satisfied. | |
Without mode (3), we need: 48*4 ("64 bit" ptrs) = 192 bits (but actually | |
using 256). That's two nodes per cache line, if the nodes are arranged | |
properly (see last paragraph). | |
We can support modes (2) and (3) by putting flag pointers in children[0], | |
since there's a large number of pointers that are unused on a 64 bit | |
platform. | |
There's also space to add in another pointer for an associative set (this | |
would require storing it in the unused parts of the 64 bit pointer, then | |
masking the pointers before use). | |
If we add mode (3), we need: at least 1 bit of flag somewhere, or another | |
flag ptr in children[0] + 2*64 (anchor point which might be non-zero) + 2*64 | |
(length/width or region) = 256 + 1 extra bit of flagging. We might be able to | |
play a game where we allocate this type of node from a specific pool, then | |
use some "is this pointer in range" test for the flag, or some other game. | |
Probably don't need a full 64 bits for anchor or length/width ever, so there | |
might be some savings available that would let us fit all 4 types into a 32 | |
byte node. | |
Making type (3) nodes associative would mean we'd need another void* for each | |
contiguous region of points, so one more void* per node. Would probably want | |
to enforce the policy that a node of type (3) is only created for a regions | |
where all points in the region are associated with the same void*. | |
If a region of contigous points is "added to the set of keys", but subregions | |
in that region are associated with different void*'s, the region should be | |
split into multiple leaves anyway for programmer sanity. Despite this | |
conceptual simplicifaction, nodes of type (3) would still require additional | |
bits from somewhere to store the void*. | |
Insert and delete should compress the QuadTree so that we maximize the amount | |
of (2), (3), and (4) nodes in the tree. This isn't yet implemented. | |
Finally, these nodes should be allocated in a cache efficient manner. | |
That probably means grabbing a slab of them all at once (even if you aren't | |
sure you need them), then allocating children of a node out of the same slab | |
as the node. If the nodes can be nicely aligned (say, they are all 256 bits), | |
then it's trivial to figure out when you've run out of slab, what node goes | |
where in the slab, etc... There's a memory tradeoff if you do this because | |
you might overallocate, but if the nodes are small enough, that's probably | |
okay in most cases. */ | |
struct qt_node { | |
qt_node_t* children[4]; | |
}; | |
struct qt_set { | |
size_t region_size; | |
qt_node_t root[1]; | |
}; | |
static inline bool | |
_qts_node_empty(qt_node_t const* node) { QT_ENTER; | |
static_assert(sizeof(node->children) == sizeof(__m256i), "!"); // array sizeof ok | |
__m256i maybe_zero = _mm256_loadu_si256((__m256i*)node->children); | |
bool ret = 0 != _mm256_testz_si256(maybe_zero, maybe_zero); | |
/* _mm256_testz_si256: | |
if zf is set, returns non-zero. | |
zf will be set when the and of the arguments results in all zero bits */ | |
if (ret) { // seems to be getting properly removed in prod build | |
assert(node->children[0] == NULL); | |
assert(node->children[1] == NULL); | |
assert(node->children[2] == NULL); | |
assert(node->children[3] == NULL); | |
} | |
QT_RETURN(ret); | |
} | |
static inline size_t | |
_qts_compute_child_idx(size_t region_size, coord_t const* coord) { | |
return ((uint8_t)(coord->y >= region_size/2) << 1) | |
| ((uint8_t)(coord->x >= region_size/2)); | |
} | |
static inline void | |
_qts_apply_offset_coord(size_t idx, size_t region_size, coord_t* coord) { | |
/* ensure we won't underflow the coordinate */ | |
assert((idx & X_MASK) ? (coord->x <= region_size) : true); | |
assert((idx & Y_MASK) ? (coord->y <= region_size) : true); | |
/* idx is hard to predict, should probably force this to be branch free, | |
despite overeager branching compilers. As written, this compiles to | |
branches. */ | |
size_t x_off = (idx & X_MASK) ? region_size : 0; | |
size_t y_off = (idx & Y_MASK) ? region_size : 0; | |
coord->x -= x_off; | |
coord->y -= y_off; | |
} | |
static inline bool | |
_qts_has(qt_node_t const* root, size_t region_size, coord_t coord[1]) { QT_ENTER; | |
DBG_PRINTF("searching for x=%zu y=%zu", coord->x, coord->y); | |
/* Check a bunch of invariant in debug mode */ | |
assert(coord->x < region_size || region_size == 1); | |
assert(coord->y < region_size || region_size == 1); | |
assert(is_pow2(region_size)); | |
assert(region_size == 1 ? (root->children[0] == LEAF_FLAG) : true); | |
if (root->children[0] == LEAF_FLAG) { | |
DBG_PRINTF("searching for x=%zu y=%zu in leaf with x=%zu y=%zu", | |
coord->x, coord->y, | |
(size_t)root->children[1], (size_t)root->children[2]); | |
#if 0 | |
QT_RETURN(root->children[1] == (void*)coord->x | |
&& root->children[2] == (void*)coord->y); | |
#endif | |
/* Equivalent: | |
- coord is packed, so its a 128 bit value. | |
- pointers are both 64 bit values, must be contiguous when in array. */ | |
QT_RETURN(0 == memcmp(root->children + 1, coord, sizeof(*coord))); | |
} | |
size_t idx = _qts_compute_child_idx(region_size, coord); | |
if (!root->children[idx]) QT_RETURN(false); | |
_qts_apply_offset_coord(idx, region_size/2, coord); | |
bool ret = _qts_has(root->children[idx], region_size/2, coord); | |
DBG_PRINTF("done searching for x=%zu y=%zu, ret=%d", coord->x, coord->y, ret); | |
QT_RETURN(ret); /* Tail call with printing turned off */ | |
} | |
static inline void | |
_qts_insert(qt_node_t* root, size_t region_size, coord_t coord[1]) { QT_ENTER; | |
DBG_PRINTF("inserting x=%zu y=%zu at %p", coord->x, coord->y, root); | |
/* Check all of the invariant */ | |
assert(region_size >= 1); | |
assert(coord->x < region_size || region_size == 1); | |
assert(coord->y < region_size || region_size == 1); | |
assert(is_pow2(region_size)); | |
/* If we have no children (all NULL), we can become a leaf node. | |
If we've hit the max-depth of the tree, we have a leaf node and can go no | |
further. Add the element here. It is guaranteed to fit because it was in | |
range on the outermost node in this quad tree. */ | |
if (region_size == 1 || _qts_node_empty(root)) { | |
DBG_PRINTF("inserting complete, used leaf rs=%zu, empty=%d, %p", | |
region_size, _qts_node_empty(root), root); | |
root->children[0] = LEAF_FLAG; | |
root->children[1] = (void*)coord->x; | |
root->children[2] = (void*)coord->y; | |
QT_RETURN(); | |
} | |
/* If we are inserting into a leaf, we're gonna have to put that somewhere */ | |
if (root->children[0] == LEAF_FLAG) { | |
coord_t coord[1]; | |
coord->x = (size_t)root->children[1]; | |
coord->y = (size_t)root->children[2]; | |
DBG_PRINTF("leaf promotion of %zu %zu", coord->x, coord->y); | |
/* Reset the data region */ | |
memset(root->children, 0, sizeof(root->children)); | |
size_t idx = _qts_compute_child_idx(region_size, coord); | |
root->children[idx] = calloc(1, sizeof(*root->children[idx])); | |
// FIXME check if allocation success | |
_qts_apply_offset_coord(idx, region_size/2, coord); | |
_qts_insert(root->children[idx], region_size, coord); | |
DBG_PRINTF("leaf promotion of %zu %zu complete", coord->x, coord->y); | |
} | |
size_t idx = _qts_compute_child_idx(region_size, coord); | |
if (!root->children[idx]) { | |
root->children[idx] = calloc(1, sizeof(*root->children[idx])); | |
// FIXME check if allocation success | |
} | |
_qts_apply_offset_coord(idx, region_size/2, coord); | |
_qts_insert(root->children[idx], region_size/2, coord); | |
DBG_PRINTF("insert of x=%zu y=%zu complete", coord->x, coord->y); | |
QT_RETURN(); | |
// path compression w/ lookahead? I've just completely filled up a child, this | |
// node should be collapsed | |
} | |
/* -------------------------------------------------------------------------- */ | |
bool | |
qts_insert(qt_set_t* set, coord_t coord[1]) { | |
if (coord->x > set->region_size) return false; | |
if (coord->y > set->region_size) return false; | |
_qts_insert(set->root, set->region_size, coord); | |
return true; | |
} | |
bool | |
qts_has(qt_set_t* set, coord_t coord[1]) { | |
return _qts_has(set->root, set->region_size, coord); | |
} | |
void | |
qts_remove(qt_set_t* root, coord_t coord[1]) { | |
(void)root; | |
(void)coord; | |
prod_assert(false); // reimplement this | |
// should do path compression and dallocation | |
} | |
int main() | |
{ | |
qt_set_t* qt = calloc(1, sizeof(*qt)); // should probaly be aligned for the avx trick | |
qt->region_size = 2; | |
prod_assert(qts_insert(qt, &(coord_t){.x = 0, .y = 0})); | |
prod_assert(qts_insert(qt, &(coord_t){.x = 0, .y = 1})); | |
prod_assert(qts_insert(qt, &(coord_t){.x = 1, .y = 0})); | |
prod_assert(qts_insert(qt, &(coord_t){.x = 1, .y = 1})); | |
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 1})); | |
prod_assert(qts_has(qt, &(coord_t){.x = 1, .y = 0})); | |
prod_assert(qts_has(qt, &(coord_t){.x = 1, .y = 1})); | |
qt = calloc(1, sizeof(*qt)); // should probaly be aligned for the avx trick | |
qt->region_size = 16; | |
printf("--- adding 0,0\n"); | |
qts_insert(qt, &(coord_t){.x = 0, .y = 0}); | |
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
prod_assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1})); | |
printf("--- adding 0,1\n"); | |
qts_insert(qt, &(coord_t){.x = 0, .y = 1}); | |
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 1})); | |
prod_assert(qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
// printf("--- remove 0,1\n"); | |
// qts_remove(qt, &(coord_t){.x = 0, .y = 1}); | |
// assert(qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1})); | |
// printf("--- remove 0,0\n"); | |
// qts_remove(qt, &(coord_t){.x = 0, .y = 0}); | |
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
// assert(!qts_has(qt, &(coord_t){.x = 1, .y = 1})); | |
// printf("--- adding 7,7\n"); | |
// qts_insert(qt, &(coord_t){.x = 7, .y = 7}); | |
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 1})); | |
// assert(qts_has(qt, &(coord_t){.x = 7, .y = 7})); | |
// printf("--- remove 7,7\n"); | |
// qts_remove(qt, &(coord_t){.x = 7, .y = 7}); | |
// assert(!qts_has(qt, &(coord_t){.x = 0, .y = 0})); | |
// assert(!qts_has(qt, &(coord_t){.x = 1, .y = 1})); | |
// assert(!qts_has(qt, &(coord_t){.x = 7, .y = 7})); | |
free(qt); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment