Skip to content

Instantly share code, notes, and snippets.

@graphitemaster
Created November 13, 2012 06:54
Show Gist options
  • Save graphitemaster/4064388 to your computer and use it in GitHub Desktop.
Save graphitemaster/4064388 to your computer and use it in GitHub Desktop.
#include <string.h>
#define BLOCK_ITEMS 16 /* largest alloc 262272 bytes */
/*
* This is a blockade allocator, a new idea in the world of allocators for small short
* lived objects. This allocator creates a table of BLOCK_ITEM chunks, which all contain
* pieces of memory that are allocated for N instances (by blockade_init(N)). Unlike a
* traditional block allocator we use integer log base 2 to map the byte-size to a linear
* sequence: e.g
* 8 bytes = 0
* 16 bytes = 1
* 32 bytes = 2
* 64 bytes = 3 (etc etc)
*
* With the correct mapping we can index into the table with O(1) complexity. Giving us the
* memory required, then incrementing the current position to allow us to get the next item
* in the list for that byte size.
*
* On free we simply copy the address of the pointer into a table with the same layout as the
* chunk table. Using this table (also called a freelist) the allocator can determine if something
* is freed and reuse the existing memory. Since everything works like a basic stack it's easy to
* branchlessly allocate and deallocate.
*
* We use some optimizations like masking pointers so they're either 0 or (cur-last), so when we
* simply add last you either get cur or last (without a branch). Some other optimizations the
* allocator implements is absolute inlinability of integer log base 2 calculation (using a fast
* dejbruin lookup sequence) and some bitwise tricks (via a macro). We also round up all
* allocations to the next power of two using a macro.
*
* With everything implemented the way it is we benefit from O(1) in both time and space complexity
* for both allocations and free's (entirely branchless). The cost for initialization however is
* O(log(N)). The allocator destruction is also only O(1)
*
* The allocator performs a single allocation for everything so all data is contiguous as possible
* for cache friendliness. There is no gaps in-between allocations (other than the tiny int before
* the data to store the size). There is also no holes in allocations either, we make use of the
* space efficiently.
*
* Traditionally speaking I've never seen anything like this before. If something has been done
* of this nature I'd like to know. But for the most part this is MY invention (since freethinking
* is what allowed me to vision it).
*/
#define SIZE_JMP2DATA(X) ((char*)(X) + sizeof(int))
#define SIZE_DEC2DATA(X) ((char*)(X) - sizeof(int))
#define SIZE_GETAFTER(X) *((int*)(SIZE_DEC2DATA(X)))
#define SIZE_GETLATER(X) *((int*)(X))
#define LOGBASE2(X) blockade_logtable[((size_t)((X) * 0x077CB531U) >> 0x1B) & 0x1F] - 3; /* minus three since we map from 8 not 4 */
#define ROUNDPOT(X) X--,X|=X>>1,X|=X>>2,X|=X>>4,X|=X>>8,X|=X>>16,X++
#define MAPMEM(T,X) (T)raw; raw += X
#define MAKEIMPL(X) ((blockade_i*)(X))
#define MAKETYPE(X) ((blockade_t*)(X))
/* __builtin_ctz replacement table for integer log base 2*/
static const int blockade_logtable[0x20] = {
0x00, 0x01, 0x1C, 0x02, 0x0D, 0x0E, 0x18, 0x03,
0x1E, 0x16, 0x14, 0x0F, 0x19, 0x11, 0x04, 0x08,
0x1F, 0x1B, 0x0D, 0x17, 0x15, 0x13, 0x10, 0x07,
0x1A, 0x0C, 0x12, 0x06, 0x0B, 0x05, 0x0A, 0x09
};
/* actual allocator, none of this needs changing */
struct blockade_T; /* incomplete type used as handle to blockade_i (impl) */
typedef struct blockade_T blockade_t;
typedef struct {
void **blocks[BLOCK_ITEMS];
void **freeme[BLOCK_ITEMS];
size_t chunks[BLOCK_ITEMS];
size_t indexs[BLOCK_ITEMS];
size_t pieces;
void *buffer;
} blockade_i;
blockade_t *blockade_init(size_t instances) {
size_t i,j;
size_t bytes = 4;
char *raw = NULL;
blockade_i *b = NULL;
/* Round instances to POT to keep malloc happy (could get lucky and hit an alignment :3) */
ROUNDPOT(instances);
if (!(raw = (char*)malloc(sizeof(blockade_i) + (((sizeof(void*) * instances) << 1) * BLOCK_ITEMS) + ((((8 << (BLOCK_ITEMS + 1)) - 1) & ~7) * instances))))
perror("\n:[blockade] failed to allocate memory\n");
b = MAPMEM(blockade_i*, sizeof(blockade_i));
b->pieces = instances;
b->buffer = &raw[-sizeof(blockade_i)]; /* -sizeof(blockade_i) because MAPMEM increments raw */
/* Map memory for pointers to memory for blocks */
for (i = 0; i < BLOCK_ITEMS; i++, bytes <<= 1) {
b->blocks[i] = MAPMEM(void** ,sizeof(void*) * instances);
b->freeme[i] = MAPMEM(void**, sizeof(void*) * instances);
/* Map memory to blocks in current item */
for (j = 0; j < instances; j++)
b->blocks[i][j] = MAPMEM(void*, bytes);
}
memset(
/* skip blocks and freeme */
((char*)b) + ((sizeof(void**) * BLOCK_ITEMS) << 1),
/* nullify */
0,
/* calculate size of:
* size_t chunks[BLOCK_ITEMS];
* size_t indexs[BLOCK_ITEMS];
* size_t pieces;
* void *buffer;
*/
((sizeof(size_t) * BLOCK_ITEMS) << 1) + sizeof(size_t) + sizeof(void*)
);
return MAKETYPE(b);
}
void *blockade_alloc(blockade_t *i, size_t bytes) {
void *ret = NULL;
void *chk = NULL;
void *lst = NULL;
size_t idx = 0;
blockade_i *b = MAKEIMPL(i);
/* STORE for less indirection */
size_t *indexs = &b->indexs[idx];
size_t *chunks = &b->chunks[idx];
/* additional space for size and round to POT */
bytes += sizeof(int);
ROUNDPOT(bytes);
idx = LOGBASE2(bytes);
lst = b->freeme[idx][*indexs];
chk = b->blocks[idx][*chunks];
/* branchless masking magic bitchs ;-) */
#if 1
ret = (((!! lst-1 ) & (((char*)chk)-((char*)lst))) + ((char*)lst));
#else
/* same as this */
ret = (lst) ? lst : chk;
#endif
*indexs -= !!lst;
*chunks += !!chk;
SIZE_GETLATER(ret) = bytes;
return SIZE_JMP2DATA(ret);
}
void blockade_free(blockade_t *i, void *ptr) {
blockade_i *b = MAKEIMPL(i);
size_t v = LOGBASE2(SIZE_GETAFTER(ptr));
b->freeme[v][b->indexs[v]] = SIZE_DEC2DATA(ptr);
b->indexs[v] ++;
}
void blockade_destroy(blockade_i *b) {
free(MAKEIMPL(b)->buffer); /* will destroy b too */
}
#undef SIZE_JMP2DATA
#undef SIZE_DEC2DATA
#undef SIZE_GETAFTER
#undef SIZE_GETLATER
#undef LOGBASE2
#undef ROUNDPOT
#undef MAPMEM
#undef MAKEIMPL
#undef MAKETYPE
int main() {
blockade_t *b = blockade_init(21); /* will round POt to 32 */
int *z = blockade_alloc(b, sizeof(int));
#if 0
char *p = blockade_alloc(b, 55); /* 55 + sizeof(int)[4] = 59 (POT round to 64) */
blockade_free(b, p);
char *e = blockade_alloc(b, 54); /* same as last one (will reuse mem block) */
blockade_free(b, e);
void *f = blockade_alloc(b, 999); /* 1024 alloc */
blockade_free(b, z);
blockade_free(b, f);
blockade_destroy(b);
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment