Skip to content

Instantly share code, notes, and snippets.

@markpapadakis
Last active August 29, 2015 14:09
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markpapadakis/f985d05710d0819250ad to your computer and use it in GitHub Desktop.
Save markpapadakis/f985d05710d0819250ad to your computer and use it in GitHub Desktop.
// A simple LUT backed index for faster binary search, with LUT partition encoding
// A bloom filter can be attached to it, for when you have many, many values - though in practice
// it is rarely needed, especially if the bits(resolution) is over 16
//
// You need (1 << bits) * sizeof(T::key) for the lut, e.g if T::key is uint32_t, for a 16bits LUT, that's
// 262k. Increasing resolution results in higher lookup efficiency, reducing it results in lower memory requirements
template<typename T>
static __attribute__((always_inline)) constexpr T MSB(const T n, const uint8_t span)
{
return n >> ((sizeof(T) * 8) - span);
}
template<typename T>
struct Index
{
Vector<T> list;
using key_t = decltype(T::key);
static constexpr key_t keyMax = (key_t)(uint32_t)((1U << (sizeof(key_t) * 8)) - 1);
// TODO: provide sizeBase in constructor or default to this computed value
static constexpr key_t sizeBase = (key_t)(uint32_t)((1U << ((sizeof(key_t) * 8) - 1)) - 1);
key_t *lut;
const uint8_t res;
const uint32_t lutSize;
uint8_t padding = 0;
Index(const uint8_t r)
: res{r}, lutSize{1U<<r}
{
lut = (key_t *)malloc(sizeof(key_t) * lutSize);
for (uint32_t i = 0; i != lutSize; ++i)
lut[i] = keyMax;
}
const T *Lookup(const key_t key) const
{
const auto lutIndex = MSB(((uint32_t)key) << padding, res);
const auto v = lut[lutIndex];
if (v >= sizeBase)
return NULL;
int32_t btm = v;
int32_t top;
if (lutIndex == lutSize - 1)
{
top = list.size() - 1;
}
else
{
const auto next = lut[lutIndex + 1];
if (next >= sizeBase)
{
const auto size = next - sizeBase;
top = (btm + size) - 1;
}
else
{
top = next - 1;
}
}
const auto *const all = list.Values();
// Binary search in [btm, top]
while (btm <= top)
{
const int32_t mid = (btm + top) / 2;
const auto *const r = all + mid;
if (key == r->key)
return r;
else if (key < r->key)
top = mid - 1;
else
btm = mid + 1;
}
return NULL;
}
void Commit(void) // Make sure list[] is sorted
{
const auto cnt = list.Size();
if (!cnt)
return;
const auto maxKey = list.LastValue().key;
const auto *const all = list.Values();
auto *const lutPartitionSize = (key_t *)calloc(lutSize, sizeof(key_t));
padding = __builtin_clz(maxKey);
for (uint32_t i = 0; i != cnt; )
{
const auto k = (uint32_t)all[i].key;
const auto a = k << padding;
const uint32_t lutIndex = MSB(a, res);
const auto base = i;
for (lut[lutIndex] = i, ++i; i != cnt && MSB(all[i].key << padding, res) == lutIndex; ++i)
continue;
lutPartitionSize[lutIndex] = i - base;
}
// Patch lut[].
// No value may have been mapped to lut[index + 1]; we need to account for that
// In those cases, encode lut[index] partition size into lut[index + 1]
const auto last = lutSize - 1;
for (uint32_t i = 0; i < last; )
{
if (lut[i] != keyMax && lut[i + 1] == keyMax)
{
lut[i + 1] = lutPartitionSize[i] + sizeBase;
i+=2;
}
else
++i;
}
free(lutPartitionSize);
}
};
struct document
{
uint32_t key;
uint8_t type;
uint32_t owner;
};
int main(int argc, char *argv[])
{
Index<document> index(12);
// Add all documents in index.list[]
// Make sure it's sorted when done, e.g
// std::sort(index.list.begin(), index.list.end(), [](const document &d1, const document &d2) { return d1.key < d2.key; });
index.Commit();
auto *const res = index.Lookup(8100);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment