Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Created July 22, 2024 02:03
Show Gist options
  • Save jweinst1/05c52501776cdc6f58a634764f3aa3f6 to your computer and use it in GitHub Desktop.
Save jweinst1/05c52501776cdc6f58a634764f3aa3f6 to your computer and use it in GitHub Desktop.
A bench mark of a database trie index in C++
#include <cstdio>
#include <cstring>
#include <cstdint>
#include <stdlib.h>
#include <vector>
#include <chrono>
#include <unordered_set>
#include <string>
#include <sys/stat.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
static int fileExists(const char* path) {
struct stat buffer;
return stat(path, &buffer) == 0;
}
static ssize_t getFileSize(const char* path) {
struct stat sb;
if (stat(path, &sb) == 0) {
return sb.st_size;
}
return -1;
}
static void createInitFile(const char* path, size_t size) {
int fd = open(path, O_RDWR | O_CREAT, (mode_t)0600);
pwrite(fd, "", 1, size-1);
close(fd);
}
template<size_t pageSize, size_t rangeSize>
class Trie {
static_assert(pageSize % 2 == 0);
static constexpr size_t blockSize = rangeSize * sizeof(int64_t);
static_assert(pageSize % blockSize == 0);
static constexpr size_t blocksPerPage = pageSize / blockSize;
public:
constexpr size_t getBlocksPerPage() {
return blocksPerPage;
}
constexpr size_t getPageSize() {
return pageSize;
}
Trie(const char* path) {
if (!fileExists(path)) {
createInitFile(path, pageSize * 4);
}
_fd = open(path, O_RDWR | O_CREAT, (mode_t)0600);
_path = path;
loadPages();
initHeader();
}
~Trie() {
close(_fd);
freePages();
}
size_t getPageCount() const {
return _pages.size();
}
void removeFile() {
remove(_path.c_str());
}
char* getPage(size_t at) {
if (at >= _pages.size()) {
addPages((at - _pages.size()) + 1);
}
return _pages[at];
}
int64_t* getHeader() {
return reinterpret_cast<int64_t*>(_pages[0]);
}
int64_t* getRoot() {
return reinterpret_cast<int64_t*>(_pages[1]);
}
int64_t getBlockCount() {
return getHeader()[2];
}
int64_t getBlockCountAndInc() {
int64_t* head = getHeader();
int64_t prev = head[2];
head[2] += 1;
return prev;
}
int64_t* getBlock(int64_t n, int64_t* pageNo = nullptr) {
int64_t pageNumber = (n / blocksPerPage) + 1; // root is at 1
if (pageNo) *pageNo = pageNumber;
int64_t blockInPage = n % blocksPerPage;
return reinterpret_cast<int64_t*>(getPage(pageNumber) + (blockInPage * blockSize));
}
void syncPages(const std::unordered_set<int64_t>& pages) {
for (const auto& page : pages) {
//printf("Syncing the page %lld\n", page);
char* memPage = getPage(page);
pwrite(_fd, memPage, pageSize, page * pageSize);
}
}
void insert(const size_t* seq, size_t size) {
std::unordered_set<int64_t> pagesToSync;
size_t sizeNow = size;
int64_t* cur = getRoot();
while (sizeNow--) {
bool update = false;
if (cur[*seq] == 0) {
cur[*seq] = getBlockCountAndInc();
update = true;
}
int64_t page = 0;
cur = getBlock(cur[*seq], &page);
if (update) pagesToSync.insert(page);
++seq;
}
syncPages(pagesToSync);
}
bool contains(const size_t* seq, size_t size) {
size_t sizeNow = size;
int64_t* cur = getRoot();
while (sizeNow--) {
if (cur[*seq] == 0) {
return false;
}
cur = getBlock(cur[*seq]);
++seq;
}
return true;
}
private:
void initHeader() {
int64_t* head = getHeader();
head[0] = (int64_t)pageSize;
head[1] = (int64_t)rangeSize;
head[2] = 1; // block length
}
void addPages(size_t n) {
size_t curSize = _pages.size() * pageSize;
size_t newSize = curSize + (n * pageSize);
pwrite(_fd, "", 1, newSize-1);
for (size_t i = curSize; i < newSize; i += pageSize) {
char* page = static_cast<char*>(::malloc(pageSize));
pread(_fd, page, pageSize, i);
_pages.push_back(page);
}
}
bool loadPages() {
ssize_t fileSize = getFileSize(_path.c_str());
if (fileSize == -1) {
return false;
}
for (size_t i = 0; i < fileSize; i += pageSize) {
char* page = static_cast<char*>(::malloc(pageSize));
pread(_fd, page, pageSize, i);
_pages.push_back(page);
}
return true;
}
void freePages() {
for (auto& page : _pages) {
::free((void*)page);
}
}
int _fd = -1;
std::string _path;
std::vector<char*> _pages;
};
template <size_t rangeSize, size_t seqSize, size_t seqAmount>
struct KeyGen {
constexpr size_t size() {
return seqSize;
}
KeyGen() {
srand(time(nullptr));
for (size_t i = 0; i < seqAmount; ++i)
{
for (size_t j = 0; j < seqSize; ++j)
{
sequences[i][j] = rand() % rangeSize;
}
}
}
size_t sequences[seqAmount][seqSize];
};
static void writePerfTest() {
static constexpr size_t testSampleSize = 1000000;
static KeyGen<4, 100, testSampleSize> kg;
Trie<4096, 4> tr("perftest");
printf("using blocks per page %zu\n", tr.getBlocksPerPage());
const auto start = std::chrono::steady_clock::now();
for (int i = 0; i < testSampleSize; ++i)
{
tr.insert(kg.sequences[i], kg.size());
}
const auto end = std::chrono::steady_clock::now();
const auto timeSpent = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
printf("%s took %lluus\n", __FUNCTION__, timeSpent.count());
printf("Total pages used %zu, total size %zu\n", tr.getPageCount(), tr.getPageCount() * tr.getPageSize());
tr.removeFile();
}
int main(int argc, char const *argv[])
{
writePerfTest();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment