Created
July 22, 2024 02:03
-
-
Save jweinst1/05c52501776cdc6f58a634764f3aa3f6 to your computer and use it in GitHub Desktop.
A bench mark of a database trie index in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <cstring> | |
#include <cstdint> | |
#include <stdlib.h> | |
#include <vector> | |
#include <chrono> | |
#include <unordered_set> | |
#include <string> | |
#include <sys/stat.h> | |
#include <assert.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <unistd.h> | |
#include <time.h> | |
static int fileExists(const char* path) { | |
struct stat buffer; | |
return stat(path, &buffer) == 0; | |
} | |
static ssize_t getFileSize(const char* path) { | |
struct stat sb; | |
if (stat(path, &sb) == 0) { | |
return sb.st_size; | |
} | |
return -1; | |
} | |
static void createInitFile(const char* path, size_t size) { | |
int fd = open(path, O_RDWR | O_CREAT, (mode_t)0600); | |
pwrite(fd, "", 1, size-1); | |
close(fd); | |
} | |
template<size_t pageSize, size_t rangeSize> | |
class Trie { | |
static_assert(pageSize % 2 == 0); | |
static constexpr size_t blockSize = rangeSize * sizeof(int64_t); | |
static_assert(pageSize % blockSize == 0); | |
static constexpr size_t blocksPerPage = pageSize / blockSize; | |
public: | |
constexpr size_t getBlocksPerPage() { | |
return blocksPerPage; | |
} | |
constexpr size_t getPageSize() { | |
return pageSize; | |
} | |
Trie(const char* path) { | |
if (!fileExists(path)) { | |
createInitFile(path, pageSize * 4); | |
} | |
_fd = open(path, O_RDWR | O_CREAT, (mode_t)0600); | |
_path = path; | |
loadPages(); | |
initHeader(); | |
} | |
~Trie() { | |
close(_fd); | |
freePages(); | |
} | |
size_t getPageCount() const { | |
return _pages.size(); | |
} | |
void removeFile() { | |
remove(_path.c_str()); | |
} | |
char* getPage(size_t at) { | |
if (at >= _pages.size()) { | |
addPages((at - _pages.size()) + 1); | |
} | |
return _pages[at]; | |
} | |
int64_t* getHeader() { | |
return reinterpret_cast<int64_t*>(_pages[0]); | |
} | |
int64_t* getRoot() { | |
return reinterpret_cast<int64_t*>(_pages[1]); | |
} | |
int64_t getBlockCount() { | |
return getHeader()[2]; | |
} | |
int64_t getBlockCountAndInc() { | |
int64_t* head = getHeader(); | |
int64_t prev = head[2]; | |
head[2] += 1; | |
return prev; | |
} | |
int64_t* getBlock(int64_t n, int64_t* pageNo = nullptr) { | |
int64_t pageNumber = (n / blocksPerPage) + 1; // root is at 1 | |
if (pageNo) *pageNo = pageNumber; | |
int64_t blockInPage = n % blocksPerPage; | |
return reinterpret_cast<int64_t*>(getPage(pageNumber) + (blockInPage * blockSize)); | |
} | |
void syncPages(const std::unordered_set<int64_t>& pages) { | |
for (const auto& page : pages) { | |
//printf("Syncing the page %lld\n", page); | |
char* memPage = getPage(page); | |
pwrite(_fd, memPage, pageSize, page * pageSize); | |
} | |
} | |
void insert(const size_t* seq, size_t size) { | |
std::unordered_set<int64_t> pagesToSync; | |
size_t sizeNow = size; | |
int64_t* cur = getRoot(); | |
while (sizeNow--) { | |
bool update = false; | |
if (cur[*seq] == 0) { | |
cur[*seq] = getBlockCountAndInc(); | |
update = true; | |
} | |
int64_t page = 0; | |
cur = getBlock(cur[*seq], &page); | |
if (update) pagesToSync.insert(page); | |
++seq; | |
} | |
syncPages(pagesToSync); | |
} | |
bool contains(const size_t* seq, size_t size) { | |
size_t sizeNow = size; | |
int64_t* cur = getRoot(); | |
while (sizeNow--) { | |
if (cur[*seq] == 0) { | |
return false; | |
} | |
cur = getBlock(cur[*seq]); | |
++seq; | |
} | |
return true; | |
} | |
private: | |
void initHeader() { | |
int64_t* head = getHeader(); | |
head[0] = (int64_t)pageSize; | |
head[1] = (int64_t)rangeSize; | |
head[2] = 1; // block length | |
} | |
void addPages(size_t n) { | |
size_t curSize = _pages.size() * pageSize; | |
size_t newSize = curSize + (n * pageSize); | |
pwrite(_fd, "", 1, newSize-1); | |
for (size_t i = curSize; i < newSize; i += pageSize) { | |
char* page = static_cast<char*>(::malloc(pageSize)); | |
pread(_fd, page, pageSize, i); | |
_pages.push_back(page); | |
} | |
} | |
bool loadPages() { | |
ssize_t fileSize = getFileSize(_path.c_str()); | |
if (fileSize == -1) { | |
return false; | |
} | |
for (size_t i = 0; i < fileSize; i += pageSize) { | |
char* page = static_cast<char*>(::malloc(pageSize)); | |
pread(_fd, page, pageSize, i); | |
_pages.push_back(page); | |
} | |
return true; | |
} | |
void freePages() { | |
for (auto& page : _pages) { | |
::free((void*)page); | |
} | |
} | |
int _fd = -1; | |
std::string _path; | |
std::vector<char*> _pages; | |
}; | |
template <size_t rangeSize, size_t seqSize, size_t seqAmount> | |
struct KeyGen { | |
constexpr size_t size() { | |
return seqSize; | |
} | |
KeyGen() { | |
srand(time(nullptr)); | |
for (size_t i = 0; i < seqAmount; ++i) | |
{ | |
for (size_t j = 0; j < seqSize; ++j) | |
{ | |
sequences[i][j] = rand() % rangeSize; | |
} | |
} | |
} | |
size_t sequences[seqAmount][seqSize]; | |
}; | |
static void writePerfTest() { | |
static constexpr size_t testSampleSize = 1000000; | |
static KeyGen<4, 100, testSampleSize> kg; | |
Trie<4096, 4> tr("perftest"); | |
printf("using blocks per page %zu\n", tr.getBlocksPerPage()); | |
const auto start = std::chrono::steady_clock::now(); | |
for (int i = 0; i < testSampleSize; ++i) | |
{ | |
tr.insert(kg.sequences[i], kg.size()); | |
} | |
const auto end = std::chrono::steady_clock::now(); | |
const auto timeSpent = std::chrono::duration_cast<std::chrono::microseconds>(end - start); | |
printf("%s took %lluus\n", __FUNCTION__, timeSpent.count()); | |
printf("Total pages used %zu, total size %zu\n", tr.getPageCount(), tr.getPageCount() * tr.getPageSize()); | |
tr.removeFile(); | |
} | |
int main(int argc, char const *argv[]) | |
{ | |
writePerfTest(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment