Created
June 28, 2020 10:32
-
-
Save azat/694a9012d10fc4536d0ce17481d113d7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copied from ClickHouse sources | |
// https://github.com/ClickHouse/ClickHouse/pull/11590 | |
#include <cstdlib> | |
#include <cstring> | |
#include <vector> | |
#include <thread> | |
#include <iostream> | |
// - jemalloc: | |
// real 0m10.816s | |
// user 2m24.375s | |
// sys 0m0.230s | |
// - tcmalloc: | |
// PerCpuCachesActive: 1 | |
// | |
// GetProfileSamplingRate: -1 | |
// GetGuardedSamplingRate: -1 | |
// | |
// GetMaxPerCpuCacheSize: 3145728 | |
// GetMaxTotalThreadCacheBytes: 33554432 | |
// | |
// real 0m19.837s | |
// user 4m32.754s | |
// sys 0m3.329s | |
#ifdef USE_TCMALLOC_CPP | |
#include <tcmalloc/malloc_extension.h> | |
#include <tcmalloc/tcmalloc.h> | |
#include <tcmalloc/common.h> | |
void bootstrap() | |
{ | |
using ext = tcmalloc::MallocExtension; | |
// XXX: does not changes anything... | |
// ext::SetMaxTotalThreadCacheBytes(1 << 30); | |
// ext::SetMaxPerCpuCacheSize(1 << 30); | |
// NOTE: makes tcmalloc-cpp a little bit faster: | |
// | |
// - w/ sampling: | |
// GetProfileSamplingRate: 2097152 | |
// GetGuardedSamplingRate: 104857600 | |
// | |
// real 0m22.121s | |
// user 5m20.928s | |
// sys 0m9.570s | |
// | |
// - w/o sampling: | |
// GetProfileSamplingRate: -1 | |
// GetGuardedSamplingRate: -1 | |
// | |
// real 0m19.837s | |
// user 4m32.754s | |
// sys 0m3.329s | |
ext::SetProfileSamplingRate(SIZE_MAX); | |
ext::SetGuardedSamplingRate(SIZE_MAX); | |
/// Also tried SCHEDULE_COOPERATIVE_AND_KERNEL -- no difference | |
std::cerr << "tcmalloc:\n"; | |
std::cerr << '\n'; | |
std::cerr << "PerCpuCachesActive: " << ext::PerCpuCachesActive() << '\n'; | |
std::cerr << '\n'; | |
std::cerr << "GetProfileSamplingRate: " << ext::GetProfileSamplingRate() << '\n'; | |
std::cerr << "GetGuardedSamplingRate: " << ext::GetGuardedSamplingRate() << '\n'; | |
std::cerr << '\n'; | |
std::cerr << "GetMaxPerCpuCacheSize: " << ext::GetMaxPerCpuCacheSize() << '\n'; | |
std::cerr << "GetMaxTotalThreadCacheBytes: " << ext::GetMaxTotalThreadCacheBytes() << '\n'; | |
std::cerr << '\n'; | |
std::cerr << "kNumClasses: " << kNumClasses << '\n'; | |
} | |
// XXX: does not helps anyway, but let's keep for now | |
void free_(void *ptr, size_t size) { TCMallocInternalDeleteSized(ptr, size); } | |
void* malloc_(size_t size) { return TCMallocInternalNew(size); } | |
#else | |
void bootstrap() {} | |
void free_(void *ptr, size_t /*size*/) { free(ptr); } | |
void* malloc_(size_t size) { return malloc(size); } | |
#endif | |
void alloc_loop() | |
{ | |
for (size_t i = 0; i < 100; ++i) | |
{ | |
size_t size = 4096; | |
void * buf = malloc_(size); | |
if (!buf) | |
abort(); | |
memset(buf, 0, size); | |
/// tcmalloc is faster only when it uses front-end only [1], since only | |
/// it allows parallel access w/o locks, otherwise locking is required | |
/// and it is slower. | |
/// | |
/// [1]: https://github.com/google/tcmalloc/blob/master/docs/design.md#the-tcmalloc-front-end | |
/// | |
/// If allocation is done only up to 256K (i.e. `size < 256<<10`): | |
/// | |
/// - tcmalloc | |
/// real 0m2.335s | |
/// user 0m28.804s | |
/// sys 0m0.010s | |
/// | |
/// - jemalloc | |
/// real 0m2.567s | |
/// user 0m32.748s | |
/// sys 0m0.020s | |
while (size < 1048576) | |
{ | |
size_t next_size = size * 4; | |
free_(buf, size); | |
void * new_buf = malloc_(next_size); | |
if (!new_buf) | |
abort(); | |
buf = new_buf; | |
memset(reinterpret_cast<char*>(buf) + size, 0, next_size - size); | |
size = next_size; | |
} | |
free_(buf, size); | |
} | |
} | |
void thread_func() | |
{ | |
for (size_t i = 0; i < 1000; ++i) | |
{ | |
alloc_loop(); | |
} | |
} | |
int main(int, char **) | |
{ | |
bootstrap(); | |
std::vector<std::thread> threads(16); | |
for (auto & thread : threads) | |
thread = std::thread(thread_func); | |
for (auto & thread : threads) | |
thread.join(); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment