Arrays that are transparently distributed onto multiple NUMA nodes using Silo (https://github.com/stanford-mast/Silo) and make use of transparent hugepages, with a fallback for non-NUMA systems
/*******************************************************************************
* numa_array.hpp
*
* Arrays that are transparently distributed onto multiple NUMA nodes using Silo
* (https://github.com/stanford-mast/Silo) and make use of transparent huge
* pages, with a fallback for non-NUMA systems
*
* Copyright (C) 2018 Lorenz Hübschle-Schneider <lorenz@4z2.de>
*
* All rights reserved. Published under the BSD-2 license.
******************************************************************************/
#pragma once
#ifndef NUMA_ARRAY_HEADER
#define NUMA_ARRAY_HEADER
// https://github.com/tlx/tlx, but only used for logging. You can get rid of
// this dependency by removing all the lines with "LOG" / "sLOG" statements.
#include <tlx/logger.hpp>
/*
 * You should detect whether libnuma is installed using your build system.
 * With cmake, you can do something like:
 *
 *   find_package(Numa)
 *   if(NUMA_FOUND)
 *     list(APPEND MY_DEFINITIONS "HAVE_LIBNUMA")
 *   else()
 *     message(STATUS "Could not find libnuma, disabling NUMA awareness")
 *   endif()
 *
 *   # use MY_DEFINITIONS as follows:
 *   target_compile_definitions(my_target PUBLIC ${MY_DEFINITIONS})
 */
#ifdef HAVE_LIBNUMA
#include <silo.h>
#include <topo.h>
#endif // HAVE_LIBNUMA
#include <sys/mman.h> // madvise

#include <algorithm> // std::min
#include <cassert>   // assert
#include <cstdlib>   // aligned_alloc, malloc, free
#include <memory>    // std::unique_ptr
#include <thread>    // std::thread::hardware_concurrency
// Align allocation size by rounding up (Silo sometimes rounds down, I'm not
// quite sure why)
constexpr size_t align_size(size_t size, size_t alignment) {
    return ((size + alignment - 1) / alignment) * alignment;
}
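// Two compile-time sanity checks illustrating the rounding behaviour (the
// values are chosen for illustration only):
static_assert(align_size(1, 2 * 1024 * 1024) == 2 * 1024 * 1024,
              "sizes are rounded up to the next multiple of the alignment");
static_assert(align_size(4 * 1024 * 1024, 2 * 1024 * 1024) == 4 * 1024 * 1024,
              "exact multiples are left unchanged");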
// Allocate `size` bytes and tell the Linux kernel that it would be a good idea
// to use hugepages for this. Will align the size to 2MB.
inline void* alloc_hugepage(size_t size) {
    constexpr size_t alignment = 2 * 1024 * 1024;
    size_t bytes = align_size(size, alignment);
    void* ptr = aligned_alloc(alignment, bytes);
    // aligned_alloc can fail; only advise the kernel on a valid allocation
    if (ptr != nullptr) {
        madvise(ptr, bytes, MADV_HUGEPAGE);
    }
    return ptr;
}
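// Note: MADV_HUGEPAGE only has an effect if transparent hugepages are enabled
// on the system, i.e. /sys/kernel/mm/transparent_hugepage/enabled is set to
// "always" or "madvise"; otherwise the call is harmless but does nothing.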
// Allocate memory, automatically switching to 2MB-aligned allocations with
// support for transparent huge pages if `size` exceeds 1MB. You can use this
// as a replacement for `malloc`.
inline void* allocate(size_t size) {
    if (size >= 1024 * 1024) {
        return alloc_hugepage(size);
    } else {
        return malloc(size);
    }
}
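// Note that the 1MB threshold is below the 2MB alignment, so an allocation
// just over the threshold is padded up to a full 2MB page: up to roughly 1MB
// of slack in the worst case, traded for hugepage eligibility.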
// Allocate an array distributed over the available NUMA nodes
inline void* numa_alloc(size_t bytes, bool align = true) {
#ifndef HAVE_LIBNUMA
    (void)align;
    return allocate(bytes);
#else
    constexpr bool debug = true;
    // The code below is designed to handle *fewer* threads than available,
    // especially the case where num_numa_nodes does not divide num_threads.
    // In that case, the first NUMA nodes are assumed to get one additional
    // thread each, and the portion of the array allocated on each NUMA node
    // is distributed the same way (i.e., more memory on the first few nodes).
    // This is irrelevant if all threads are used.
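    // Worked example (illustrative numbers): with 8 hardware threads on
    // 3 NUMA nodes, threads_per_node = ceil(8 / 3) = 3, so the loop below
    // assigns 3, 3, and 2 threads' worth of memory to nodes 0, 1, and 2.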
    int num_threads = std::thread::hardware_concurrency();
    int num_numa_nodes = topoGetSystemNUMANodeCount();
    int threads_per_node =
        (num_threads + num_numa_nodes - 1) / num_numa_nodes;
    size_t bytes_per_thread = (bytes + num_threads - 1) / num_threads;
    // Fall back to normal allocations if this is a non-NUMA system
    if (num_numa_nodes == 1) {
        return allocate(bytes);
    }
    sLOG << "Allocating" << bytes << "bytes on" << num_numa_nodes
         << "NUMA nodes";
    SSiloMemorySpec* specs = (SSiloMemorySpec*)malloc(
        sizeof(SSiloMemorySpec) * num_numa_nodes);
    if (specs == nullptr) { // abort
        assert(false);
        return nullptr;
    }
    // Initialize the specifications.
    int min = 0, max = threads_per_node;
    for (int i = 0; i < num_numa_nodes; ++i) {
        // Align sizes to 2MB
        size_t size = bytes_per_thread * (max - min);
        if (align) size = align_size(size, 2048 * 1024);
        specs[i].size = size;
        specs[i].numaNode = i;
        sLOG << "Thereof" << specs[i].size << "bytes on node" << i;
        min = max;
        max = std::min(num_threads, max + threads_per_node);
    }
    // Allocate the multi-node array. Uses transparent hugepages.
    void* buffer = siloMultinodeArrayAlloc(num_numa_nodes, specs);
    // If for some reason the multi-NUMA-node allocation failed, fall back
    // to the simple version
    if (buffer == nullptr) {
        LOG << "failed to allocate, falling back to simple allocation";
        free(specs); // nobody else owns the specs if the allocation failed
        buffer = allocate(bytes);
        assert(buffer != nullptr);
    }
    return buffer;
#endif
}
// Pointers allocated with silo need to be freed with `siloFree`. This also
// frees the SSiloMemorySpec object (Silo tracks these internally)
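// Caveat: if HAVE_LIBNUMA is defined but numa_alloc() above fell back to
// allocate() (single-node system or Silo failure), this passes a pointer from
// malloc/aligned_alloc to siloFree; whether Silo copes with pointers it did
// not allocate depends on its implementation, so mind the fallback path.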
inline void numa_free(void* ptr) {
    if (ptr != nullptr) {
#ifdef HAVE_LIBNUMA
        siloFree(ptr);
#else
        free(ptr);
#endif
    }
}
// A struct that fulfills the deleter requirements of std::unique_ptr
struct numa_deleter {
    template <typename T>
    void operator()(T* ptr) {
        numa_free((void*)ptr);
    }
};
// A type definition for easy use
template <typename T>
using numa_arr_ptr = std::unique_ptr<T[], numa_deleter>;
// Helper function to create a numa_arr_ptr akin to std::make_unique
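// Unlike std::make_unique, no constructors or destructors are run: the memory
// comes back raw from numa_alloc, so this is only suitable for trivial
// element types (such as the double used in the example below).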
template <typename T>
numa_arr_ptr<T> make_numa_arr(size_t num_elems) {
    T* ptr = static_cast<T*>(numa_alloc(num_elems * sizeof(T)));
    return numa_arr_ptr<T>(ptr);
}
// remove this from the header before actual use...
void example() {
    size_t size = 1000000; // number of elements in the array
    auto array = make_numa_arr<double>(size);
    // do something with the array
    array[0] = 1.234;
    // the array is automatically deallocated via numa_free when it goes out
    // of scope, no need to free it manually
}
#endif // NUMA_ARRAY_HEADER