Arrays that are transparently distributed onto multiple NUMA nodes using Silo (https://github.com/stanford-mast/Silo) and make use of transparent hugepages, with a fallback for non-NUMA systems
/*******************************************************************************
* numa_array.hpp
*
* Arrays that are transparently distributed onto multiple NUMA nodes using Silo
* (https://github.com/stanford-mast/Silo) and make use of transparent huge
* pages, with a fallback for non-NUMA systems
*
* Copyright (C) 2018 Lorenz Hübschle-Schneider <lorenz@4z2.de>
*
* All rights reserved. Published under the BSD-2 license.
******************************************************************************/
#pragma once
#ifndef NUMA_ARRAY_HEADER
#define NUMA_ARRAY_HEADER
// https://github.com/tlx/tlx, but only used for logging. You can get rid of
// this dependency by removing all the lines with "LOG" / "sLOG" statements.
#include <tlx/logger.hpp>
/*
 * You should detect whether libnuma is installed using your build system.
 * With cmake, you can do something like:
 *
 *   find_package(Numa)
 *   if(NUMA_FOUND)
 *     list(APPEND MY_DEFINITIONS "HAVE_LIBNUMA")
 *   else()
 *     message(STATUS "Could not find libnuma, disabling NUMA awareness")
 *   endif()
 *
 *   # use MY_DEFINITIONS as follows:
 *   target_compile_definitions(my_target PUBLIC ${MY_DEFINITIONS})
 */
#ifdef HAVE_LIBNUMA
#include <silo.h>
#include <topo.h>
#endif // HAVE_LIBNUMA
#include <sys/mman.h> // madvise

#include <algorithm> // std::min
#include <cassert>   // assert
#include <cstdlib>   // aligned_alloc, malloc, free
#include <memory>    // std::unique_ptr
#include <thread>    // std::thread::hardware_concurrency
// Align allocation size by rounding up (Silo sometimes rounds down, I'm not
// quite sure why)
constexpr size_t align_size(size_t size, size_t alignment) {
    return ((size + alignment - 1) / alignment) * alignment;
}
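// Two compile-time sanity checks illustrating the rounding behaviour (the
// values are chosen for illustration only):
static_assert(align_size(1, 2 * 1024 * 1024) == 2 * 1024 * 1024,
              "sizes are rounded up to the next multiple of the alignment");
static_assert(align_size(4 * 1024 * 1024, 2 * 1024 * 1024) == 4 * 1024 * 1024,
              "exact multiples are left unchanged");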
// Allocate `size` bytes and tell the Linux kernel that it would be a good idea
// to use hugepages for this. Will align the size to 2MB.
inline void* alloc_hugepage(size_t size) {
    constexpr size_t alignment = 2 * 1024 * 1024;
    size_t bytes = align_size(size, alignment);
    void* ptr = aligned_alloc(alignment, bytes);
    // aligned_alloc can fail; only advise the kernel on a valid allocation
    if (ptr != nullptr) {
        madvise(ptr, bytes, MADV_HUGEPAGE);
    }
    return ptr;
}
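// Note: MADV_HUGEPAGE only has an effect if transparent hugepages are enabled
// on the system, i.e. /sys/kernel/mm/transparent_hugepage/enabled is set to
// "always" or "madvise"; otherwise the call is harmless but does nothing.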
// Allocate memory, automatically switching to 2MB-aligned allocations with
// support for transparent huge pages if `size` exceeds 1MB. You can use this
// as a replacement for `malloc`.
inline void* allocate(size_t size) {
    if (size >= 1024 * 1024) {
        return alloc_hugepage(size);
    } else {
        return malloc(size);
    }
}
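// Note that the 1MB threshold is below the 2MB alignment, so an allocation
// just over the threshold is padded up to a full 2MB page: up to roughly 1MB
// of slack in the worst case, traded for hugepage eligibility.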
// Allocate an array distributed over the available NUMA nodes
inline void* numa_alloc(size_t bytes, bool align = true) {
#ifndef HAVE_LIBNUMA
    (void)align;
    return allocate(bytes);
#else
    constexpr bool debug = true;
    // The code below is designed to handle *fewer* threads than available,
    // especially the case where num_numa_nodes does not divide num_threads.
    // In that case, the first NUMA nodes are assumed to get one additional
    // thread each, and the portion of the array allocated on each NUMA node
    // is distributed the same way (i.e., more memory on the first few nodes).
    // This is irrelevant if all threads are used.
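    // Worked example (illustrative numbers): with 8 hardware threads on
    // 3 NUMA nodes, threads_per_node = ceil(8 / 3) = 3, so the loop below
    // assigns 3, 3, and 2 threads' worth of memory to nodes 0, 1, and 2.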
    int num_threads = std::thread::hardware_concurrency();
    int num_numa_nodes = topoGetSystemNUMANodeCount();
    int threads_per_node =
        (num_threads + num_numa_nodes - 1) / num_numa_nodes;
    size_t bytes_per_thread = (bytes + num_threads - 1) / num_threads;
    // Fall back to normal allocations if this is a non-NUMA system
    if (num_numa_nodes == 1) {
        return allocate(bytes);
    }
    sLOG << "Allocating" << bytes << "bytes on" << num_numa_nodes
         << "NUMA nodes";
    SSiloMemorySpec* specs = (SSiloMemorySpec*)malloc(
        sizeof(SSiloMemorySpec) * num_numa_nodes);
    if (specs == nullptr) { // abort
        assert(false);
        return nullptr;
    }
    // Initialize the specifications.
    int min = 0, max = threads_per_node;
    for (int i = 0; i < num_numa_nodes; ++i) {
        // Align sizes to 2MB
        size_t size = bytes_per_thread * (max - min);
        if (align) size = align_size(size, 2048 * 1024);
        specs[i].size = size;
        specs[i].numaNode = i;
        sLOG << "Thereof" << specs[i].size << "bytes on node" << i;
        min = max;
        max = std::min(num_threads, max + threads_per_node);
    }
    // Allocate the multi-node array. Uses transparent hugepages.
    void* buffer = siloMultinodeArrayAlloc(num_numa_nodes, specs);
    // If for some reason the multi-NUMA-node allocation failed, fall back
    // to the simple version
    if (buffer == nullptr) {
        LOG << "failed to allocate, falling back to simple allocation";
        free(specs); // nobody else owns the specs if the allocation failed
        buffer = allocate(bytes);
        assert(buffer != nullptr);
    }
    return buffer;
#endif
}
// Pointers allocated with silo need to be freed with `siloFree`. This also
// frees the SSiloMemorySpec object (Silo tracks these internally)
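// Caveat: if HAVE_LIBNUMA is defined but numa_alloc() above fell back to
// allocate() (single-node system or Silo failure), this passes a pointer from
// malloc/aligned_alloc to siloFree; whether Silo copes with pointers it did
// not allocate depends on its implementation, so mind the fallback path.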
inline void numa_free(void* ptr) {
    if (ptr != nullptr) {
#ifdef HAVE_LIBNUMA
        siloFree(ptr);
#else
        free(ptr);
#endif
    }
}
// A struct that fulfills the deleter requirements of std::unique_ptr
struct numa_deleter {
    template <typename T>
    void operator()(T* ptr) {
        numa_free((void*)ptr);
    }
};
// A type definition for easy use
template <typename T>
using numa_arr_ptr = std::unique_ptr<T[], numa_deleter>;
// Helper function to create a numa_arr_ptr akin to std::make_unique
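// Unlike std::make_unique, no constructors or destructors are run: the memory
// comes back raw from numa_alloc, so this is only suitable for trivial
// element types (such as the double used in the example below).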
template <typename T>
numa_arr_ptr<T> make_numa_arr(size_t num_elems) {
    T* ptr = static_cast<T*>(numa_alloc(num_elems * sizeof(T)));
    return numa_arr_ptr<T>(ptr);
}
// remove this from the header before actual use...
void example() {
    size_t size = 1000000; // number of elements in the array
    auto array = make_numa_arr<double>(size);
    // do something with the array
    array[0] = 1.234;
    // the array is automatically deallocated via numa_free when it goes out
    // of scope, no need to free it manually
}
#endif // NUMA_ARRAY_HEADER